net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
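/* Both knobs below are exposed via sysctl; e.g. (illustrative usage, not part
 * of this file):
 *   sysctl -w net.ipv4.tcp_tw_reuse=1      # allow reusing TIME-WAIT ports for new outgoing connections
 *   sysctl -w net.ipv4.tcp_low_latency=1   # bypass the prequeue, trading throughput for latency
 */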
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
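/* Pick the initial sequence number for a connection from the 4-tuple of the
 * incoming segment; secure_tcp_sequence_number() mixes in a secret and a
 * clock component so the ISN is hard to predict.
 */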
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115
116            Actually, the idea is close to VJ's: only the timestamp cache is
117            held not per host but per port pair, and the TW bucket is used as the
118            state holder.
119
120            If the TW bucket has already been destroyed we fall back to VJ's scheme
121            and use the initial timestamp retrieved from the peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              lockdep_sock_is_held(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214         /* Socket identity is still unknown (sport may be zero).
215          * However we set state to SYN-SENT and, without releasing the socket
216          * lock, select a source port, enter ourselves into the hash tables and
217          * complete initialization after this.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223
224         sk_set_txhash(sk);
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if the socket was owned by the
269  * user at the time tcp_v4_err() was called to handle the ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280
281         /* Something is about to go wrong... Remember the soft error
282          * in case this connection is not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286
287         mtu = dst_mtu(dst);
288
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303
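/* Propagate an ICMP redirect to the socket's cached route, if any. */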
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         if (seq != tcp_rsk(req)->snt_isn) {
323                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
324         } else if (abort) {
325                 /*
326                  * Still in SYN_RECV, just remove it silently.
327                  * There is no good way to pass the error to the newly
328                  * created socket, and POSIX does not want network
329                  * errors returned from accept().
330                  */
331                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
332                 tcp_listendrop(req->rsk_listener);
333         }
334         reqsk_put(req);
335 }
336 EXPORT_SYMBOL(tcp_req_err);
337
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
355 {
356         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358         struct inet_connection_sock *icsk;
359         struct tcp_sock *tp;
360         struct inet_sock *inet;
361         const int type = icmp_hdr(icmp_skb)->type;
362         const int code = icmp_hdr(icmp_skb)->code;
363         struct sock *sk;
364         struct sk_buff *skb;
365         struct request_sock *fastopen;
366         __u32 seq, snd_una;
367         __u32 remaining;
368         int err;
369         struct net *net = dev_net(icmp_skb->dev);
370
371         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
372                                        th->dest, iph->saddr, ntohs(th->source),
373                                        inet_iif(icmp_skb));
374         if (!sk) {
375                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
376                 return;
377         }
378         if (sk->sk_state == TCP_TIME_WAIT) {
379                 inet_twsk_put(inet_twsk(sk));
380                 return;
381         }
382         seq = ntohl(th->seq);
383         if (sk->sk_state == TCP_NEW_SYN_RECV)
384                 return tcp_req_err(sk, seq,
385                                   type == ICMP_PARAMETERPROB ||
386                                   type == ICMP_TIME_EXCEEDED ||
387                                   (type == ICMP_DEST_UNREACH &&
388                                    (code == ICMP_NET_UNREACH ||
389                                     code == ICMP_HOST_UNREACH)));
390
391         bh_lock_sock(sk);
392         /* If too many ICMPs get dropped on busy
393          * servers this needs to be solved differently.
394          * We do take care of the PMTU discovery (RFC1191) special case:
395          * we can receive locally generated ICMP messages while the socket is held.
396          */
397         if (sock_owned_by_user(sk)) {
398                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
399                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
400         }
401         if (sk->sk_state == TCP_CLOSE)
402                 goto out;
403
404         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
405                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
406                 goto out;
407         }
408
409         icsk = inet_csk(sk);
410         tp = tcp_sk(sk);
411         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
412         fastopen = tp->fastopen_rsk;
413         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
414         if (sk->sk_state != TCP_LISTEN &&
415             !between(seq, snd_una, tp->snd_nxt)) {
416                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417                 goto out;
418         }
419
420         switch (type) {
421         case ICMP_REDIRECT:
422                 do_redirect(icmp_skb, sk);
423                 goto out;
424         case ICMP_SOURCE_QUENCH:
425                 /* Just silently ignore these. */
426                 goto out;
427         case ICMP_PARAMETERPROB:
428                 err = EPROTO;
429                 break;
430         case ICMP_DEST_UNREACH:
431                 if (code > NR_ICMP_UNREACH)
432                         goto out;
433
434                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
435                         /* We are not interested in TCP_LISTEN and open_requests
436                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
437                          * they should go through unfragmented).
438                          */
439                         if (sk->sk_state == TCP_LISTEN)
440                                 goto out;
441
442                         tp->mtu_info = info;
443                         if (!sock_owned_by_user(sk)) {
444                                 tcp_v4_mtu_reduced(sk);
445                         } else {
446                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
447                                         sock_hold(sk);
448                         }
449                         goto out;
450                 }
451
452                 err = icmp_err_convert[code].errno;
453                 /* check if icmp_skb allows revert of backoff
454                  * (see draft-zimmermann-tcp-lcd) */
455                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
456                         break;
457                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
458                     !icsk->icsk_backoff || fastopen)
459                         break;
460
461                 if (sock_owned_by_user(sk))
462                         break;
463
464                 icsk->icsk_backoff--;
465                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
466                                                TCP_TIMEOUT_INIT;
467                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
468
469                 skb = tcp_write_queue_head(sk);
470                 BUG_ON(!skb);
471
472                 remaining = icsk->icsk_rto -
473                             min(icsk->icsk_rto,
474                                 tcp_time_stamp - tcp_skb_timestamp(skb));
475
476                 if (remaining) {
477                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
478                                                   remaining, TCP_RTO_MAX);
479                 } else {
480                         /* RTO revert clocked out retransmission.
481                          * Will retransmit now */
482                         tcp_retransmit_timer(sk);
483                 }
484
485                 break;
486         case ICMP_TIME_EXCEEDED:
487                 err = EHOSTUNREACH;
488                 break;
489         default:
490                 goto out;
491         }
492
493         switch (sk->sk_state) {
494         case TCP_SYN_SENT:
495         case TCP_SYN_RECV:
496                 /* Only in fast or simultaneous open. If a fast open socket is
497                  * already accepted it is treated as a connected one below.
498                  */
499                 if (fastopen && !fastopen->sk)
500                         break;
501
502                 if (!sock_owned_by_user(sk)) {
503                         sk->sk_err = err;
504
505                         sk->sk_error_report(sk);
506
507                         tcp_done(sk);
508                 } else {
509                         sk->sk_err_soft = err;
510                 }
511                 goto out;
512         }
513
514         /* If we've already connected we will keep trying
515          * until we time out, or the user gives up.
516          *
517          * RFC 1122 4.2.3.9 allows us to consider as hard errors
518          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
519          * but it is obsoleted by PMTU discovery).
520          *
521          * Note that in the modern internet, where routing is unreliable
522          * and broken firewalls sit in every dark corner sending random
523          * errors ordered by their masters, even these two messages finally
524          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
525          *
526          * Now we are in compliance with RFCs.
527          *                                                      --ANK (980905)
528          */
529
530         inet = inet_sk(sk);
531         if (!sock_owned_by_user(sk) && inet->recverr) {
532                 sk->sk_err = err;
533                 sk->sk_error_report(sk);
534         } else  { /* Only an error on timeout */
535                 sk->sk_err_soft = err;
536         }
537
538 out:
539         bh_unlock_sock(sk);
540         sock_put(sk);
541 }
542
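/* Fill in th->check for an outgoing segment.  With CHECKSUM_PARTIAL only the
 * pseudo-header sum is stored and the device completes the checksum;
 * otherwise the full checksum is computed in software here.
 */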
543 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
544 {
545         struct tcphdr *th = tcp_hdr(skb);
546
547         if (skb->ip_summed == CHECKSUM_PARTIAL) {
548                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549                 skb->csum_start = skb_transport_header(skb) - skb->head;
550                 skb->csum_offset = offsetof(struct tcphdr, check);
551         } else {
552                 th->check = tcp_v4_check(skb->len, saddr, daddr,
553                                          csum_partial(th,
554                                                       th->doff << 2,
555                                                       skb->csum));
556         }
557 }
558
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561 {
562         const struct inet_sock *inet = inet_sk(sk);
563
564         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
565 }
566 EXPORT_SYMBOL(tcp_v4_send_check);
567
568 /*
569  *      This routine will send an RST to the other tcp.
570  *
571  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
572  *                    for the reset.
573  *      Answer: if a packet caused the RST, it is not for a socket
574  *              existing in our system; if it is matched to a socket,
575  *              it is just a duplicate segment or a bug in the other side's TCP.
576  *              So we build the reply based only on the parameters that
577  *              arrived with the segment.
578  *      Exception: precedence violation. We do not implement it in any case.
579  */
580
581 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
582 {
583         const struct tcphdr *th = tcp_hdr(skb);
584         struct {
585                 struct tcphdr th;
586 #ifdef CONFIG_TCP_MD5SIG
587                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
588 #endif
589         } rep;
590         struct ip_reply_arg arg;
591 #ifdef CONFIG_TCP_MD5SIG
592         struct tcp_md5sig_key *key = NULL;
593         const __u8 *hash_location = NULL;
594         unsigned char newhash[16];
595         int genhash;
596         struct sock *sk1 = NULL;
597 #endif
598         struct net *net;
599
600         /* Never send a reset in response to a reset. */
601         if (th->rst)
602                 return;
603
604         /* If sk is not NULL, it means we did a successful lookup and the incoming
605          * route had to be correct. prequeue might have dropped our dst.
606          */
607         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
608                 return;
609
610         /* Swap the send and the receive. */
611         memset(&rep, 0, sizeof(rep));
612         rep.th.dest   = th->source;
613         rep.th.source = th->dest;
614         rep.th.doff   = sizeof(struct tcphdr) / 4;
615         rep.th.rst    = 1;
616
617         if (th->ack) {
618                 rep.th.seq = th->ack_seq;
619         } else {
620                 rep.th.ack = 1;
621                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
622                                        skb->len - (th->doff << 2));
623         }
624
625         memset(&arg, 0, sizeof(arg));
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628
629         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
630 #ifdef CONFIG_TCP_MD5SIG
631         rcu_read_lock();
632         hash_location = tcp_parse_md5sig_option(th);
633         if (sk && sk_fullsock(sk)) {
634                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
635                                         &ip_hdr(skb)->saddr, AF_INET);
636         } else if (hash_location) {
637                 /*
638                  * The active side is lost. Try to find the listening socket through
639                  * the source port, and then find the md5 key through that socket.
640                  * We do not lose security here:
641                  * the incoming packet is checked against the md5 hash of the found key,
642                  * and no RST is generated if the md5 hash doesn't match.
643                  */
644                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
645                                              ip_hdr(skb)->saddr,
646                                              th->source, ip_hdr(skb)->daddr,
647                                              ntohs(th->source), inet_iif(skb));
648                 /* don't send an RST if we can't find a key */
649                 if (!sk1)
650                         goto out;
651
652                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
653                                         &ip_hdr(skb)->saddr, AF_INET);
654                 if (!key)
655                         goto out;
656
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto out;
661
662         }
663
664         if (key) {
665                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
666                                    (TCPOPT_NOP << 16) |
667                                    (TCPOPT_MD5SIG << 8) |
668                                    TCPOLEN_MD5SIG);
669                 /* Update length and the length the header thinks exists */
670                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
671                 rep.th.doff = arg.iov[0].iov_len / 4;
672
673                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
674                                      key, ip_hdr(skb)->saddr,
675                                      ip_hdr(skb)->daddr, &rep.th);
676         }
677 #endif
678         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
679                                       ip_hdr(skb)->saddr, /* XXX */
680                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
681         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
682         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
683
684         /* When the socket is gone, all binding information is lost.
685          * Routing might fail in this case. No choice here: if we choose to force
686          * the input interface, we will misroute in case of an asymmetric route.
687          */
688         if (sk)
689                 arg.bound_dev_if = sk->sk_bound_dev_if;
690
691         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
692                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
693
694         arg.tos = ip_hdr(skb)->tos;
695         local_bh_disable();
696         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
697                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
698                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
699                               &arg, arg.iov[0].iov_len);
700
701         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
702         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
703         local_bh_enable();
704
705 #ifdef CONFIG_TCP_MD5SIG
706 out:
707         rcu_read_unlock();
708 #endif
709 }
710
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714
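/* Build a bare ACK (optionally carrying timestamps and an MD5 option) on the
 * stack and send it with ip_send_unicast_reply(), swapping the addresses and
 * ports taken from the segment that triggered it.
 */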
715 static void tcp_v4_send_ack(struct net *net,
716                             struct sk_buff *skb, u32 seq, u32 ack,
717                             u32 win, u32 tsval, u32 tsecr, int oif,
718                             struct tcp_md5sig_key *key,
719                             int reply_flags, u8 tos)
720 {
721         const struct tcphdr *th = tcp_hdr(skb);
722         struct {
723                 struct tcphdr th;
724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728                         ];
729         } rep;
730         struct ip_reply_arg arg;
731
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof(arg));
734
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (tsecr) {
738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                    (TCPOPT_TIMESTAMP << 8) |
740                                    TCPOLEN_TIMESTAMP);
741                 rep.opt[1] = htonl(tsval);
742                 rep.opt[2] = htonl(tsecr);
743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744         }
745
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754
755 #ifdef CONFIG_TCP_MD5SIG
756         if (key) {
757                 int offset = (tsecr) ? 3 : 0;
758
759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760                                           (TCPOPT_NOP << 16) |
761                                           (TCPOPT_MD5SIG << 8) |
762                                           TCPOLEN_MD5SIG);
763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764                 rep.th.doff = arg.iov[0].iov_len/4;
765
766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767                                     key, ip_hdr(skb)->saddr,
768                                     ip_hdr(skb)->daddr, &rep.th);
769         }
770 #endif
771         arg.flags = reply_flags;
772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773                                       ip_hdr(skb)->saddr, /* XXX */
774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776         if (oif)
777                 arg.bound_dev_if = oif;
778         arg.tos = tos;
779         local_bh_disable();
780         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
781                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
782                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
783                               &arg, arg.iov[0].iov_len);
784
785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
786         local_bh_enable();
787 }
788
789 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
790 {
791         struct inet_timewait_sock *tw = inet_twsk(sk);
792         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
793
794         tcp_v4_send_ack(sock_net(sk), skb,
795                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797                         tcp_time_stamp + tcptw->tw_ts_offset,
798                         tcptw->tw_ts_recent,
799                         tw->tw_bound_dev_if,
800                         tcp_twsk_md5_key(tcptw),
801                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
802                         tw->tw_tos
803                         );
804
805         inet_twsk_put(tw);
806 }
807
808 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
809                                   struct request_sock *req)
810 {
811         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
812          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
813          */
814         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
815                                              tcp_sk(sk)->snd_nxt;
816
817         /* RFC 7323 2.3
818          * The window field (SEG.WND) of every outgoing segment, with the
819          * exception of <SYN> segments, MUST be right-shifted by
820          * Rcv.Wind.Shift bits:
821          */
822         tcp_v4_send_ack(sock_net(sk), skb, seq,
823                         tcp_rsk(req)->rcv_nxt,
824                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
825                         tcp_time_stamp,
826                         req->ts_recent,
827                         0,
828                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
829                                           AF_INET),
830                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
831                         ip_hdr(skb)->tos);
832 }
833
834 /*
835  *      Send a SYN-ACK after having received a SYN.
836  *      This still operates on a request_sock only, not on a big
837  *      socket.
838  */
839 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
840                               struct flowi *fl,
841                               struct request_sock *req,
842                               struct tcp_fastopen_cookie *foc,
843                               enum tcp_synack_type synack_type)
844 {
845         const struct inet_request_sock *ireq = inet_rsk(req);
846         struct flowi4 fl4;
847         int err = -1;
848         struct sk_buff *skb;
849
850         /* First, grab a route. */
851         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
852                 return -1;
853
854         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
855
856         if (skb) {
857                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
858
859                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
860                                             ireq->ir_rmt_addr,
861                                             ireq->opt);
862                 err = net_xmit_eval(err);
863         }
864
865         return err;
866 }
867
868 /*
869  *      IPv4 request_sock destructor.
870  */
871 static void tcp_v4_reqsk_destructor(struct request_sock *req)
872 {
873         kfree(inet_rsk(req)->opt);
874 }
875
876 #ifdef CONFIG_TCP_MD5SIG
877 /*
878  * RFC2385 MD5 checksumming requires a mapping of
879  * IP address->MD5 Key.
880  * We need to maintain these in the sk structure.
881  */
882
883 /* Find the Key structure for an address.  */
884 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
885                                          const union tcp_md5_addr *addr,
886                                          int family)
887 {
888         const struct tcp_sock *tp = tcp_sk(sk);
889         struct tcp_md5sig_key *key;
890         unsigned int size = sizeof(struct in_addr);
891         const struct tcp_md5sig_info *md5sig;
892
893         /* caller either holds rcu_read_lock() or socket lock */
894         md5sig = rcu_dereference_check(tp->md5sig_info,
895                                        lockdep_sock_is_held(sk));
896         if (!md5sig)
897                 return NULL;
898 #if IS_ENABLED(CONFIG_IPV6)
899         if (family == AF_INET6)
900                 size = sizeof(struct in6_addr);
901 #endif
902         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
903                 if (key->family != family)
904                         continue;
905                 if (!memcmp(&key->addr, addr, size))
906                         return key;
907         }
908         return NULL;
909 }
910 EXPORT_SYMBOL(tcp_md5_do_lookup);
911
912 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
913                                          const struct sock *addr_sk)
914 {
915         const union tcp_md5_addr *addr;
916
917         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
918         return tcp_md5_do_lookup(sk, addr, AF_INET);
919 }
920 EXPORT_SYMBOL(tcp_v4_md5_lookup);
921
922 /* This can be called on a newly created socket, from other files */
923 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
924                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
925 {
926         /* Add Key to the list */
927         struct tcp_md5sig_key *key;
928         struct tcp_sock *tp = tcp_sk(sk);
929         struct tcp_md5sig_info *md5sig;
930
931         key = tcp_md5_do_lookup(sk, addr, family);
932         if (key) {
933                 /* Pre-existing entry - just update that one. */
934                 memcpy(key->key, newkey, newkeylen);
935                 key->keylen = newkeylen;
936                 return 0;
937         }
938
939         md5sig = rcu_dereference_protected(tp->md5sig_info,
940                                            lockdep_sock_is_held(sk));
941         if (!md5sig) {
942                 md5sig = kmalloc(sizeof(*md5sig), gfp);
943                 if (!md5sig)
944                         return -ENOMEM;
945
946                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
947                 INIT_HLIST_HEAD(&md5sig->head);
948                 rcu_assign_pointer(tp->md5sig_info, md5sig);
949         }
950
951         key = sock_kmalloc(sk, sizeof(*key), gfp);
952         if (!key)
953                 return -ENOMEM;
954         if (!tcp_alloc_md5sig_pool()) {
955                 sock_kfree_s(sk, key, sizeof(*key));
956                 return -ENOMEM;
957         }
958
959         memcpy(key->key, newkey, newkeylen);
960         key->keylen = newkeylen;
961         key->family = family;
962         memcpy(&key->addr, addr,
963                (family == AF_INET6) ? sizeof(struct in6_addr) :
964                                       sizeof(struct in_addr));
965         hlist_add_head_rcu(&key->node, &md5sig->head);
966         return 0;
967 }
968 EXPORT_SYMBOL(tcp_md5_do_add);
969
970 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
971 {
972         struct tcp_md5sig_key *key;
973
974         key = tcp_md5_do_lookup(sk, addr, family);
975         if (!key)
976                 return -ENOENT;
977         hlist_del_rcu(&key->node);
978         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
979         kfree_rcu(key, rcu);
980         return 0;
981 }
982 EXPORT_SYMBOL(tcp_md5_do_del);
983
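/* Drop every configured MD5 key; used when the socket is torn down. */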
984 static void tcp_clear_md5_list(struct sock *sk)
985 {
986         struct tcp_sock *tp = tcp_sk(sk);
987         struct tcp_md5sig_key *key;
988         struct hlist_node *n;
989         struct tcp_md5sig_info *md5sig;
990
991         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
992
993         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
994                 hlist_del_rcu(&key->node);
995                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
996                 kfree_rcu(key, rcu);
997         }
998 }
999
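/* setsockopt(TCP_MD5SIG) handler.  Illustrative userspace usage (not part of
 * this file):
 *
 *   struct tcp_md5sig md5 = { .tcpm_keylen = keylen };
 *   memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *   memcpy(md5.tcpm_key, key, keylen);
 *   setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer address.
 */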
1000 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1001                                  int optlen)
1002 {
1003         struct tcp_md5sig cmd;
1004         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1005
1006         if (optlen < sizeof(cmd))
1007                 return -EINVAL;
1008
1009         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1010                 return -EFAULT;
1011
1012         if (sin->sin_family != AF_INET)
1013                 return -EINVAL;
1014
1015         if (!cmd.tcpm_keylen)
1016                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017                                       AF_INET);
1018
1019         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020                 return -EINVAL;
1021
1022         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1024                               GFP_KERNEL);
1025 }
1026
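/* Feed the RFC 2385 pseudo-header (saddr, daddr, zero pad, protocol, segment
 * length) plus a copy of the TCP header with its checksum field cleared into
 * the MD5 hash request.
 */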
1027 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1028                                    __be32 daddr, __be32 saddr,
1029                                    const struct tcphdr *th, int nbytes)
1030 {
1031         struct tcp4_pseudohdr *bp;
1032         struct scatterlist sg;
1033         struct tcphdr *_th;
1034
1035         bp = hp->scratch;
1036         bp->saddr = saddr;
1037         bp->daddr = daddr;
1038         bp->pad = 0;
1039         bp->protocol = IPPROTO_TCP;
1040         bp->len = cpu_to_be16(nbytes);
1041
1042         _th = (struct tcphdr *)(bp + 1);
1043         memcpy(_th, th, sizeof(*th));
1044         _th->check = 0;
1045
1046         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1047         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1048                                 sizeof(*bp) + sizeof(*th));
1049         return crypto_ahash_update(hp->md5_req);
1050 }
1051
1052 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1053                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1054 {
1055         struct tcp_md5sig_pool *hp;
1056         struct ahash_request *req;
1057
1058         hp = tcp_get_md5sig_pool();
1059         if (!hp)
1060                 goto clear_hash_noput;
1061         req = hp->md5_req;
1062
1063         if (crypto_ahash_init(req))
1064                 goto clear_hash;
1065         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1066                 goto clear_hash;
1067         if (tcp_md5_hash_key(hp, key))
1068                 goto clear_hash;
1069         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1070         if (crypto_ahash_final(req))
1071                 goto clear_hash;
1072
1073         tcp_put_md5sig_pool();
1074         return 0;
1075
1076 clear_hash:
1077         tcp_put_md5sig_pool();
1078 clear_hash_noput:
1079         memset(md5_hash, 0, 16);
1080         return 1;
1081 }
1082
1083 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1084                         const struct sock *sk,
1085                         const struct sk_buff *skb)
1086 {
1087         struct tcp_md5sig_pool *hp;
1088         struct ahash_request *req;
1089         const struct tcphdr *th = tcp_hdr(skb);
1090         __be32 saddr, daddr;
1091
1092         if (sk) { /* valid for establish/request sockets */
1093                 saddr = sk->sk_rcv_saddr;
1094                 daddr = sk->sk_daddr;
1095         } else {
1096                 const struct iphdr *iph = ip_hdr(skb);
1097                 saddr = iph->saddr;
1098                 daddr = iph->daddr;
1099         }
1100
1101         hp = tcp_get_md5sig_pool();
1102         if (!hp)
1103                 goto clear_hash_noput;
1104         req = hp->md5_req;
1105
1106         if (crypto_ahash_init(req))
1107                 goto clear_hash;
1108
1109         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1110                 goto clear_hash;
1111         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1112                 goto clear_hash;
1113         if (tcp_md5_hash_key(hp, key))
1114                 goto clear_hash;
1115         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1116         if (crypto_ahash_final(req))
1117                 goto clear_hash;
1118
1119         tcp_put_md5sig_pool();
1120         return 0;
1121
1122 clear_hash:
1123         tcp_put_md5sig_pool();
1124 clear_hash_noput:
1125         memset(md5_hash, 0, 16);
1126         return 1;
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1129
1130 #endif
1131
1132 /* Called with rcu_read_lock() */
1133 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1134                                     const struct sk_buff *skb)
1135 {
1136 #ifdef CONFIG_TCP_MD5SIG
1137         /*
1138          * This gets called for each TCP segment that arrives
1139          * so we want to be efficient.
1140          * We have 3 drop cases:
1141          * o No MD5 hash and one expected.
1142          * o MD5 hash and we're not expecting one.
1143          * o MD5 hash and it's wrong.
1144          */
1145         const __u8 *hash_location = NULL;
1146         struct tcp_md5sig_key *hash_expected;
1147         const struct iphdr *iph = ip_hdr(skb);
1148         const struct tcphdr *th = tcp_hdr(skb);
1149         int genhash;
1150         unsigned char newhash[16];
1151
1152         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1153                                           AF_INET);
1154         hash_location = tcp_parse_md5sig_option(th);
1155
1156         /* We've parsed the options - do we have a hash? */
1157         if (!hash_expected && !hash_location)
1158                 return false;
1159
1160         if (hash_expected && !hash_location) {
1161                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1162                 return true;
1163         }
1164
1165         if (!hash_expected && hash_location) {
1166                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1167                 return true;
1168         }
1169
1170         /* Okay, so this is hash_expected and hash_location -
1171          * so we need to calculate the checksum.
1172          */
1173         genhash = tcp_v4_md5_hash_skb(newhash,
1174                                       hash_expected,
1175                                       NULL, skb);
1176
1177         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1178                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1179                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1180                                      &iph->saddr, ntohs(th->source),
1181                                      &iph->daddr, ntohs(th->dest),
1182                                      genhash ? " tcp_v4_calc_md5_hash failed"
1183                                      : "");
1184                 return true;
1185         }
1186         return false;
1187 #endif
1188         return false;
1189 }
1190
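/* Fill in the IPv4-specific parts of a freshly minted request sock: the
 * addresses are taken from the incoming SYN, and any IP options are saved so
 * the SYN-ACK and the child socket can reuse them.
 */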
1191 static void tcp_v4_init_req(struct request_sock *req,
1192                             const struct sock *sk_listener,
1193                             struct sk_buff *skb)
1194 {
1195         struct inet_request_sock *ireq = inet_rsk(req);
1196
1197         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1198         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1199         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1200         ireq->opt = tcp_v4_save_options(skb);
1201 }
1202
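/* Route the SYN-ACK.  *strict reports whether the chosen route's destination
 * still matches the peer address recorded in the request sock.
 */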
1203 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1204                                           struct flowi *fl,
1205                                           const struct request_sock *req,
1206                                           bool *strict)
1207 {
1208         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1209
1210         if (strict) {
1211                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1212                         *strict = true;
1213                 else
1214                         *strict = false;
1215         }
1216
1217         return dst;
1218 }
1219
1220 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1221         .family         =       PF_INET,
1222         .obj_size       =       sizeof(struct tcp_request_sock),
1223         .rtx_syn_ack    =       tcp_rtx_synack,
1224         .send_ack       =       tcp_v4_reqsk_send_ack,
1225         .destructor     =       tcp_v4_reqsk_destructor,
1226         .send_reset     =       tcp_v4_send_reset,
1227         .syn_ack_timeout =      tcp_syn_ack_timeout,
1228 };
1229
1230 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1231         .mss_clamp      =       TCP_MSS_DEFAULT,
1232 #ifdef CONFIG_TCP_MD5SIG
1233         .req_md5_lookup =       tcp_v4_md5_lookup,
1234         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1235 #endif
1236         .init_req       =       tcp_v4_init_req,
1237 #ifdef CONFIG_SYN_COOKIES
1238         .cookie_init_seq =      cookie_v4_init_sequence,
1239 #endif
1240         .route_req      =       tcp_v4_route_req,
1241         .init_seq       =       tcp_v4_init_sequence,
1242         .send_synack    =       tcp_v4_send_synack,
1243 };
1244
1245 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1246 {
1247         /* Never answer SYNs sent to broadcast or multicast */
1248         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1249                 goto drop;
1250
1251         return tcp_conn_request(&tcp_request_sock_ops,
1252                                 &tcp_request_sock_ipv4_ops, sk, skb);
1253
1254 drop:
1255         tcp_listendrop(sk);
1256         return 0;
1257 }
1258 EXPORT_SYMBOL(tcp_v4_conn_request);
1259
1260
1261 /*
1262  * The three way handshake has completed - we got a valid synack -
1263  * now create the new socket.
1264  */
1265 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1266                                   struct request_sock *req,
1267                                   struct dst_entry *dst,
1268                                   struct request_sock *req_unhash,
1269                                   bool *own_req)
1270 {
1271         struct inet_request_sock *ireq;
1272         struct inet_sock *newinet;
1273         struct tcp_sock *newtp;
1274         struct sock *newsk;
1275 #ifdef CONFIG_TCP_MD5SIG
1276         struct tcp_md5sig_key *key;
1277 #endif
1278         struct ip_options_rcu *inet_opt;
1279
1280         if (sk_acceptq_is_full(sk))
1281                 goto exit_overflow;
1282
1283         newsk = tcp_create_openreq_child(sk, req, skb);
1284         if (!newsk)
1285                 goto exit_nonewsk;
1286
1287         newsk->sk_gso_type = SKB_GSO_TCPV4;
1288         inet_sk_rx_dst_set(newsk, skb);
1289
1290         newtp                 = tcp_sk(newsk);
1291         newinet               = inet_sk(newsk);
1292         ireq                  = inet_rsk(req);
1293         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1294         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1295         newsk->sk_bound_dev_if = ireq->ir_iif;
1296         newinet->inet_saddr           = ireq->ir_loc_addr;
1297         inet_opt              = ireq->opt;
1298         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1299         ireq->opt             = NULL;
1300         newinet->mc_index     = inet_iif(skb);
1301         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1302         newinet->rcv_tos      = ip_hdr(skb)->tos;
1303         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1304         if (inet_opt)
1305                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1306         newinet->inet_id = newtp->write_seq ^ jiffies;
1307
1308         if (!dst) {
1309                 dst = inet_csk_route_child_sock(sk, newsk, req);
1310                 if (!dst)
1311                         goto put_and_exit;
1312         } else {
1313                 /* syncookie case : see end of cookie_v4_check() */
1314         }
1315         sk_setup_caps(newsk, dst);
1316
1317         tcp_ca_openreq_child(newsk, dst);
1318
1319         tcp_sync_mss(newsk, dst_mtu(dst));
1320         newtp->advmss = dst_metric_advmss(dst);
1321         if (tcp_sk(sk)->rx_opt.user_mss &&
1322             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1323                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1324
1325         tcp_initialize_rcv_mss(newsk);
1326
1327 #ifdef CONFIG_TCP_MD5SIG
1328         /* Copy over the MD5 key from the original socket */
1329         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1330                                 AF_INET);
1331         if (key) {
1332                 /*
1333                  * We're using one, so create a matching key
1334                  * on the newsk structure. If we fail to get
1335                  * memory, then we end up not copying the key
1336                  * across. Shucks.
1337                  */
1338                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1339                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1340                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1341         }
1342 #endif
1343
1344         if (__inet_inherit_port(sk, newsk) < 0)
1345                 goto put_and_exit;
1346         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1347         if (*own_req)
1348                 tcp_move_syn(newtp, req);
1349
1350         return newsk;
1351
1352 exit_overflow:
1353         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1354 exit_nonewsk:
1355         dst_release(dst);
1356 exit:
1357         tcp_listendrop(sk);
1358         return NULL;
1359 put_and_exit:
1360         inet_csk_prepare_forced_close(newsk);
1361         tcp_done(newsk);
1362         goto exit;
1363 }
1364 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1365
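/* With CONFIG_SYN_COOKIES, a non-SYN segment arriving on a listener may be
 * the ACK that completes a syncookie handshake; let cookie_v4_check() try to
 * reconstruct the connection from it.
 */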
1366 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1367 {
1368 #ifdef CONFIG_SYN_COOKIES
1369         const struct tcphdr *th = tcp_hdr(skb);
1370
1371         if (!th->syn)
1372                 sk = cookie_v4_check(sk, skb);
1373 #endif
1374         return sk;
1375 }
1376
1377 /* The socket must have its spinlock held when we get
1378  * here, unless it is a TCP_LISTEN socket.
1379  *
1380  * We have a potential double-lock case here, so even when
1381  * doing backlog processing we use the BH locking scheme.
1382  * This is because we cannot sleep with the original spinlock
1383  * held.
1384  */
1385 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1386 {
1387         struct sock *rsk;
1388
1389         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1390                 struct dst_entry *dst = sk->sk_rx_dst;
1391
1392                 sock_rps_save_rxhash(sk, skb);
1393                 sk_mark_napi_id(sk, skb);
1394                 if (dst) {
1395                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1396                             !dst->ops->check(dst, 0)) {
1397                                 dst_release(dst);
1398                                 sk->sk_rx_dst = NULL;
1399                         }
1400                 }
1401                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1402                 return 0;
1403         }
1404
1405         if (tcp_checksum_complete(skb))
1406                 goto csum_err;
1407
1408         if (sk->sk_state == TCP_LISTEN) {
1409                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1410
1411                 if (!nsk)
1412                         goto discard;
1413                 if (nsk != sk) {
1414                         sock_rps_save_rxhash(nsk, skb);
1415                         sk_mark_napi_id(nsk, skb);
1416                         if (tcp_child_process(sk, nsk, skb)) {
1417                                 rsk = nsk;
1418                                 goto reset;
1419                         }
1420                         return 0;
1421                 }
1422         } else
1423                 sock_rps_save_rxhash(sk, skb);
1424
1425         if (tcp_rcv_state_process(sk, skb)) {
1426                 rsk = sk;
1427                 goto reset;
1428         }
1429         return 0;
1430
1431 reset:
1432         tcp_v4_send_reset(rsk, skb);
1433 discard:
1434         kfree_skb(skb);
1435         /* Be careful here. If this function gets more complicated and
1436          * gcc suffers from register pressure on the x86, sk (in %ebx)
1437          * might be destroyed here. This current version compiles correctly,
1438          * but you have been warned.
1439          */
1440         return 0;
1441
1442 csum_err:
1443         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1444         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1445         goto discard;
1446 }
1447 EXPORT_SYMBOL(tcp_v4_do_rcv);
1448
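/* Early demux: called from the IP receive path, before routing.  If the
 * segment matches an established socket, stash that socket and its cached
 * input route on the skb so the rest of the receive path can skip both the
 * socket lookup and the route lookup.  Purely a best-effort optimisation;
 * on any mismatch we simply fall back to the normal path.
 */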
1449 void tcp_v4_early_demux(struct sk_buff *skb)
1450 {
1451         const struct iphdr *iph;
1452         const struct tcphdr *th;
1453         struct sock *sk;
1454
1455         if (skb->pkt_type != PACKET_HOST)
1456                 return;
1457
1458         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1459                 return;
1460
1461         iph = ip_hdr(skb);
1462         th = tcp_hdr(skb);
1463
1464         if (th->doff < sizeof(struct tcphdr) / 4)
1465                 return;
1466
1467         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1468                                        iph->saddr, th->source,
1469                                        iph->daddr, ntohs(th->dest),
1470                                        skb->skb_iif);
1471         if (sk) {
1472                 skb->sk = sk;
1473                 skb->destructor = sock_edemux;
1474                 if (sk_fullsock(sk)) {
1475                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1476
1477                         if (dst)
1478                                 dst = dst_check(dst, 0);
1479                         if (dst &&
1480                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1481                                 skb_dst_set_noref(skb, dst);
1482                 }
1483         }
1484 }
1485
1486 /* A packet is added to the VJ-style prequeue for processing in process
1487  * context, if a reader task is waiting. Apparently, this exciting
1488  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1489  * failed somewhere. Latency? Burstiness? Well, at least now we will
1490  * see why it failed. 8)8)                                --ANK
1491  *
1492  */
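/* In brief: if a reader is blocked in tcp_recvmsg() and low-latency mode
 * (sysctl_tcp_low_latency) is off, queue the skb on tp->ucopy.prequeue and
 * wake the reader when the queue becomes non-empty; should the prequeue grow
 * past 32 skbs or threaten to overflow sk_rcvbuf, it is drained right here in
 * softirq context via sk_backlog_rcv().
 */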
1493 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1494 {
1495         struct tcp_sock *tp = tcp_sk(sk);
1496
1497         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1498                 return false;
1499
1500         if (skb->len <= tcp_hdrlen(skb) &&
1501             skb_queue_len(&tp->ucopy.prequeue) == 0)
1502                 return false;
1503
1504         /* Before escaping the RCU protected region, we need to take care of the
1505          * skb dst. Prequeue is only enabled for established sockets.
1506          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1507          * Instead of doing a full sk_rx_dst validity check here, let's perform
1508          * an optimistic check.
1509          */
1510         if (likely(sk->sk_rx_dst))
1511                 skb_dst_drop(skb);
1512         else
1513                 skb_dst_force_safe(skb);
1514
1515         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1516         tp->ucopy.memory += skb->truesize;
1517         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1518             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1519                 struct sk_buff *skb1;
1520
1521                 BUG_ON(sock_owned_by_user(sk));
1522                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1523                                 skb_queue_len(&tp->ucopy.prequeue));
1524
1525                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1526                         sk_backlog_rcv(sk, skb1);
1527
1528                 tp->ucopy.memory = 0;
1529         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1530                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1531                                            POLLIN | POLLRDNORM | POLLRDBAND);
1532                 if (!inet_csk_ack_scheduled(sk))
1533                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1534                                                   (3 * tcp_rto_min(sk)) / 4,
1535                                                   TCP_RTO_MAX);
1536         }
1537         return true;
1538 }
1539 EXPORT_SYMBOL(tcp_prequeue);
1540
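/* Queue an skb on the socket backlog while the socket is owned by a user
 * context.  The limit is sk_rcvbuf + sk_sndbuf plus a little headroom, since
 * only the owner can collapse or prune the receive queues; anything beyond
 * that is dropped and accounted as TCPBacklogDrop.
 */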
1541 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1542 {
1543         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1544
1545         /* Only the socket owner can try to collapse/prune rx queues
1546          * to reduce memory overhead, so add a little headroom here.
1547          * Only a few socket backlogs are likely to be non-empty at any given time.
1548          */
1549         limit += 64*1024;
1550
1551         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1552          * we can fix skb->truesize to its real value to avoid future drops.
1553          * This is valid because the skb is not yet charged to the socket.
1554          * It has been noticed that pure SACK packets were sometimes dropped
1555          * (if cooked by drivers without the copybreak feature).
1556          */
1557         if (!skb->data_len)
1558                 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
1559
1560         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1561                 bh_unlock_sock(sk);
1562                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1563                 return true;
1564         }
1565         return false;
1566 }
1567 EXPORT_SYMBOL(tcp_add_backlog);
1568
1569 /*
1570  *      From tcp_input.c
1571  */
1572
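/* Main IPv4 TCP receive routine, called from the IP layer for every incoming
 * segment.  It validates the header and checksum, copies the relevant header
 * fields into TCP_SKB_CB(skb), looks up the owning socket and then dispatches
 * to the established fast path, the prequeue, the backlog, or the
 * TIME_WAIT / NEW_SYN_RECV handling as appropriate.
 */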
1573 int tcp_v4_rcv(struct sk_buff *skb)
1574 {
1575         struct net *net = dev_net(skb->dev);
1576         const struct iphdr *iph;
1577         const struct tcphdr *th;
1578         bool refcounted;
1579         struct sock *sk;
1580         int ret;
1581
1582         if (skb->pkt_type != PACKET_HOST)
1583                 goto discard_it;
1584
1585         /* Count it even if it's bad */
1586         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1587
1588         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1589                 goto discard_it;
1590
1591         th = (const struct tcphdr *)skb->data;
1592
1593         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1594                 goto bad_packet;
1595         if (!pskb_may_pull(skb, th->doff * 4))
1596                 goto discard_it;
1597
1598         /* An explanation is required here, I think.
1599          * Packet length and doff are validated by header prediction,
1600          * provided the case of th->doff==0 is eliminated.
1601          * So, we defer the checks. */
1602
1603         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1604                 goto csum_error;
1605
1606         th = (const struct tcphdr *)skb->data;
1607         iph = ip_hdr(skb);
1608         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1609          * barrier() makes sure the compiler won't play fool^Waliasing games.
1610          */
1611         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1612                 sizeof(struct inet_skb_parm));
1613         barrier();
1614
1615         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1616         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1617                                     skb->len - th->doff * 4);
1618         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1619         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1620         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1621         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1622         TCP_SKB_CB(skb)->sacked  = 0;
1623
1624 lookup:
1625         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1626                                th->dest, &refcounted);
1627         if (!sk)
1628                 goto no_tcp_socket;
1629
1630 process:
1631         if (sk->sk_state == TCP_TIME_WAIT)
1632                 goto do_time_wait;
1633
1634         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1635                 struct request_sock *req = inet_reqsk(sk);
1636                 struct sock *nsk;
1637
1638                 sk = req->rsk_listener;
1639                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1640                         sk_drops_add(sk, skb);
1641                         reqsk_put(req);
1642                         goto discard_it;
1643                 }
1644                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1645                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1646                         goto lookup;
1647                 }
1648                 /* We own a reference on the listener; increase it again,
1649                  * as we might lose it too soon.
1650                  */
1651                 sock_hold(sk);
1652                 refcounted = true;
1653                 nsk = tcp_check_req(sk, skb, req, false);
1654                 if (!nsk) {
1655                         reqsk_put(req);
1656                         goto discard_and_relse;
1657                 }
1658                 if (nsk == sk) {
1659                         reqsk_put(req);
1660                 } else if (tcp_child_process(sk, nsk, skb)) {
1661                         tcp_v4_send_reset(nsk, skb);
1662                         goto discard_and_relse;
1663                 } else {
1664                         sock_put(sk);
1665                         return 0;
1666                 }
1667         }
1668         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1669                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1670                 goto discard_and_relse;
1671         }
1672
1673         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1674                 goto discard_and_relse;
1675
1676         if (tcp_v4_inbound_md5_hash(sk, skb))
1677                 goto discard_and_relse;
1678
1679         nf_reset(skb);
1680
1681         if (sk_filter(sk, skb))
1682                 goto discard_and_relse;
1683
1684         skb->dev = NULL;
1685
1686         if (sk->sk_state == TCP_LISTEN) {
1687                 ret = tcp_v4_do_rcv(sk, skb);
1688                 goto put_and_return;
1689         }
1690
1691         sk_incoming_cpu_update(sk);
1692
1693         bh_lock_sock_nested(sk);
1694         tcp_segs_in(tcp_sk(sk), skb);
1695         ret = 0;
1696         if (!sock_owned_by_user(sk)) {
1697                 if (!tcp_prequeue(sk, skb))
1698                         ret = tcp_v4_do_rcv(sk, skb);
1699         } else if (tcp_add_backlog(sk, skb)) {
1700                 goto discard_and_relse;
1701         }
1702         bh_unlock_sock(sk);
1703
1704 put_and_return:
1705         if (refcounted)
1706                 sock_put(sk);
1707
1708         return ret;
1709
1710 no_tcp_socket:
1711         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1712                 goto discard_it;
1713
1714         if (tcp_checksum_complete(skb)) {
1715 csum_error:
1716                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1717 bad_packet:
1718                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1719         } else {
1720                 tcp_v4_send_reset(NULL, skb);
1721         }
1722
1723 discard_it:
1724         /* Discard frame. */
1725         kfree_skb(skb);
1726         return 0;
1727
1728 discard_and_relse:
1729         sk_drops_add(sk, skb);
1730         if (refcounted)
1731                 sock_put(sk);
1732         goto discard_it;
1733
1734 do_time_wait:
1735         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1736                 inet_twsk_put(inet_twsk(sk));
1737                 goto discard_it;
1738         }
1739
1740         if (tcp_checksum_complete(skb)) {
1741                 inet_twsk_put(inet_twsk(sk));
1742                 goto csum_error;
1743         }
1744         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1745         case TCP_TW_SYN: {
1746                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1747                                                         &tcp_hashinfo, skb,
1748                                                         __tcp_hdrlen(th),
1749                                                         iph->saddr, th->source,
1750                                                         iph->daddr, th->dest,
1751                                                         inet_iif(skb));
1752                 if (sk2) {
1753                         inet_twsk_deschedule_put(inet_twsk(sk));
1754                         sk = sk2;
1755                         refcounted = false;
1756                         goto process;
1757                 }
1758                 /* Fall through to ACK */
1759         }
1760         case TCP_TW_ACK:
1761                 tcp_v4_timewait_ack(sk, skb);
1762                 break;
1763         case TCP_TW_RST:
1764                 tcp_v4_send_reset(sk, skb);
1765                 inet_twsk_deschedule_put(inet_twsk(sk));
1766                 goto discard_it;
1767         case TCP_TW_SUCCESS:;
1768         }
1769         goto discard_it;
1770 }
1771
1772 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1773         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1774         .twsk_unique    = tcp_twsk_unique,
1775         .twsk_destructor= tcp_twsk_destructor,
1776 };
1777
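/* Cache the input route of a received skb on the socket so that subsequent
 * segments handled by tcp_v4_do_rcv()'s fast path can reuse it instead of
 * doing a fresh route lookup.  The dst is only stored if a reference could
 * be taken on it.
 */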
1778 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1779 {
1780         struct dst_entry *dst = skb_dst(skb);
1781
1782         if (dst && dst_hold_safe(dst)) {
1783                 sk->sk_rx_dst = dst;
1784                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1785         }
1786 }
1787 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1788
1789 const struct inet_connection_sock_af_ops ipv4_specific = {
1790         .queue_xmit        = ip_queue_xmit,
1791         .send_check        = tcp_v4_send_check,
1792         .rebuild_header    = inet_sk_rebuild_header,
1793         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1794         .conn_request      = tcp_v4_conn_request,
1795         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1796         .net_header_len    = sizeof(struct iphdr),
1797         .setsockopt        = ip_setsockopt,
1798         .getsockopt        = ip_getsockopt,
1799         .addr2sockaddr     = inet_csk_addr2sockaddr,
1800         .sockaddr_len      = sizeof(struct sockaddr_in),
1801         .bind_conflict     = inet_csk_bind_conflict,
1802 #ifdef CONFIG_COMPAT
1803         .compat_setsockopt = compat_ip_setsockopt,
1804         .compat_getsockopt = compat_ip_getsockopt,
1805 #endif
1806         .mtu_reduced       = tcp_v4_mtu_reduced,
1807 };
1808 EXPORT_SYMBOL(ipv4_specific);
1809
1810 #ifdef CONFIG_TCP_MD5SIG
1811 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1812         .md5_lookup             = tcp_v4_md5_lookup,
1813         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1814         .md5_parse              = tcp_v4_parse_md5_keys,
1815 };
1816 #endif
1817
1818 /* NOTE: A lot of things are set to zero explicitly by the call to
1819  *       sk_alloc(), so they need not be done here.
1820  */
1821 static int tcp_v4_init_sock(struct sock *sk)
1822 {
1823         struct inet_connection_sock *icsk = inet_csk(sk);
1824
1825         tcp_init_sock(sk);
1826
1827         icsk->icsk_af_ops = &ipv4_specific;
1828
1829 #ifdef CONFIG_TCP_MD5SIG
1830         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1831 #endif
1832
1833         return 0;
1834 }
1835
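/* Protocol ->destroy hook: final per-socket cleanup when the socket is torn
 * down.  Stops the timers, purges the write/out-of-order/prequeue queues,
 * drops any MD5 keys, releases the bound port and undoes the socket's memory
 * accounting.
 */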
1836 void tcp_v4_destroy_sock(struct sock *sk)
1837 {
1838         struct tcp_sock *tp = tcp_sk(sk);
1839
1840         tcp_clear_xmit_timers(sk);
1841
1842         tcp_cleanup_congestion_control(sk);
1843
1844         /* Clean up the write buffer. */
1845         tcp_write_queue_purge(sk);
1846
1847         /* Cleans up our, hopefully empty, out_of_order_queue. */
1848         __skb_queue_purge(&tp->out_of_order_queue);
1849
1850 #ifdef CONFIG_TCP_MD5SIG
1851         /* Clean up the MD5 key list, if any */
1852         if (tp->md5sig_info) {
1853                 tcp_clear_md5_list(sk);
1854                 kfree_rcu(tp->md5sig_info, rcu);
1855                 tp->md5sig_info = NULL;
1856         }
1857 #endif
1858
1859         /* Clean up the prequeue; it should really be empty already */
1860         __skb_queue_purge(&tp->ucopy.prequeue);
1861
1862         /* Clean up a referenced TCP bind bucket. */
1863         if (inet_csk(sk)->icsk_bind_hash)
1864                 inet_put_port(sk);
1865
1866         BUG_ON(tp->fastopen_rsk);
1867
1868         /* If the socket was aborted during a connect operation */
1869         tcp_free_fastopen_req(tp);
1870         tcp_saved_syn_free(tp);
1871
1872         local_bh_disable();
1873         sk_sockets_allocated_dec(sk);
1874         local_bh_enable();
1875
1876         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1877                 sock_release_memcg(sk);
1878 }
1879 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1880
1881 #ifdef CONFIG_PROC_FS
1882 /* Proc filesystem TCP sock list dumping. */
1883
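/* The iterator walks the listening hash first and then the established hash.
 * st->bucket, st->offset and st->num record the current position so that a
 * subsequent read can resume where the previous one stopped (see
 * tcp_seek_last_pos() below).
 */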
1884 /*
1885  * Get the next listener socket following cur.  If cur is NULL, get the first
1886  * socket starting from the bucket given in st->bucket; when st->bucket is
1887  * zero, the very first socket in the hash table is returned.
1888  */
1889 static void *listening_get_next(struct seq_file *seq, void *cur)
1890 {
1891         struct tcp_iter_state *st = seq->private;
1892         struct net *net = seq_file_net(seq);
1893         struct inet_listen_hashbucket *ilb;
1895         struct sock *sk = cur;
1896
1897         if (!sk) {
1898 get_head:
1899                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1900                 spin_lock_bh(&ilb->lock);
1901                 sk = sk_head(&ilb->head);
1902                 st->offset = 0;
1903                 goto get_sk;
1904         }
1905         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1906         ++st->num;
1907         ++st->offset;
1908
1909         sk = sk_next(sk);
1910 get_sk:
1911         sk_for_each_from(sk) {
1912                 if (!net_eq(sock_net(sk), net))
1913                         continue;
1914                 if (sk->sk_family == st->family)
1915                         return sk;
1917         }
1918         spin_unlock_bh(&ilb->lock);
1919         st->offset = 0;
1920         if (++st->bucket < INET_LHTABLE_SIZE)
1921                 goto get_head;
1922         return NULL;
1923 }
1924
1925 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1926 {
1927         struct tcp_iter_state *st = seq->private;
1928         void *rc;
1929
1930         st->bucket = 0;
1931         st->offset = 0;
1932         rc = listening_get_next(seq, NULL);
1933
1934         while (rc && *pos) {
1935                 rc = listening_get_next(seq, rc);
1936                 --*pos;
1937         }
1938         return rc;
1939 }
1940
1941 static inline bool empty_bucket(const struct tcp_iter_state *st)
1942 {
1943         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1944 }
1945
1946 /*
1947  * Get the first established socket, starting from the bucket given in st->bucket.
1948  * If st->bucket is zero, the very first socket in the hash is returned.
1949  */
1950 static void *established_get_first(struct seq_file *seq)
1951 {
1952         struct tcp_iter_state *st = seq->private;
1953         struct net *net = seq_file_net(seq);
1954         void *rc = NULL;
1955
1956         st->offset = 0;
1957         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1958                 struct sock *sk;
1959                 struct hlist_nulls_node *node;
1960                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1961
1962                 /* Lockless fast path for the common case of empty buckets */
1963                 if (empty_bucket(st))
1964                         continue;
1965
1966                 spin_lock_bh(lock);
1967                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1968                         if (sk->sk_family != st->family ||
1969                             !net_eq(sock_net(sk), net)) {
1970                                 continue;
1971                         }
1972                         rc = sk;
1973                         goto out;
1974                 }
1975                 spin_unlock_bh(lock);
1976         }
1977 out:
1978         return rc;
1979 }
1980
1981 static void *established_get_next(struct seq_file *seq, void *cur)
1982 {
1983         struct sock *sk = cur;
1984         struct hlist_nulls_node *node;
1985         struct tcp_iter_state *st = seq->private;
1986         struct net *net = seq_file_net(seq);
1987
1988         ++st->num;
1989         ++st->offset;
1990
1991         sk = sk_nulls_next(sk);
1992
1993         sk_nulls_for_each_from(sk, node) {
1994                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1995                         return sk;
1996         }
1997
1998         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1999         ++st->bucket;
2000         return established_get_first(seq);
2001 }
2002
2003 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2004 {
2005         struct tcp_iter_state *st = seq->private;
2006         void *rc;
2007
2008         st->bucket = 0;
2009         rc = established_get_first(seq);
2010
2011         while (rc && pos) {
2012                 rc = established_get_next(seq, rc);
2013                 --pos;
2014         }
2015         return rc;
2016 }
2017
2018 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2019 {
2020         void *rc;
2021         struct tcp_iter_state *st = seq->private;
2022
2023         st->state = TCP_SEQ_STATE_LISTENING;
2024         rc        = listening_get_idx(seq, &pos);
2025
2026         if (!rc) {
2027                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2028                 rc        = established_get_idx(seq, pos);
2029         }
2030
2031         return rc;
2032 }
2033
2034 static void *tcp_seek_last_pos(struct seq_file *seq)
2035 {
2036         struct tcp_iter_state *st = seq->private;
2037         int offset = st->offset;
2038         int orig_num = st->num;
2039         void *rc = NULL;
2040
2041         switch (st->state) {
2042         case TCP_SEQ_STATE_LISTENING:
2043                 if (st->bucket >= INET_LHTABLE_SIZE)
2044                         break;
2045                 st->state = TCP_SEQ_STATE_LISTENING;
2046                 rc = listening_get_next(seq, NULL);
2047                 while (offset-- && rc)
2048                         rc = listening_get_next(seq, rc);
2049                 if (rc)
2050                         break;
2051                 st->bucket = 0;
2052                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2053                 /* Fallthrough */
2054         case TCP_SEQ_STATE_ESTABLISHED:
2055                 if (st->bucket > tcp_hashinfo.ehash_mask)
2056                         break;
2057                 rc = established_get_first(seq);
2058                 while (offset-- && rc)
2059                         rc = established_get_next(seq, rc);
2060         }
2061
2062         st->num = orig_num;
2063
2064         return rc;
2065 }
2066
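/* seq_file ->start(): try to resume from the position cached in st->last_pos;
 * otherwise restart the walk from the beginning and skip *pos entries.
 */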
2067 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2068 {
2069         struct tcp_iter_state *st = seq->private;
2070         void *rc;
2071
2072         if (*pos && *pos == st->last_pos) {
2073                 rc = tcp_seek_last_pos(seq);
2074                 if (rc)
2075                         goto out;
2076         }
2077
2078         st->state = TCP_SEQ_STATE_LISTENING;
2079         st->num = 0;
2080         st->bucket = 0;
2081         st->offset = 0;
2082         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2083
2084 out:
2085         st->last_pos = *pos;
2086         return rc;
2087 }
2088
2089 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2090 {
2091         struct tcp_iter_state *st = seq->private;
2092         void *rc = NULL;
2093
2094         if (v == SEQ_START_TOKEN) {
2095                 rc = tcp_get_idx(seq, 0);
2096                 goto out;
2097         }
2098
2099         switch (st->state) {
2100         case TCP_SEQ_STATE_LISTENING:
2101                 rc = listening_get_next(seq, v);
2102                 if (!rc) {
2103                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2104                         st->bucket = 0;
2105                         st->offset = 0;
2106                         rc        = established_get_first(seq);
2107                 }
2108                 break;
2109         case TCP_SEQ_STATE_ESTABLISHED:
2110                 rc = established_get_next(seq, v);
2111                 break;
2112         }
2113 out:
2114         ++*pos;
2115         st->last_pos = *pos;
2116         return rc;
2117 }
2118
2119 static void tcp_seq_stop(struct seq_file *seq, void *v)
2120 {
2121         struct tcp_iter_state *st = seq->private;
2122
2123         switch (st->state) {
2124         case TCP_SEQ_STATE_LISTENING:
2125                 if (v != SEQ_START_TOKEN)
2126                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2127                 break;
2128         case TCP_SEQ_STATE_ESTABLISHED:
2129                 if (v)
2130                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2131                 break;
2132         }
2133 }
2134
2135 int tcp_seq_open(struct inode *inode, struct file *file)
2136 {
2137         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2138         struct tcp_iter_state *s;
2139         int err;
2140
2141         err = seq_open_net(inode, file, &afinfo->seq_ops,
2142                           sizeof(struct tcp_iter_state));
2143         if (err < 0)
2144                 return err;
2145
2146         s = ((struct seq_file *)file->private_data)->private;
2147         s->family               = afinfo->family;
2148         s->last_pos             = 0;
2149         return 0;
2150 }
2151 EXPORT_SYMBOL(tcp_seq_open);
2152
2153 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2154 {
2155         int rc = 0;
2156         struct proc_dir_entry *p;
2157
2158         afinfo->seq_ops.start           = tcp_seq_start;
2159         afinfo->seq_ops.next            = tcp_seq_next;
2160         afinfo->seq_ops.stop            = tcp_seq_stop;
2161
2162         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2163                              afinfo->seq_fops, afinfo);
2164         if (!p)
2165                 rc = -ENOMEM;
2166         return rc;
2167 }
2168 EXPORT_SYMBOL(tcp_proc_register);
2169
2170 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2171 {
2172         remove_proc_entry(afinfo->name, net->proc_net);
2173 }
2174 EXPORT_SYMBOL(tcp_proc_unregister);
2175
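/* Format one /proc/net/tcp line for a request socket still in SYN_RECV.
 * Most columns are fixed here: only the SYN-ACK retransmit timer can be
 * pending, and request sockets expose neither an inode number nor a
 * meaningful refcount to user space.
 */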
2176 static void get_openreq4(const struct request_sock *req,
2177                          struct seq_file *f, int i)
2178 {
2179         const struct inet_request_sock *ireq = inet_rsk(req);
2180         long delta = req->rsk_timer.expires - jiffies;
2181
2182         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2183                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2184                 i,
2185                 ireq->ir_loc_addr,
2186                 ireq->ir_num,
2187                 ireq->ir_rmt_addr,
2188                 ntohs(ireq->ir_rmt_port),
2189                 TCP_SYN_RECV,
2190                 0, 0, /* could print option size, but that is af dependent. */
2191                 1,    /* timers active (only the expire timer) */
2192                 jiffies_delta_to_clock_t(delta),
2193                 req->num_timeout,
2194                 from_kuid_munged(seq_user_ns(f),
2195                                  sock_i_uid(req->rsk_listener)),
2196                 0,  /* non standard timer */
2197                 0, /* open_requests have no inode */
2198                 0,
2199                 req);
2200 }
2201
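/* Format one /proc/net/tcp line for a full socket.  The timer column encodes
 * which timer is currently pending: 1 retransmit/loss probe, 2 keepalive,
 * 4 zero window probe, 0 none.
 */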
2202 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2203 {
2204         int timer_active;
2205         unsigned long timer_expires;
2206         const struct tcp_sock *tp = tcp_sk(sk);
2207         const struct inet_connection_sock *icsk = inet_csk(sk);
2208         const struct inet_sock *inet = inet_sk(sk);
2209         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2210         __be32 dest = inet->inet_daddr;
2211         __be32 src = inet->inet_rcv_saddr;
2212         __u16 destp = ntohs(inet->inet_dport);
2213         __u16 srcp = ntohs(inet->inet_sport);
2214         int rx_queue;
2215         int state;
2216
2217         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2218             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2219             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2220                 timer_active    = 1;
2221                 timer_expires   = icsk->icsk_timeout;
2222         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2223                 timer_active    = 4;
2224                 timer_expires   = icsk->icsk_timeout;
2225         } else if (timer_pending(&sk->sk_timer)) {
2226                 timer_active    = 2;
2227                 timer_expires   = sk->sk_timer.expires;
2228         } else {
2229                 timer_active    = 0;
2230                 timer_expires = jiffies;
2231         }
2232
2233         state = sk_state_load(sk);
2234         if (state == TCP_LISTEN)
2235                 rx_queue = sk->sk_ack_backlog;
2236         else
2237                 /* Because we don't lock the socket,
2238                  * we might find a transient negative value.
2239                  */
2240                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2241
2242         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2243                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2244                 i, src, srcp, dest, destp, state,
2245                 tp->write_seq - tp->snd_una,
2246                 rx_queue,
2247                 timer_active,
2248                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2249                 icsk->icsk_retransmits,
2250                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2251                 icsk->icsk_probes_out,
2252                 sock_i_ino(sk),
2253                 atomic_read(&sk->sk_refcnt), sk,
2254                 jiffies_to_clock_t(icsk->icsk_rto),
2255                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2256                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2257                 tp->snd_cwnd,
2258                 state == TCP_LISTEN ?
2259                     fastopenq->max_qlen :
2260                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2261 }
2262
2263 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2264                                struct seq_file *f, int i)
2265 {
2266         long delta = tw->tw_timer.expires - jiffies;
2267         __be32 dest, src;
2268         __u16 destp, srcp;
2269
2270         dest  = tw->tw_daddr;
2271         src   = tw->tw_rcv_saddr;
2272         destp = ntohs(tw->tw_dport);
2273         srcp  = ntohs(tw->tw_sport);
2274
2275         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2276                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2277                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2278                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2279                 atomic_read(&tw->tw_refcnt), tw);
2280 }
2281
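/* Every /proc/net/tcp record is padded to a fixed width (TMPSZ bytes,
 * including the trailing newline) via seq_setwidth()/seq_pad() in
 * tcp4_seq_show() below.
 */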
2282 #define TMPSZ 150
2283
2284 static int tcp4_seq_show(struct seq_file *seq, void *v)
2285 {
2286         struct tcp_iter_state *st;
2287         struct sock *sk = v;
2288
2289         seq_setwidth(seq, TMPSZ - 1);
2290         if (v == SEQ_START_TOKEN) {
2291                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2292                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2293                            "inode");
2294                 goto out;
2295         }
2296         st = seq->private;
2297
2298         if (sk->sk_state == TCP_TIME_WAIT)
2299                 get_timewait4_sock(v, seq, st->num);
2300         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2301                 get_openreq4(v, seq, st->num);
2302         else
2303                 get_tcp4_sock(v, seq, st->num);
2304 out:
2305         seq_pad(seq, '\n');
2306         return 0;
2307 }
2308
2309 static const struct file_operations tcp_afinfo_seq_fops = {
2310         .owner   = THIS_MODULE,
2311         .open    = tcp_seq_open,
2312         .read    = seq_read,
2313         .llseek  = seq_lseek,
2314         .release = seq_release_net
2315 };
2316
2317 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2318         .name           = "tcp",
2319         .family         = AF_INET,
2320         .seq_fops       = &tcp_afinfo_seq_fops,
2321         .seq_ops        = {
2322                 .show           = tcp4_seq_show,
2323         },
2324 };
2325
2326 static int __net_init tcp4_proc_init_net(struct net *net)
2327 {
2328         return tcp_proc_register(net, &tcp4_seq_afinfo);
2329 }
2330
2331 static void __net_exit tcp4_proc_exit_net(struct net *net)
2332 {
2333         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2334 }
2335
2336 static struct pernet_operations tcp4_net_ops = {
2337         .init = tcp4_proc_init_net,
2338         .exit = tcp4_proc_exit_net,
2339 };
2340
2341 int __init tcp4_proc_init(void)
2342 {
2343         return register_pernet_subsys(&tcp4_net_ops);
2344 }
2345
2346 void tcp4_proc_exit(void)
2347 {
2348         unregister_pernet_subsys(&tcp4_net_ops);
2349 }
2350 #endif /* CONFIG_PROC_FS */
2351
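/* The IPv4 TCP protocol descriptor: the glue between the generic socket layer
 * and this TCP implementation.  It is registered from af_inet.c via
 * proto_register() and reached through the SOCK_STREAM/IPPROTO_TCP inetsw
 * entry.
 */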
2352 struct proto tcp_prot = {
2353         .name                   = "TCP",
2354         .owner                  = THIS_MODULE,
2355         .close                  = tcp_close,
2356         .connect                = tcp_v4_connect,
2357         .disconnect             = tcp_disconnect,
2358         .accept                 = inet_csk_accept,
2359         .ioctl                  = tcp_ioctl,
2360         .init                   = tcp_v4_init_sock,
2361         .destroy                = tcp_v4_destroy_sock,
2362         .shutdown               = tcp_shutdown,
2363         .setsockopt             = tcp_setsockopt,
2364         .getsockopt             = tcp_getsockopt,
2365         .recvmsg                = tcp_recvmsg,
2366         .sendmsg                = tcp_sendmsg,
2367         .sendpage               = tcp_sendpage,
2368         .backlog_rcv            = tcp_v4_do_rcv,
2369         .release_cb             = tcp_release_cb,
2370         .hash                   = inet_hash,
2371         .unhash                 = inet_unhash,
2372         .get_port               = inet_csk_get_port,
2373         .enter_memory_pressure  = tcp_enter_memory_pressure,
2374         .stream_memory_free     = tcp_stream_memory_free,
2375         .sockets_allocated      = &tcp_sockets_allocated,
2376         .orphan_count           = &tcp_orphan_count,
2377         .memory_allocated       = &tcp_memory_allocated,
2378         .memory_pressure        = &tcp_memory_pressure,
2379         .sysctl_mem             = sysctl_tcp_mem,
2380         .sysctl_wmem            = sysctl_tcp_wmem,
2381         .sysctl_rmem            = sysctl_tcp_rmem,
2382         .max_header             = MAX_TCP_HEADER,
2383         .obj_size               = sizeof(struct tcp_sock),
2384         .slab_flags             = SLAB_DESTROY_BY_RCU,
2385         .twsk_prot              = &tcp_timewait_sock_ops,
2386         .rsk_prot               = &tcp_request_sock_ops,
2387         .h.hashinfo             = &tcp_hashinfo,
2388         .no_autobind            = true,
2389 #ifdef CONFIG_COMPAT
2390         .compat_setsockopt      = compat_tcp_setsockopt,
2391         .compat_getsockopt      = compat_tcp_getsockopt,
2392 #endif
2393         .diag_destroy           = tcp_abort,
2394 };
2395 EXPORT_SYMBOL(tcp_prot);
2396
2397 static void __net_exit tcp_sk_exit(struct net *net)
2398 {
2399         int cpu;
2400
2401         for_each_possible_cpu(cpu)
2402                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2403         free_percpu(net->ipv4.tcp_sk);
2404 }
2405
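/* Per-namespace initialisation: allocate one control socket per possible CPU
 * (used for sending resets and ACKs on behalf of no particular user socket)
 * and seed the per-namespace TCP sysctl defaults.
 */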
2406 static int __net_init tcp_sk_init(struct net *net)
2407 {
2408         int res, cpu;
2409
2410         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2411         if (!net->ipv4.tcp_sk)
2412                 return -ENOMEM;
2413
2414         for_each_possible_cpu(cpu) {
2415                 struct sock *sk;
2416
2417                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2418                                            IPPROTO_TCP, net);
2419                 if (res)
2420                         goto fail;
2421                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2422                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2423         }
2424
2425         net->ipv4.sysctl_tcp_ecn = 2;
2426         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2427
2428         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2429         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2430         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2431
2432         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2433         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2434         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2435
2436         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2437         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2438         net->ipv4.sysctl_tcp_syncookies = 1;
2439         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2440         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2441         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2442         net->ipv4.sysctl_tcp_orphan_retries = 0;
2443         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2444         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2445
2446         return 0;
2447 fail:
2448         tcp_sk_exit(net);
2449
2450         return res;
2451 }
2452
2453 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2454 {
2455         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2456 }
2457
2458 static struct pernet_operations __net_initdata tcp_sk_ops = {
2459        .init       = tcp_sk_init,
2460        .exit       = tcp_sk_exit,
2461        .exit_batch = tcp_sk_exit_batch,
2462 };
2463
2464 void __init tcp_v4_init(void)
2465 {
2466         inet_hashinfo_init(&tcp_hashinfo);
2467         if (register_pernet_subsys(&tcp_sk_ops))
2468                 panic("Failed to create the TCP control socket.\n");
2469 }