/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
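/* Note: secure_tcp_sequence_number() (net/core/secure_seq.c) derives the
 * ISN roughly in the RFC 6528 style: a keyed hash of the connection
 * 4-tuple plus a boot-time secret, offset by a coarse clock, so that the
 * sequence spaces of successive connections on the same 4-tuple do not
 * collide while staying unpredictable to off-path attackers.
 */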
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair and the TW bucket is used as
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
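/* The sysctl gating the fast path above is exposed as
 * net.ipv4.tcp_tw_reuse. An illustrative (not normative) way to let
 * outgoing connections reuse TIME-WAIT ports once timestamps prove it
 * safe:
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * This only helps the active-open side and relies on PAWS timestamps,
 * exactly as described in the comment inside tcp_twsk_unique().
 */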
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_set_txhash(sk);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
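/* For reference, the userspace view of this function: a connect(2) on an
 * AF_INET stream socket reaches here via inet_stream_connect(). A
 * minimal, purely illustrative sketch:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * Passing a length shorter than sizeof(struct sockaddr_in) yields the
 * -EINVAL above, and a non-AF_INET family yields -EAFNOSUPPORT.
 */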
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * for the case that this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
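/* Whether a socket accepts PMTU information is controlled per socket via
 * the IP_MTU_DISCOVER option checked above (inet->pmtudisc). An
 * illustrative userspace sketch for opting out of PMTU processing:
 *
 *	int val = IP_PMTUDISC_DONT;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * With IP_PMTUDISC_DONT, the icsk_pmtu_cookie shrink above is skipped and
 * the usual retransmit timer recovers instead.
 */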
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		 * an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && fastopen->sk == NULL)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
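/* The pseudo-header folded into the checksum above covers (in order) the
 * source address, destination address, a zero pad byte, the protocol
 * number and the TCP length; conceptually:
 *
 *	struct tcp4_pseudohdr {
 *		__be32	saddr;
 *		__be32	daddr;
 *		__u8	pad;
 *		__u8	protocol;
 *		__be16	len;
 *	};
 *
 * For CHECKSUM_PARTIAL the device performs the final fold, which is why
 * only the inverted pseudo-header sum is written into th->check.
 */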
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't
		 * match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
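/* Resets sent here are accounted in the OutRsts SNMP counter bumped
 * above; an illustrative way to watch them from userspace is to read the
 * Tcp line of /proc/net/snmp, e.g.:
 *
 *	# cat /proc/net/snmp | grep '^Tcp'
 *	# netstat -s | grep -i reset
 */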
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      u16 queue_mapping,
			      struct tcp_fastopen_cookie *foc)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
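/* The policy knob consulted above is net.ipv4.tcp_syncookies: 0 disables
 * cookies, 1 enables them only when the SYN backlog overflows (this
 * path), and 2 sends cookies unconditionally while also suppressing the
 * warning above. Illustrative:
 *
 *	# sysctl -w net.ipv4.tcp_syncookies=1
 */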
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
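/* Userspace reaches this parser through the TCP_MD5SIG socket option,
 * using struct tcp_md5sig from <linux/tcp.h>. A minimal, illustrative
 * sketch for keying a socket toward one peer (a tcpm_keylen of 0 would
 * delete the key instead, matching the branch above):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */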
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->ir_loc_addr;
		daddr = inet_rsk(req)->ir_rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
				      const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}

static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	bool ret;

	rcu_read_lock();
	ret = __tcp_v4_inbound_md5_hash(sk, skb);
	rcu_read_unlock();

	return ret;
}
#endif
static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->ir_rmt_addr;
	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	inet_set_txhash(newsk);
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = sk->sk_rx_dst;

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
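/* Prequeueing is bypassed entirely when net.ipv4.tcp_low_latency is set
 * (the sysctl_tcp_low_latency test above); segments then go straight
 * through tcp_v4_do_rcv() in softirq context, trading some throughput
 * for lower per-segment latency. Illustrative:
 *
 *	# sysctl -w net.ipv4.tcp_low_latency=1
 */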
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;
#endif

	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	sk_mark_napi_id(sk, skb);
	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst) {
		dst_hold(dst);
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket after cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
1994 * If st->bucket is zero, the very first socket in the hash is returned.
1996 static void *established_get_first(struct seq_file *seq)
1998 struct tcp_iter_state *st = seq->private;
1999 struct net *net = seq_file_net(seq);
2003 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2005 struct hlist_nulls_node *node;
2006 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2008 /* Lockless fast path for the common case of empty buckets */
2009 if (empty_bucket(st))
2013 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2014 if (sk->sk_family != st->family ||
2015 !net_eq(sock_net(sk), net)) {
2021 spin_unlock_bh(lock);
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	__be32 dest, src;
	__u16 destp, srcp;
	s32 delta = tw->tw_ttd - inet_tw_time_stamp();

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (sk->sk_state == TCP_TIME_WAIT)
			get_timewait4_sock(v, seq, st->num);
		else
			get_tcp4_sock(v, seq, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
		break;
	}
out:
	seq_pad(seq, '\n');
	return 0;
}
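/* For reference, one line of the resulting /proc/net/tcp dump looks
 * roughly like this (fields as printed by get_tcp4_sock() above;
 * addresses are raw hex, so on a little-endian machine the bytes appear
 * reversed):
 *
 *	sl  local_address rem_address   st tx_queue rx_queue ...
 *	 0: 0100007F:0016 00000000:0000 0A 00000000:00000000 ...
 *
 * where 0100007F:0016 is 127.0.0.1:22 and st 0A is TCP_LISTEN.
 */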
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;
}

static void __net_exit tcp_sk_exit(struct net *net)
{
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}