net: ipv4: Standardize prefixes for message logging
net/ipv4/tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

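/* Initial sequence numbers come from a keyed hash of the connection
 * 4-tuple plus a clock component (RFC 6528-style ISN selection), so
 * they are hard for an off-path attacker to predict, yet advance
 * monotonically per flow.
 */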
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
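		/* Start the new incarnation's sequence space past
		 * tw_snd_nxt; the 65535 + 2 offset skips beyond the
		 * largest window the old connection could still have
		 * had in flight. */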
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

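	/* Seed the IP ID counter; mixing write_seq with jiffies just
	 * makes successive incarnations of the same 4-tuple start at
	 * different IDs. */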
	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big messages
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}
	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

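		/* The unreachable refers to the very segment we keep
		 * retransmitting, so the earlier RTO backoff was
		 * likely spurious: undo one backoff step and rearm
		 * (or fire) the retransmit timer with what remains of
		 * the shortened RTO. */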
		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen normally; it can occur,
			       e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

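	/* With CHECKSUM_PARTIAL the device finishes the checksum:
	 * seed th->check with the pseudo-header sum and record where
	 * the hardware must write the result. Otherwise fold the full
	 * checksum in software. */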
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So we build the reply based only on parameters that
 *	arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

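	/* Per RFC 793 reset generation: if the offending segment
	 * carried an ACK, the RST reuses that ack number as its own
	 * sequence; otherwise we ACK exactly the sequence space the
	 * segment consumed (SYN and FIN each count for one). */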
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked with the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send an RST if we can't find the key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost;
	 * routing might fail in this case. Use iif for oif to
	 * make sure we can deliver it.
	 */
	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
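	/* When echoing a timestamp, lay the option out as
	 * NOP,NOP,TIMESTAMP,len so the 10-byte option stays 4-byte
	 * aligned within the header. */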
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			      struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return 1 if a syncookie should be sent
 */
int tcp_syn_flood_action(struct sock *sk,
			 const struct sk_buff *skb,
			 const char *proto)
{
	const char *msg = "Dropping request";
	int want_cookie = 0;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
						  struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
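	/* Unlink under the socket lock; RCU readers may still see the
	 * key until a grace period elapses, so defer the actual free
	 * with kfree_rcu(). */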
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos, *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	if (!hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

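/* Userspace reaches the handler below via setsockopt(TCP_MD5SIG).
 * A minimal sketch of such a caller (the peer address 192.0.2.1 and
 * the key "secret" are illustrative only):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */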
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				&iph->saddr, ntohs(th->source),
				&iph->daddr, ntohs(th->dest),
				genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	const u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	int want_cookie = 0;

	/* Never answer SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

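	/* TCP Cookie Transactions: if the peer sent a cookie option
	 * and cookie pairs are enabled locally, fold our addresses
	 * and the initiator's cookie into the cookie bakery before
	 * answering. */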
	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

		want_cookie = 0;	/* not our kind of cookie */
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;
		struct flowi4 fl4;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr &&
		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is reserved for destinations proven
			 * to be alive, i.e. ones we were already
			 * talking to at the moment the synflood began.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
1409         tcp_rsk(req)->snt_isn = isn;
1410         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1411
1412         if (tcp_v4_send_synack(sk, dst, req,
1413                                (struct request_values *)&tmp_ext) ||
1414             want_cookie)
1415                 goto drop_and_free;
1416
1417         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1418         return 0;
1419
1420 drop_and_release:
1421         dst_release(dst);
1422 drop_and_free:
1423         reqsk_free(req);
1424 drop:
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(tcp_v4_conn_request);
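
/* Illustration only (not kernel code) of the guard above: with
 * sysctl_max_syn_backlog == 256, a SYN that carried no timestamp is
 * dropped once fewer than a quarter of the backlog slots remain free:
 *
 *	remaining = 256 - inet_csk_reqsk_queue_len(sk);
 *	if (remaining < (256 >> 2))
 *		drop the request;
 *
 * i.e. fewer than 64 slots left, keeping the reserved quarter available
 * to destinations with a proven-alive timestamp history.
 */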
1428
1429
1430 /*
1431  * The three-way handshake has completed - we have received and
1432  * validated the final ACK - now create the new socket.
1433  */
1434 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1435                                   struct request_sock *req,
1436                                   struct dst_entry *dst)
1437 {
1438         struct inet_request_sock *ireq;
1439         struct inet_sock *newinet;
1440         struct tcp_sock *newtp;
1441         struct sock *newsk;
1442 #ifdef CONFIG_TCP_MD5SIG
1443         struct tcp_md5sig_key *key;
1444 #endif
1445         struct ip_options_rcu *inet_opt;
1446
1447         if (sk_acceptq_is_full(sk))
1448                 goto exit_overflow;
1449
1450         newsk = tcp_create_openreq_child(sk, req, skb);
1451         if (!newsk)
1452                 goto exit_nonewsk;
1453
1454         newsk->sk_gso_type = SKB_GSO_TCPV4;
1455
1456         newtp                 = tcp_sk(newsk);
1457         newinet               = inet_sk(newsk);
1458         ireq                  = inet_rsk(req);
1459         newinet->inet_daddr   = ireq->rmt_addr;
1460         newinet->inet_rcv_saddr = ireq->loc_addr;
1461         newinet->inet_saddr           = ireq->loc_addr;
1462         inet_opt              = ireq->opt;
1463         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1464         ireq->opt             = NULL;
1465         newinet->mc_index     = inet_iif(skb);
1466         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1467         newinet->rcv_tos      = ip_hdr(skb)->tos;
1468         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1469         if (inet_opt)
1470                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1471         newinet->inet_id = newtp->write_seq ^ jiffies;
1472
1473         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1474                 goto put_and_exit;
1475
1476         sk_setup_caps(newsk, dst);
1477
1478         tcp_mtup_init(newsk);
1479         tcp_sync_mss(newsk, dst_mtu(dst));
1480         newtp->advmss = dst_metric_advmss(dst);
1481         if (tcp_sk(sk)->rx_opt.user_mss &&
1482             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1483                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1484
1485         tcp_initialize_rcv_mss(newsk);
1486         if (tcp_rsk(req)->snt_synack)
1487                 tcp_valid_rtt_meas(newsk,
1488                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1489         newtp->total_retrans = req->retrans;
1490
1491 #ifdef CONFIG_TCP_MD5SIG
1492         /* Copy over the MD5 key from the original socket */
1493         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1494                                 AF_INET);
1495         if (key != NULL) {
1496                 /*
1497                  * We're using one, so create a matching key
1498                  * on the newsk structure. If we fail to get
1499                  * memory, then we end up not copying the key
1500                  * across. Shucks.
1501                  */
1502                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1503                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1504                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1505         }
1506 #endif
1507
1508         if (__inet_inherit_port(sk, newsk) < 0)
1509                 goto put_and_exit;
1510         __inet_hash_nolisten(newsk, NULL);
1511
1512         return newsk;
1513
1514 exit_overflow:
1515         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1516 exit_nonewsk:
1517         dst_release(dst);
1518 exit:
1519         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1520         return NULL;
1521 put_and_exit:
1522         tcp_clear_xmit_timers(newsk);
1523         tcp_cleanup_congestion_control(newsk);
1524         bh_unlock_sock(newsk);
1525         sock_put(newsk);
1526         goto exit;
1527 }
1528 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1529
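/* Resolve a segment arriving on a listening socket: a pending request
 * is handed to tcp_check_req(), a segment for an already-established
 * child is returned with the child locked, a TIME_WAIT hit is dropped,
 * and with syncookies compiled in a non-SYN (cookie-carrying ACK)
 * segment is offered to cookie_v4_check().
 */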
1530 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1531 {
1532         struct tcphdr *th = tcp_hdr(skb);
1533         const struct iphdr *iph = ip_hdr(skb);
1534         struct sock *nsk;
1535         struct request_sock **prev;
1536         /* Find possible connection requests. */
1537         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1538                                                        iph->saddr, iph->daddr);
1539         if (req)
1540                 return tcp_check_req(sk, skb, req, prev);
1541
1542         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1543                         th->source, iph->daddr, th->dest, inet_iif(skb));
1544
1545         if (nsk) {
1546                 if (nsk->sk_state != TCP_TIME_WAIT) {
1547                         bh_lock_sock(nsk);
1548                         return nsk;
1549                 }
1550                 inet_twsk_put(inet_twsk(nsk));
1551                 return NULL;
1552         }
1553
1554 #ifdef CONFIG_SYN_COOKIES
1555         if (!th->syn)
1556                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1557 #endif
1558         return sk;
1559 }
1560
1561 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1562 {
1563         const struct iphdr *iph = ip_hdr(skb);
1564
1565         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1566                 if (!tcp_v4_check(skb->len, iph->saddr,
1567                                   iph->daddr, skb->csum)) {
1568                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1569                         return 0;
1570                 }
1571         }
1572
1573         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1574                                        skb->len, IPPROTO_TCP, 0);
1575
1576         if (skb->len <= 76) {
1577                 return __skb_checksum_complete(skb);
1578         }
1579         return 0;
1580 }
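
/* For reference, a minimal userspace sketch (illustrative names, not a
 * kernel API; byte-order details glossed over) of the sum that
 * tcp_v4_check()/csum_tcpudp_nofold() build above: a 16-bit
 * one's-complement sum over the IPv4 pseudo-header (saddr, daddr, zero
 * byte, IPPROTO_TCP, TCP length) followed by the TCP segment itself:
 *
 *	static uint16_t tcp4_csum(uint32_t saddr, uint32_t daddr,
 *				  const uint8_t *seg, size_t len)
 *	{
 *		uint32_t sum = 0;
 *		size_t i;
 *
 *		sum += (saddr >> 16) + (saddr & 0xffff);	// pseudo-header
 *		sum += (daddr >> 16) + (daddr & 0xffff);
 *		sum += IPPROTO_TCP + (uint32_t)len;		// zero byte, proto, length
 *		for (i = 0; i + 1 < len; i += 2)		// segment as 16-bit words
 *			sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
 *		if (len & 1)					// pad odd trailing byte
 *			sum += (uint32_t)seg[len - 1] << 8;
 *		while (sum >> 16)				// fold the carries
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (uint16_t)~sum;
 *	}
 *
 * A segment whose checksum field was written this way re-sums to 0xffff,
 * which is what the CHECKSUM_COMPLETE branch verifies when tcp_v4_check()
 * folds to zero.
 */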
1581
1582
1583 /* The socket must have its spinlock held when we get
1584  * here.
1585  *
1586  * We have a potential double-lock case here, so even when
1587  * doing backlog processing we use the BH locking scheme.
1588  * This is because we cannot sleep with the original spinlock
1589  * held.
1590  */
1591 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1592 {
1593         struct sock *rsk;
1594 #ifdef CONFIG_TCP_MD5SIG
1595         /*
1596          * We really want to reject the packet as early as possible
1597          * if:
1598          *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1599          *  o There is an MD5 option and we're not expecting one
1600          */
1601         if (tcp_v4_inbound_md5_hash(sk, skb))
1602                 goto discard;
1603 #endif
1604
1605         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1606                 sock_rps_save_rxhash(sk, skb);
1607                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1608                         rsk = sk;
1609                         goto reset;
1610                 }
1611                 return 0;
1612         }
1613
1614         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1615                 goto csum_err;
1616
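        /* Listening socket: first map the segment to a pending request or
         * an already-created child; tcp_child_process() then runs the
         * receive state machine on that child.
         */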
1617         if (sk->sk_state == TCP_LISTEN) {
1618                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1619                 if (!nsk)
1620                         goto discard;
1621
1622                 if (nsk != sk) {
1623                         sock_rps_save_rxhash(nsk, skb);
1624                         if (tcp_child_process(sk, nsk, skb)) {
1625                                 rsk = nsk;
1626                                 goto reset;
1627                         }
1628                         return 0;
1629                 }
1630         } else
1631                 sock_rps_save_rxhash(sk, skb);
1632
1633         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1634                 rsk = sk;
1635                 goto reset;
1636         }
1637         return 0;
1638
1639 reset:
1640         tcp_v4_send_reset(rsk, skb);
1641 discard:
1642         kfree_skb(skb);
1643         /* Be careful here. If this function gets more complicated and
1644          * gcc suffers from register pressure on the x86, sk (in %ebx)
1645          * might be destroyed here. This current version compiles correctly,
1646          * but you have been warned.
1647          */
1648         return 0;
1649
1650 csum_err:
1651         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1652         goto discard;
1653 }
1654 EXPORT_SYMBOL(tcp_v4_do_rcv);
1655
1656 /*
1657  *      From tcp_input.c
1658  */
1659
1660 int tcp_v4_rcv(struct sk_buff *skb)
1661 {
1662         const struct iphdr *iph;
1663         const struct tcphdr *th;
1664         struct sock *sk;
1665         int ret;
1666         struct net *net = dev_net(skb->dev);
1667
1668         if (skb->pkt_type != PACKET_HOST)
1669                 goto discard_it;
1670
1671         /* Count it even if it's bad */
1672         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1673
1674         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1675                 goto discard_it;
1676
1677         th = tcp_hdr(skb);
1678
1679         if (th->doff < sizeof(struct tcphdr) / 4)
1680                 goto bad_packet;
1681         if (!pskb_may_pull(skb, th->doff * 4))
1682                 goto discard_it;
1683
1684         /* An explanation is required here, I think.
1685          * Packet length and doff are validated by header prediction,
1686          * provided the case of th->doff == 0 is eliminated.
1687          * So we defer the checks. */
1688         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1689                 goto bad_packet;
1690
1691         th = tcp_hdr(skb);
1692         iph = ip_hdr(skb);
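        /* SYN and FIN each consume one sequence number, hence the th->syn
         * and th->fin terms in the end_seq computation below.
         */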
1693         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1694         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1695                                     skb->len - th->doff * 4);
1696         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1697         TCP_SKB_CB(skb)->when    = 0;
1698         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1699         TCP_SKB_CB(skb)->sacked  = 0;
1700
1701         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1702         if (!sk)
1703                 goto no_tcp_socket;
1704
1705 process:
1706         if (sk->sk_state == TCP_TIME_WAIT)
1707                 goto do_time_wait;
1708
1709         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1710                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1711                 goto discard_and_relse;
1712         }
1713
1714         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1715                 goto discard_and_relse;
1716         nf_reset(skb);
1717
1718         if (sk_filter(sk, skb))
1719                 goto discard_and_relse;
1720
1721         skb->dev = NULL;
1722
1723         bh_lock_sock_nested(sk);
1724         ret = 0;
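        /* Three delivery paths: when no user context owns the socket, try
         * to queue the segment on the prequeue for a sleeping reader
         * (tcp_prequeue), falling back to direct processing; when the
         * socket is owned, park the segment on the backlog, which
         * release_sock() replays later.
         */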
1725         if (!sock_owned_by_user(sk)) {
1726 #ifdef CONFIG_NET_DMA
1727                 struct tcp_sock *tp = tcp_sk(sk);
1728                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1729                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1730                 if (tp->ucopy.dma_chan)
1731                         ret = tcp_v4_do_rcv(sk, skb);
1732                 else
1733 #endif
1734                 {
1735                         if (!tcp_prequeue(sk, skb))
1736                                 ret = tcp_v4_do_rcv(sk, skb);
1737                 }
1738         } else if (unlikely(sk_add_backlog(sk, skb))) {
1739                 bh_unlock_sock(sk);
1740                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1741                 goto discard_and_relse;
1742         }
1743         bh_unlock_sock(sk);
1744
1745         sock_put(sk);
1746
1747         return ret;
1748
1749 no_tcp_socket:
1750         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1751                 goto discard_it;
1752
1753         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1754 bad_packet:
1755                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1756         } else {
1757                 tcp_v4_send_reset(NULL, skb);
1758         }
1759
1760 discard_it:
1761         /* Discard frame. */
1762         kfree_skb(skb);
1763         return 0;
1764
1765 discard_and_relse:
1766         sock_put(sk);
1767         goto discard_it;
1768
1769 do_time_wait:
1770         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1771                 inet_twsk_put(inet_twsk(sk));
1772                 goto discard_it;
1773         }
1774
1775         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1776                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1777                 inet_twsk_put(inet_twsk(sk));
1778                 goto discard_it;
1779         }
1780         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1781         case TCP_TW_SYN: {
1782                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1783                                                         &tcp_hashinfo,
1784                                                         iph->daddr, th->dest,
1785                                                         inet_iif(skb));
1786                 if (sk2) {
1787                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1788                         inet_twsk_put(inet_twsk(sk));
1789                         sk = sk2;
1790                         goto process;
1791                 }
1792                 /* Fall through to ACK */
1793         }
1794         case TCP_TW_ACK:
1795                 tcp_v4_timewait_ack(sk, skb);
1796                 break;
1797         case TCP_TW_RST:
1798                 goto no_tcp_socket;
1799         case TCP_TW_SUCCESS:;
1800         }
1801         goto discard_it;
1802 }
1803
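/* Return the inet_peer for this connection's destination: reuse the peer
 * cached on the route when it still matches inet_daddr, otherwise do a
 * fresh lookup; *release_it tells the caller whether it owns a reference
 * that must be dropped.
 */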
1804 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1805 {
1806         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1807         struct inet_sock *inet = inet_sk(sk);
1808         struct inet_peer *peer;
1809
1810         if (!rt ||
1811             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1812                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1813                 *release_it = true;
1814         } else {
1815                 if (!rt->peer)
1816                         rt_bind_peer(rt, inet->inet_daddr, 1);
1817                 peer = rt->peer;
1818                 *release_it = false;
1819         }
1820
1821         return peer;
1822 }
1823 EXPORT_SYMBOL(tcp_v4_get_peer);
1824
1825 void *tcp_v4_tw_get_peer(struct sock *sk)
1826 {
1827         const struct inet_timewait_sock *tw = inet_twsk(sk);
1828
1829         return inet_getpeer_v4(tw->tw_daddr, 1);
1830 }
1831 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1832
1833 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1834         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1835         .twsk_unique    = tcp_twsk_unique,
1836         .twsk_destructor= tcp_twsk_destructor,
1837         .twsk_getpeer   = tcp_v4_tw_get_peer,
1838 };
1839
1840 const struct inet_connection_sock_af_ops ipv4_specific = {
1841         .queue_xmit        = ip_queue_xmit,
1842         .send_check        = tcp_v4_send_check,
1843         .rebuild_header    = inet_sk_rebuild_header,
1844         .conn_request      = tcp_v4_conn_request,
1845         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1846         .get_peer          = tcp_v4_get_peer,
1847         .net_header_len    = sizeof(struct iphdr),
1848         .setsockopt        = ip_setsockopt,
1849         .getsockopt        = ip_getsockopt,
1850         .addr2sockaddr     = inet_csk_addr2sockaddr,
1851         .sockaddr_len      = sizeof(struct sockaddr_in),
1852         .bind_conflict     = inet_csk_bind_conflict,
1853 #ifdef CONFIG_COMPAT
1854         .compat_setsockopt = compat_ip_setsockopt,
1855         .compat_getsockopt = compat_ip_getsockopt,
1856 #endif
1857 };
1858 EXPORT_SYMBOL(ipv4_specific);
1859
1860 #ifdef CONFIG_TCP_MD5SIG
1861 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1862         .md5_lookup             = tcp_v4_md5_lookup,
1863         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1864         .md5_parse              = tcp_v4_parse_md5_keys,
1865 };
1866 #endif
1867
1868 /* NOTE: A lot of things are set to zero explicitly by the call to
1869  *       sk_alloc(), so they need not be done here.
1870  */
1871 static int tcp_v4_init_sock(struct sock *sk)
1872 {
1873         struct inet_connection_sock *icsk = inet_csk(sk);
1874         struct tcp_sock *tp = tcp_sk(sk);
1875
1876         skb_queue_head_init(&tp->out_of_order_queue);
1877         tcp_init_xmit_timers(sk);
1878         tcp_prequeue_init(tp);
1879
1880         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1881         tp->mdev = TCP_TIMEOUT_INIT;
1882
1883         /* So many TCP implementations out there (incorrectly) count the
1884          * initial SYN frame in their delayed-ACK and congestion control
1885          * algorithms that we must have the following bandaid to talk
1886          * efficiently to them.  -DaveM
1887          */
1888         tp->snd_cwnd = TCP_INIT_CWND;
1889
1890         /* See draft-stevens-tcpca-spec-01 for discussion of the
1891          * initialization of these values.
1892          */
1893         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1894         tp->snd_cwnd_clamp = ~0;
1895         tp->mss_cache = TCP_MSS_DEFAULT;
1896
1897         tp->reordering = sysctl_tcp_reordering;
1898         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1899
1900         sk->sk_state = TCP_CLOSE;
1901
1902         sk->sk_write_space = sk_stream_write_space;
1903         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1904
1905         icsk->icsk_af_ops = &ipv4_specific;
1906         icsk->icsk_sync_mss = tcp_sync_mss;
1907 #ifdef CONFIG_TCP_MD5SIG
1908         tp->af_specific = &tcp_sock_ipv4_specific;
1909 #endif
1910
1911         /* TCP Cookie Transactions */
1912         if (sysctl_tcp_cookie_size > 0) {
1913                 /* Default, cookies without s_data_payload. */
1914                 tp->cookie_values =
1915                         kzalloc(sizeof(*tp->cookie_values),
1916                                 sk->sk_allocation);
1917                 if (tp->cookie_values != NULL)
1918                         kref_init(&tp->cookie_values->kref);
1919         }
1920         /* Presumed zeroed, in order of appearance:
1921          *      cookie_in_always, cookie_out_never,
1922          *      s_data_constant, s_data_in, s_data_out
1923          */
1924         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1925         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1926
1927         local_bh_disable();
1928         sock_update_memcg(sk);
1929         sk_sockets_allocated_inc(sk);
1930         local_bh_enable();
1931
1932         return 0;
1933 }
1934
1935 void tcp_v4_destroy_sock(struct sock *sk)
1936 {
1937         struct tcp_sock *tp = tcp_sk(sk);
1938
1939         tcp_clear_xmit_timers(sk);
1940
1941         tcp_cleanup_congestion_control(sk);
1942
1943         /* Clean up the write buffer. */
1944         tcp_write_queue_purge(sk);
1945
1946         /* Cleans up our, hopefully empty, out_of_order_queue. */
1947         __skb_queue_purge(&tp->out_of_order_queue);
1948
1949 #ifdef CONFIG_TCP_MD5SIG
1950         /* Clean up the MD5 key list, if any */
1951         if (tp->md5sig_info) {
1952                 tcp_clear_md5_list(sk);
1953                 kfree_rcu(tp->md5sig_info, rcu);
1954                 tp->md5sig_info = NULL;
1955         }
1956 #endif
1957
1958 #ifdef CONFIG_NET_DMA
1959         /* Cleans up our sk_async_wait_queue */
1960         __skb_queue_purge(&sk->sk_async_wait_queue);
1961 #endif
1962
1963         /* Clean up the prequeue; it really should be empty by now */
1964         __skb_queue_purge(&tp->ucopy.prequeue);
1965
1966         /* Clean up a referenced TCP bind bucket. */
1967         if (inet_csk(sk)->icsk_bind_hash)
1968                 inet_put_port(sk);
1969
1970         /*
1971          * If a cached sendmsg page exists, toss it.
1972          */
1973         if (sk->sk_sndmsg_page) {
1974                 __free_page(sk->sk_sndmsg_page);
1975                 sk->sk_sndmsg_page = NULL;
1976         }
1977
1978         /* TCP Cookie Transactions */
1979         if (tp->cookie_values != NULL) {
1980                 kref_put(&tp->cookie_values->kref,
1981                          tcp_cookie_values_release);
1982                 tp->cookie_values = NULL;
1983         }
1984
1985         sk_sockets_allocated_dec(sk);
1986         sock_release_memcg(sk);
1987 }
1988 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1989
1990 #ifdef CONFIG_PROC_FS
1991 /* Proc filesystem TCP sock list dumping. */
1992
1993 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1994 {
1995         return hlist_nulls_empty(head) ? NULL :
1996                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1997 }
1998
1999 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2000 {
2001         return !is_a_nulls(tw->tw_node.next) ?
2002                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2003 }
2004
2005 /*
2006  * Get the next listener socket following cur.  If cur is NULL, get the first
2007  * socket starting from the bucket given in st->bucket; when st->bucket is
2008  * zero, the very first socket in the hash table is returned.
2009  */
2010 static void *listening_get_next(struct seq_file *seq, void *cur)
2011 {
2012         struct inet_connection_sock *icsk;
2013         struct hlist_nulls_node *node;
2014         struct sock *sk = cur;
2015         struct inet_listen_hashbucket *ilb;
2016         struct tcp_iter_state *st = seq->private;
2017         struct net *net = seq_file_net(seq);
2018
2019         if (!sk) {
2020                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2021                 spin_lock_bh(&ilb->lock);
2022                 sk = sk_nulls_head(&ilb->head);
2023                 st->offset = 0;
2024                 goto get_sk;
2025         }
2026         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2027         ++st->num;
2028         ++st->offset;
2029
2030         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2031                 struct request_sock *req = cur;
2032
2033                 icsk = inet_csk(st->syn_wait_sk);
2034                 req = req->dl_next;
2035                 while (1) {
2036                         while (req) {
2037                                 if (req->rsk_ops->family == st->family) {
2038                                         cur = req;
2039                                         goto out;
2040                                 }
2041                                 req = req->dl_next;
2042                         }
2043                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2044                                 break;
2045 get_req:
2046                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2047                 }
2048                 sk        = sk_nulls_next(st->syn_wait_sk);
2049                 st->state = TCP_SEQ_STATE_LISTENING;
2050                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2051         } else {
2052                 icsk = inet_csk(sk);
2053                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2054                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2055                         goto start_req;
2056                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2057                 sk = sk_nulls_next(sk);
2058         }
2059 get_sk:
2060         sk_nulls_for_each_from(sk, node) {
2061                 if (!net_eq(sock_net(sk), net))
2062                         continue;
2063                 if (sk->sk_family == st->family) {
2064                         cur = sk;
2065                         goto out;
2066                 }
2067                 icsk = inet_csk(sk);
2068                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2069                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2070 start_req:
2071                         st->uid         = sock_i_uid(sk);
2072                         st->syn_wait_sk = sk;
2073                         st->state       = TCP_SEQ_STATE_OPENREQ;
2074                         st->sbucket     = 0;
2075                         goto get_req;
2076                 }
2077                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2078         }
2079         spin_unlock_bh(&ilb->lock);
2080         st->offset = 0;
2081         if (++st->bucket < INET_LHTABLE_SIZE) {
2082                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2083                 spin_lock_bh(&ilb->lock);
2084                 sk = sk_nulls_head(&ilb->head);
2085                 goto get_sk;
2086         }
2087         cur = NULL;
2088 out:
2089         return cur;
2090 }
2091
2092 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2093 {
2094         struct tcp_iter_state *st = seq->private;
2095         void *rc;
2096
2097         st->bucket = 0;
2098         st->offset = 0;
2099         rc = listening_get_next(seq, NULL);
2100
2101         while (rc && *pos) {
2102                 rc = listening_get_next(seq, rc);
2103                 --*pos;
2104         }
2105         return rc;
2106 }
2107
2108 static inline int empty_bucket(struct tcp_iter_state *st)
2109 {
2110         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2111                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2112 }
2113
2114 /*
2115  * Get the first established socket, starting from the bucket given in st->bucket.
2116  * If st->bucket is zero, the very first socket in the hash is returned.
2117  */
2118 static void *established_get_first(struct seq_file *seq)
2119 {
2120         struct tcp_iter_state *st = seq->private;
2121         struct net *net = seq_file_net(seq);
2122         void *rc = NULL;
2123
2124         st->offset = 0;
2125         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2126                 struct sock *sk;
2127                 struct hlist_nulls_node *node;
2128                 struct inet_timewait_sock *tw;
2129                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2130
2131                 /* Lockless fast path for the common case of empty buckets */
2132                 if (empty_bucket(st))
2133                         continue;
2134
2135                 spin_lock_bh(lock);
2136                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2137                         if (sk->sk_family != st->family ||
2138                             !net_eq(sock_net(sk), net)) {
2139                                 continue;
2140                         }
2141                         rc = sk;
2142                         goto out;
2143                 }
2144                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2145                 inet_twsk_for_each(tw, node,
2146                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2147                         if (tw->tw_family != st->family ||
2148                             !net_eq(twsk_net(tw), net)) {
2149                                 continue;
2150                         }
2151                         rc = tw;
2152                         goto out;
2153                 }
2154                 spin_unlock_bh(lock);
2155                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2156         }
2157 out:
2158         return rc;
2159 }
2160
2161 static void *established_get_next(struct seq_file *seq, void *cur)
2162 {
2163         struct sock *sk = cur;
2164         struct inet_timewait_sock *tw;
2165         struct hlist_nulls_node *node;
2166         struct tcp_iter_state *st = seq->private;
2167         struct net *net = seq_file_net(seq);
2168
2169         ++st->num;
2170         ++st->offset;
2171
2172         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2173                 tw = cur;
2174                 tw = tw_next(tw);
2175 get_tw:
2176                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2177                         tw = tw_next(tw);
2178                 }
2179                 if (tw) {
2180                         cur = tw;
2181                         goto out;
2182                 }
2183                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2184                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2185
2186                 /* Look for the next non-empty bucket */
2187                 st->offset = 0;
2188                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2189                                 empty_bucket(st))
2190                         ;
2191                 if (st->bucket > tcp_hashinfo.ehash_mask)
2192                         return NULL;
2193
2194                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2195                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2196         } else
2197                 sk = sk_nulls_next(sk);
2198
2199         sk_nulls_for_each_from(sk, node) {
2200                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2201                         goto found;
2202         }
2203
2204         st->state = TCP_SEQ_STATE_TIME_WAIT;
2205         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2206         goto get_tw;
2207 found:
2208         cur = sk;
2209 out:
2210         return cur;
2211 }
2212
2213 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2214 {
2215         struct tcp_iter_state *st = seq->private;
2216         void *rc;
2217
2218         st->bucket = 0;
2219         rc = established_get_first(seq);
2220
2221         while (rc && pos) {
2222                 rc = established_get_next(seq, rc);
2223                 --pos;
2224         }
2225         return rc;
2226 }
2227
2228 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2229 {
2230         void *rc;
2231         struct tcp_iter_state *st = seq->private;
2232
2233         st->state = TCP_SEQ_STATE_LISTENING;
2234         rc        = listening_get_idx(seq, &pos);
2235
2236         if (!rc) {
2237                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2238                 rc        = established_get_idx(seq, pos);
2239         }
2240
2241         return rc;
2242 }
2243
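/* Resume a partially consumed dump: restart from the bucket recorded in
 * st->bucket and replay st->offset entries, so consecutive read()s on the
 * seq file do not rescan the whole hash table from the beginning.
 */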
2244 static void *tcp_seek_last_pos(struct seq_file *seq)
2245 {
2246         struct tcp_iter_state *st = seq->private;
2247         int offset = st->offset;
2248         int orig_num = st->num;
2249         void *rc = NULL;
2250
2251         switch (st->state) {
2252         case TCP_SEQ_STATE_OPENREQ:
2253         case TCP_SEQ_STATE_LISTENING:
2254                 if (st->bucket >= INET_LHTABLE_SIZE)
2255                         break;
2256                 st->state = TCP_SEQ_STATE_LISTENING;
2257                 rc = listening_get_next(seq, NULL);
2258                 while (offset-- && rc)
2259                         rc = listening_get_next(seq, rc);
2260                 if (rc)
2261                         break;
2262                 st->bucket = 0;
2263                 /* Fallthrough */
2264         case TCP_SEQ_STATE_ESTABLISHED:
2265         case TCP_SEQ_STATE_TIME_WAIT:
2266                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2267                 if (st->bucket > tcp_hashinfo.ehash_mask)
2268                         break;
2269                 rc = established_get_first(seq);
2270                 while (offset-- && rc)
2271                         rc = established_get_next(seq, rc);
2272         }
2273
2274         st->num = orig_num;
2275
2276         return rc;
2277 }
2278
2279 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2280 {
2281         struct tcp_iter_state *st = seq->private;
2282         void *rc;
2283
2284         if (*pos && *pos == st->last_pos) {
2285                 rc = tcp_seek_last_pos(seq);
2286                 if (rc)
2287                         goto out;
2288         }
2289
2290         st->state = TCP_SEQ_STATE_LISTENING;
2291         st->num = 0;
2292         st->bucket = 0;
2293         st->offset = 0;
2294         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2295
2296 out:
2297         st->last_pos = *pos;
2298         return rc;
2299 }
2300
2301 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2302 {
2303         struct tcp_iter_state *st = seq->private;
2304         void *rc = NULL;
2305
2306         if (v == SEQ_START_TOKEN) {
2307                 rc = tcp_get_idx(seq, 0);
2308                 goto out;
2309         }
2310
2311         switch (st->state) {
2312         case TCP_SEQ_STATE_OPENREQ:
2313         case TCP_SEQ_STATE_LISTENING:
2314                 rc = listening_get_next(seq, v);
2315                 if (!rc) {
2316                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2317                         st->bucket = 0;
2318                         st->offset = 0;
2319                         rc        = established_get_first(seq);
2320                 }
2321                 break;
2322         case TCP_SEQ_STATE_ESTABLISHED:
2323         case TCP_SEQ_STATE_TIME_WAIT:
2324                 rc = established_get_next(seq, v);
2325                 break;
2326         }
2327 out:
2328         ++*pos;
2329         st->last_pos = *pos;
2330         return rc;
2331 }
2332
2333 static void tcp_seq_stop(struct seq_file *seq, void *v)
2334 {
2335         struct tcp_iter_state *st = seq->private;
2336
2337         switch (st->state) {
2338         case TCP_SEQ_STATE_OPENREQ:
2339                 if (v) {
2340                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2341                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2342                 }
2343         case TCP_SEQ_STATE_LISTENING:
2344                 if (v != SEQ_START_TOKEN)
2345                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2346                 break;
2347         case TCP_SEQ_STATE_TIME_WAIT:
2348         case TCP_SEQ_STATE_ESTABLISHED:
2349                 if (v)
2350                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2351                 break;
2352         }
2353 }
2354
2355 int tcp_seq_open(struct inode *inode, struct file *file)
2356 {
2357         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2358         struct tcp_iter_state *s;
2359         int err;
2360
2361         err = seq_open_net(inode, file, &afinfo->seq_ops,
2362                           sizeof(struct tcp_iter_state));
2363         if (err < 0)
2364                 return err;
2365
2366         s = ((struct seq_file *)file->private_data)->private;
2367         s->family               = afinfo->family;
2368         s->last_pos             = 0;
2369         return 0;
2370 }
2371 EXPORT_SYMBOL(tcp_seq_open);
2372
2373 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2374 {
2375         int rc = 0;
2376         struct proc_dir_entry *p;
2377
2378         afinfo->seq_ops.start           = tcp_seq_start;
2379         afinfo->seq_ops.next            = tcp_seq_next;
2380         afinfo->seq_ops.stop            = tcp_seq_stop;
2381
2382         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2383                              afinfo->seq_fops, afinfo);
2384         if (!p)
2385                 rc = -ENOMEM;
2386         return rc;
2387 }
2388 EXPORT_SYMBOL(tcp_proc_register);
2389
2390 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2391 {
2392         proc_net_remove(net, afinfo->name);
2393 }
2394 EXPORT_SYMBOL(tcp_proc_unregister);
2395
2396 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2397                          struct seq_file *f, int i, int uid, int *len)
2398 {
2399         const struct inet_request_sock *ireq = inet_rsk(req);
2400         int ttd = req->expires - jiffies;
2401
2402         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2403                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2404                 i,
2405                 ireq->loc_addr,
2406                 ntohs(inet_sk(sk)->inet_sport),
2407                 ireq->rmt_addr,
2408                 ntohs(ireq->rmt_port),
2409                 TCP_SYN_RECV,
2410                 0, 0, /* could print option size, but that is af dependent. */
2411                 1,    /* timers active (only the expire timer) */
2412                 jiffies_to_clock_t(ttd),
2413                 req->retrans,
2414                 uid,
2415                 0,  /* non standard timer */
2416                 0, /* open_requests have no inode */
2417                 atomic_read(&sk->sk_refcnt),
2418                 req,
2419                 len);
2420 }
2421
2422 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2423 {
2424         int timer_active;
2425         unsigned long timer_expires;
2426         const struct tcp_sock *tp = tcp_sk(sk);
2427         const struct inet_connection_sock *icsk = inet_csk(sk);
2428         const struct inet_sock *inet = inet_sk(sk);
2429         __be32 dest = inet->inet_daddr;
2430         __be32 src = inet->inet_rcv_saddr;
2431         __u16 destp = ntohs(inet->inet_dport);
2432         __u16 srcp = ntohs(inet->inet_sport);
2433         int rx_queue;
2434
2435         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2436                 timer_active    = 1;
2437                 timer_expires   = icsk->icsk_timeout;
2438         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2439                 timer_active    = 4;
2440                 timer_expires   = icsk->icsk_timeout;
2441         } else if (timer_pending(&sk->sk_timer)) {
2442                 timer_active    = 2;
2443                 timer_expires   = sk->sk_timer.expires;
2444         } else {
2445                 timer_active    = 0;
2446                 timer_expires = jiffies;
2447         }
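        /* timer_active encodes the 'tr' column of the output: 1 for the
         * retransmit timer, 4 for the zero-window probe timer, 2 when
         * sk_timer (e.g. keepalive) is pending, 0 for none.
         */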
2448
2449         if (sk->sk_state == TCP_LISTEN)
2450                 rx_queue = sk->sk_ack_backlog;
2451         else
2452                  * Because we don't lock the socket, we might find a transient negative value
2453                  * because we dont lock socket, we might find a transient negative value
2454                  */
2455                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2456
2457         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2458                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2459                 i, src, srcp, dest, destp, sk->sk_state,
2460                 tp->write_seq - tp->snd_una,
2461                 rx_queue,
2462                 timer_active,
2463                 jiffies_to_clock_t(timer_expires - jiffies),
2464                 icsk->icsk_retransmits,
2465                 sock_i_uid(sk),
2466                 icsk->icsk_probes_out,
2467                 sock_i_ino(sk),
2468                 atomic_read(&sk->sk_refcnt), sk,
2469                 jiffies_to_clock_t(icsk->icsk_rto),
2470                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2471                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2472                 tp->snd_cwnd,
2473                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2474                 len);
2475 }
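
/* Illustrative only: on a little-endian machine the format above yields
 * lines such as
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  0 0 12345 1 ...
 *
 * i.e. 127.0.0.1:22 in TCP_LISTEN (state 0x0A); the columns follow the
 * header printed by tcp4_seq_show() below (the inode 12345 is made up).
 */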
2476
2477 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2478                                struct seq_file *f, int i, int *len)
2479 {
2480         __be32 dest, src;
2481         __u16 destp, srcp;
2482         int ttd = tw->tw_ttd - jiffies;
2483
2484         if (ttd < 0)
2485                 ttd = 0;
2486
2487         dest  = tw->tw_daddr;
2488         src   = tw->tw_rcv_saddr;
2489         destp = ntohs(tw->tw_dport);
2490         srcp  = ntohs(tw->tw_sport);
2491
2492         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2493                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2494                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2495                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2496                 atomic_read(&tw->tw_refcnt), tw, len);
2497 }
2498
2499 #define TMPSZ 150
2500
2501 static int tcp4_seq_show(struct seq_file *seq, void *v)
2502 {
2503         struct tcp_iter_state *st;
2504         int len;
2505
2506         if (v == SEQ_START_TOKEN) {
2507                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2508                            "  sl  local_address rem_address   st tx_queue "
2509                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2510                            "inode");
2511                 goto out;
2512         }
2513         st = seq->private;
2514
2515         switch (st->state) {
2516         case TCP_SEQ_STATE_LISTENING:
2517         case TCP_SEQ_STATE_ESTABLISHED:
2518                 get_tcp4_sock(v, seq, st->num, &len);
2519                 break;
2520         case TCP_SEQ_STATE_OPENREQ:
2521                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2522                 break;
2523         case TCP_SEQ_STATE_TIME_WAIT:
2524                 get_timewait4_sock(v, seq, st->num, &len);
2525                 break;
2526         }
2527         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2528 out:
2529         return 0;
2530 }
2531
2532 static const struct file_operations tcp_afinfo_seq_fops = {
2533         .owner   = THIS_MODULE,
2534         .open    = tcp_seq_open,
2535         .read    = seq_read,
2536         .llseek  = seq_lseek,
2537         .release = seq_release_net
2538 };
2539
2540 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2541         .name           = "tcp",
2542         .family         = AF_INET,
2543         .seq_fops       = &tcp_afinfo_seq_fops,
2544         .seq_ops        = {
2545                 .show           = tcp4_seq_show,
2546         },
2547 };
2548
2549 static int __net_init tcp4_proc_init_net(struct net *net)
2550 {
2551         return tcp_proc_register(net, &tcp4_seq_afinfo);
2552 }
2553
2554 static void __net_exit tcp4_proc_exit_net(struct net *net)
2555 {
2556         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2557 }
2558
2559 static struct pernet_operations tcp4_net_ops = {
2560         .init = tcp4_proc_init_net,
2561         .exit = tcp4_proc_exit_net,
2562 };
2563
2564 int __init tcp4_proc_init(void)
2565 {
2566         return register_pernet_subsys(&tcp4_net_ops);
2567 }
2568
2569 void tcp4_proc_exit(void)
2570 {
2571         unregister_pernet_subsys(&tcp4_net_ops);
2572 }
2573 #endif /* CONFIG_PROC_FS */
2574
2575 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2576 {
2577         const struct iphdr *iph = skb_gro_network_header(skb);
2578
2579         switch (skb->ip_summed) {
2580         case CHECKSUM_COMPLETE:
2581                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2582                                   skb->csum)) {
2583                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2584                         break;
2585                 }
2586
2587                 /* fall through */
2588         case CHECKSUM_NONE:
2589                 NAPI_GRO_CB(skb)->flush = 1;
2590                 return NULL;
2591         }
2592
2593         return tcp_gro_receive(head, skb);
2594 }
2595
2596 int tcp4_gro_complete(struct sk_buff *skb)
2597 {
2598         const struct iphdr *iph = ip_hdr(skb);
2599         struct tcphdr *th = tcp_hdr(skb);
2600
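        /* Store the pseudo-header-only sum (not yet complemented) in
         * th->check - the CHECKSUM_PARTIAL convention - so the payload
         * part can be completed per segment once the GRO'd super-packet
         * is resegmented.
         */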
2601         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2602                                   iph->saddr, iph->daddr, 0);
2603         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2604
2605         return tcp_gro_complete(skb);
2606 }
2607
2608 struct proto tcp_prot = {
2609         .name                   = "TCP",
2610         .owner                  = THIS_MODULE,
2611         .close                  = tcp_close,
2612         .connect                = tcp_v4_connect,
2613         .disconnect             = tcp_disconnect,
2614         .accept                 = inet_csk_accept,
2615         .ioctl                  = tcp_ioctl,
2616         .init                   = tcp_v4_init_sock,
2617         .destroy                = tcp_v4_destroy_sock,
2618         .shutdown               = tcp_shutdown,
2619         .setsockopt             = tcp_setsockopt,
2620         .getsockopt             = tcp_getsockopt,
2621         .recvmsg                = tcp_recvmsg,
2622         .sendmsg                = tcp_sendmsg,
2623         .sendpage               = tcp_sendpage,
2624         .backlog_rcv            = tcp_v4_do_rcv,
2625         .hash                   = inet_hash,
2626         .unhash                 = inet_unhash,
2627         .get_port               = inet_csk_get_port,
2628         .enter_memory_pressure  = tcp_enter_memory_pressure,
2629         .sockets_allocated      = &tcp_sockets_allocated,
2630         .orphan_count           = &tcp_orphan_count,
2631         .memory_allocated       = &tcp_memory_allocated,
2632         .memory_pressure        = &tcp_memory_pressure,
2633         .sysctl_wmem            = sysctl_tcp_wmem,
2634         .sysctl_rmem            = sysctl_tcp_rmem,
2635         .max_header             = MAX_TCP_HEADER,
2636         .obj_size               = sizeof(struct tcp_sock),
2637         .slab_flags             = SLAB_DESTROY_BY_RCU,
2638         .twsk_prot              = &tcp_timewait_sock_ops,
2639         .rsk_prot               = &tcp_request_sock_ops,
2640         .h.hashinfo             = &tcp_hashinfo,
2641         .no_autobind            = true,
2642 #ifdef CONFIG_COMPAT
2643         .compat_setsockopt      = compat_tcp_setsockopt,
2644         .compat_getsockopt      = compat_tcp_getsockopt,
2645 #endif
2646 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2647         .init_cgroup            = tcp_init_cgroup,
2648         .destroy_cgroup         = tcp_destroy_cgroup,
2649         .proto_cgroup           = tcp_proto_cgroup,
2650 #endif
2651 };
2652 EXPORT_SYMBOL(tcp_prot);
2653
2654 static int __net_init tcp_sk_init(struct net *net)
2655 {
2656         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2657                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2658 }
2659
2660 static void __net_exit tcp_sk_exit(struct net *net)
2661 {
2662         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2663 }
2664
2665 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2666 {
2667         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2668 }
2669
2670 static struct pernet_operations __net_initdata tcp_sk_ops = {
2671        .init       = tcp_sk_init,
2672        .exit       = tcp_sk_exit,
2673        .exit_batch = tcp_sk_exit_batch,
2674 };
2675
2676 void __init tcp_v4_init(void)
2677 {
2678         inet_hashinfo_init(&tcp_hashinfo);
2679         if (register_pernet_subsys(&tcp_sk_ops))
2680                 panic("Failed to create the TCP control socket.\n");
2681 }