2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
/*
 * Last IPv6 transmit step for one skb: marks the packet as ETH_P_IPV6,
 * applies the multicast-specific handling when the destination is a
 * multicast address, then resolves the next-hop neighbour and hands the
 * skb to it.
 *
 * Multicast destinations: when the output device is not a loopback device,
 * the sending socket allows multicast loopback (sk_mc_loop) and either a
 * multicast routing socket exists for a packet not flagged
 * IP6SKB_FORWARDED or ipv6_chk_mcast_addr() matches on this device, the skb
 * is cloned and the clone is passed to the NF_INET_POST_ROUTING hook.
 * A hop limit of 0 is accounted as IPSTATS_MIB_OUTDISCARDS. Multicast
 * output is accounted under IPSTATS_MIB_OUTMCAST. Destinations whose
 * multicast scope is node-local or narrower are singled out when the
 * device is not a loopback device.
 *
 * Neighbour resolution: rt6_nexthop() supplies the next hop, which is
 * looked up with __ipv6_neigh_lookup_noref() and created on demand with
 * __neigh_create(). When that yields a usable neighbour the result of
 * dst_neigh_output() is stored in ret; otherwise IPSTATS_MIB_OUTNOROUTES
 * is incremented. Both outcomes leave the RCU-bh read section through
 * rcu_read_unlock_bh().
 */
59 static int ip6_finish_output2(struct sk_buff *skb)
61 struct dst_entry *dst = skb_dst(skb);
62 struct net_device *dev = dst->dev;
63 struct neighbour *neigh;
64 struct in6_addr *nexthop;
67 skb->protocol = htons(ETH_P_IPV6);
70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
74 ((mroute6_socket(dev_net(dev), skb) &&
75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 &ipv6_hdr(skb)->saddr))) {
78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
80 /* Do not check for IFF_ALLMULTI; multicast routing
81 is not supported in any case.
84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 newskb, NULL, newskb->dev,
88 if (ipv6_hdr(skb)->hop_limit == 0) {
89 IP6_INC_STATS(dev_net(dev), idev,
90 IPSTATS_MIB_OUTDISCARDS);
96 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
99 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 IPV6_ADDR_SCOPE_NODELOCAL &&
101 !(dev->flags & IFF_LOOPBACK)) {
108 nexthop = rt6_nexthop((struct rt6_info *)dst);
109 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 if (unlikely(!neigh))
111 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 if (!IS_ERR(neigh)) {
113 ret = dst_neigh_output(dst, neigh, skb);
114 rcu_read_unlock_bh();
117 rcu_read_unlock_bh();
119 IP6_INC_STATS(dev_net(dst->dev),
120 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * Decide whether @skb has to be fragmented before it is transmitted.
 *
 * ip6_fragment() is used, with ip6_finish_output2 as the per-fragment
 * output callback, when any of the following holds:
 *  - the packet is longer than ip6_skb_dst_mtu() and is not a GSO packet;
 *  - dst_allfrag() is true for the attached route;
 *  - IP6CB(skb)->frag_max_size is set and skb->len exceeds it.
 * Otherwise the skb goes straight to ip6_finish_output2().
 *
 * Returns the value returned by whichever of the two paths was taken.
 */
125 static int ip6_finish_output(struct sk_buff *skb)
127 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 dst_allfrag(skb_dst(skb)) ||
129 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 return ip6_fragment(skb, ip6_finish_output2);
132 return ip6_finish_output2(skb);
/*
 * Output entry point for an skb that already has a dst attached.
 *
 * The outgoing device and its inet6_dev are taken from skb_dst(skb).
 * When IPv6 is disabled on that interface (idev->cnf.disable_ipv6) the
 * packet is accounted as IPSTATS_MIB_OUTDISCARDS. Otherwise the skb is
 * passed to the NF_INET_POST_ROUTING hook through NF_HOOK_COND; the
 * condition handed to NF_HOOK_COND is that the packet is not flagged
 * IP6SKB_REROUTED.
 */
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
137 struct net_device *dev = skb_dst(skb)->dev;
138 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 if (unlikely(idev->cnf.disable_ipv6)) {
140 IP6_INC_STATS(dev_net(dev), idev,
141 IPSTATS_MIB_OUTDISCARDS);
146 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
148 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
152 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * Prepend the IPv6 header (and any extension headers from @opt) to @skb and
 * send it through the NF_INET_LOCAL_OUT hook.
 *
 * @sk:     sending socket; supplies hop limit (np->hop_limit), priority and
 *          mark, and is charged for a reallocated skb (skb_set_owner_w)
 * @skb:    packet whose dst is already set; its current length is the
 *          payload length
 * @fl6:    flow providing source address, destination address, next-header
 *          protocol and flow label
 * @opt:    extension header options; opt_nflen + opt_flen bytes are added
 *          to both the required headroom and the payload length
 * @tclass: traffic class written into the header via ip6_flow_hdr()
 *
 * Headroom: the header needs opt lengths + sizeof(struct ipv6hdr) +
 * LL_RESERVED_SPACE(dst->dev); when the skb has less, it is reallocated
 * with skb_realloc_headroom(), and IPSTATS_MIB_OUTDISCARDS is counted on
 * that path (presumably when the reallocation fails).
 *
 * Header fields: payload_len is seg_len, nexthdr is the protocol left in
 * proto after the option push helpers, hop_limit comes from np->hop_limit
 * or ip6_dst_hoplimit(dst), saddr from the flow. daddr is *first_hop, which
 * starts as the flow's daddr; ipv6_push_nfrag_opts() receives &first_hop
 * and may therefore redirect it.
 *
 * Size check: if skb->len <= mtu, or skb->ignore_df is set, or the skb is
 * GSO, IPSTATS_MIB_OUT is updated and the packet continues to dst_output
 * via the hook. Otherwise the socket is told EMSGSIZE through
 * ipv6_local_error() and IPSTATS_MIB_FRAGFAILS is counted.
 */
155 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
156 struct ipv6_txoptions *opt, int tclass)
158 struct net *net = sock_net(sk);
159 struct ipv6_pinfo *np = inet6_sk(sk);
160 struct in6_addr *first_hop = &fl6->daddr;
161 struct dst_entry *dst = skb_dst(skb);
163 u8 proto = fl6->flowi6_proto;
164 int seg_len = skb->len;
169 unsigned int head_room;
171 /* First: exthdrs may take lots of space (~8K for now)
172 MAX_HEADER is not enough.
174 head_room = opt->opt_nflen + opt->opt_flen;
175 seg_len += head_room;
176 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 if (skb_headroom(skb) < head_room) {
179 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
182 IPSTATS_MIB_OUTDISCARDS);
188 skb_set_owner_w(skb, sk);
191 ipv6_push_frag_opts(skb, opt, &proto);
193 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
196 skb_push(skb, sizeof(struct ipv6hdr));
197 skb_reset_network_header(skb);
201 * Fill in the IPv6 header
204 hlimit = np->hop_limit;
206 hlimit = ip6_dst_hoplimit(dst);
208 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
211 hdr->payload_len = htons(seg_len);
212 hdr->nexthdr = proto;
213 hdr->hop_limit = hlimit;
215 hdr->saddr = fl6->saddr;
216 hdr->daddr = *first_hop;
218 skb->protocol = htons(ETH_P_IPV6);
219 skb->priority = sk->sk_priority;
220 skb->mark = sk->sk_mark;
223 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
224 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
225 IPSTATS_MIB_OUT, skb->len);
226 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
227 dst->dev, dst_output);
231 ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
232 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
237 EXPORT_SYMBOL(ip6_xmit);
/*
 * Offer a Router Alert packet to the sockets registered on ip6_ra_chain.
 *
 * @skb: the packet being forwarded
 * @sel: Router Alert value to match against each chain entry's ra->sel
 *
 * The chain is walked under read_lock(&ip6_ra_lock). An entry matches when
 * it has a socket, its selector equals @sel, and the socket is either not
 * bound to a device or bound to the device the skb arrived on
 * (skb->dev->ifindex). "last" tracks a matching socket: clones of the skb
 * are delivered with rawv6_rcv(last, skb2) and the original skb is
 * delivered with rawv6_rcv(last, skb), so the original is consumed by one
 * receiver only.
 *
 * NOTE(review): the return statements are not visible here; ip6_forward()
 * tests the result for non-zero, presumably meaning "delivered" - confirm.
 */
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
241 struct ip6_ra_chain *ra;
242 struct sock *last = NULL;
244 read_lock(&ip6_ra_lock);
245 for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 struct sock *sk = ra->sk;
247 if (sk && ra->sel == sel &&
248 (!sk->sk_bound_dev_if ||
249 sk->sk_bound_dev_if == skb->dev->ifindex)) {
251 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
253 rawv6_rcv(last, skb2);
260 rawv6_rcv(last, skb);
261 read_unlock(&ip6_ra_lock);
264 read_unlock(&ip6_ra_lock);
/*
 * Classify a packet addressed to a proxied (proxy NDP) destination.
 *
 * The upper-layer protocol is located first: when the IPv6 header's
 * nexthdr is an extension header, ipv6_skip_exthdr() walks past the chain;
 * otherwise the offset is simply sizeof(struct ipv6hdr).
 *
 * For ICMPv6 the first byte of the ICMPv6 header is made accessible with
 * pskb_may_pull() and the message type is inspected; router and neighbour
 * solicitation/advertisement messages are handled as a group.
 *
 * A link-local destination cannot be forwarded by the proxying router, so
 * dst_link_failure() is invoked to signal the sender.
 *
 * NOTE(review): the return statements are not visible here. The caller
 * ip6_forward() treats a negative result as "discard" and, judging by the
 * call to ip6_input(), a positive one as "deliver locally" - confirm.
 */
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
270 struct ipv6hdr *hdr = ipv6_hdr(skb);
271 u8 nexthdr = hdr->nexthdr;
275 if (ipv6_ext_hdr(nexthdr)) {
276 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
280 offset = sizeof(struct ipv6hdr);
282 if (nexthdr == IPPROTO_ICMPV6) {
283 struct icmp6hdr *icmp6;
285 if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 offset + 1 - skb->data)))
289 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
291 switch (icmp6->icmp6_type) {
292 case NDISC_ROUTER_SOLICITATION:
293 case NDISC_ROUTER_ADVERTISEMENT:
294 case NDISC_NEIGHBOUR_SOLICITATION:
295 case NDISC_NEIGHBOUR_ADVERTISEMENT:
297 /* For reaction involving unicast neighbor discovery
298 * message destined to the proxied address, pass it to
308 * The proxying router can't forward traffic sent to a link-local
309 * address, so signal the sender and discard the packet. This
310 * behavior is clarified by the MIPv6 specification.
312 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 dst_link_failure(skb);
/*
 * Completion step of forwarding: hands @skb to dst_output() and returns
 * its result unchanged.
 */
320 static inline int ip6_forward_finish(struct sk_buff *skb)
322 return dst_output(skb);
/*
 * Work out the MTU to apply when forwarding over @dst.
 *
 * A locked RTAX_MTU route metric takes precedence and is read with
 * dst_metric_raw(). Otherwise the per-interface value idev->cnf.mtu6 of
 * the output device (__in6_dev_get(dst->dev)) is used.
 *
 * NOTE(review): the fallback applied when neither source yields a value is
 * not visible here.
 */
325 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
328 struct inet6_dev *idev;
330 if (dst_metric_locked(dst, RTAX_MTU)) {
331 mtu = dst_metric_raw(dst, RTAX_MTU);
338 idev = __in6_dev_get(dst->dev);
340 mtu = idev->cnf.mtu6;
/*
 * Tell whether @skb is too large to be forwarded with the given @mtu.
 *
 * Two special cases are examined: a recorded IP6CB(skb)->frag_max_size that
 * is larger than @mtu, and a GSO packet whose per-segment network length
 * (skb_gso_network_seglen) still fits within @mtu.
 *
 * NOTE(review): the return statements are not visible here; presumably the
 * first case yields true and the second false - confirm.
 */
346 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
351 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
352 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
358 if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
/*
 * Forward a received IPv6 packet along the dst already attached to it.
 *
 * Steps, in the order they appear:
 *  - forwarding must be enabled (net->ipv6.devconf_all->forwarding), the
 *    packet must be PACKET_HOST and must not trip skb_warn_if_lro();
 *  - the XFRM_POLICY_FWD policy check must pass, otherwise
 *    IPSTATS_MIB_INDISCARDS is counted;
 *  - packets flagged IP6SKB_ROUTERALERT are offered to ip6_call_ra_chain()
 *    with the Router Alert value converted by ntohs();
 *  - hop_limit <= 1 produces an ICMPv6 Time Exceeded (hop limit) message
 *    and counts IPSTATS_MIB_INHDRERRORS;
 *  - with proxy_ndp enabled and a pneigh entry for the destination,
 *    ip6_forward_proxy_check() selects between local delivery through
 *    ip6_input() and discarding (negative result, IPSTATS_MIB_INDISCARDS);
 *  - xfrm6_route_forward() must succeed, otherwise IPSTATS_MIB_INDISCARDS;
 *  - when the packet would leave through the device it arrived on, is not
 *    source routed and carries no IPsec sec path, a redirect is sent with
 *    ndisc_send_redirect(), rate limited per destination by
 *    inet_peer_xrlim_allow(peer, 1*HZ); the redirect target is the route's
 *    gateway for RTF_GATEWAY routes and the packet's destination otherwise;
 *  - the source address type is examined: unspecified, multicast and
 *    loopback sources are caught by the check marked security critical,
 *    and link-local sources get ICMPv6 Destination Unreachable
 *    (ICMPV6_NOT_NEIGHBOUR);
 *  - the MTU comes from ip6_dst_mtu_forward(); packets rejected by
 *    ip6_pkt_too_big() get ICMPv6 Packet Too Big and are counted as
 *    IPSTATS_MIB_INTOOBIGERRORS and IPSTATS_MIB_FRAGFAILS;
 *  - skb_cow() provides room for dst->dev->hard_header_len; failure counts
 *    IPSTATS_MIB_OUTDISCARDS.
 *
 * On success IPSTATS_MIB_OUTFORWDATAGRAMS and IPSTATS_MIB_OUTOCTETS are
 * updated and the skb is passed to the NF_INET_FORWARD hook.
 * IPSTATS_MIB_INADDRERRORS is counted on an error path.
 */
364 int ip6_forward(struct sk_buff *skb)
366 struct dst_entry *dst = skb_dst(skb);
367 struct ipv6hdr *hdr = ipv6_hdr(skb);
368 struct inet6_skb_parm *opt = IP6CB(skb);
369 struct net *net = dev_net(dst->dev);
372 if (net->ipv6.devconf_all->forwarding == 0)
375 if (skb->pkt_type != PACKET_HOST)
378 if (skb_warn_if_lro(skb))
381 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
382 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
383 IPSTATS_MIB_INDISCARDS);
387 skb_forward_csum(skb);
390 * We DO NOT make any processing on
391 * RA packets, pushing them to user level AS IS
392 * without ane WARRANTY that application will be able
393 * to interpret them. The reason is that we
394 * cannot make anything clever here.
396 * We are not end-node, so that if packet contains
397 * AH/ESP, we cannot make anything.
398 * Defragmentation also would be mistake, RA packets
399 * cannot be fragmented, because there is no warranty
400 * that different fragments will go along one path. --ANK
402 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
403 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
408 * check and decrement ttl
410 if (hdr->hop_limit <= 1) {
411 /* Force OUTPUT device used as source address */
413 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
414 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
415 IPSTATS_MIB_INHDRERRORS);
421 /* XXX: idev->cnf.proxy_ndp? */
422 if (net->ipv6.devconf_all->proxy_ndp &&
423 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
424 int proxied = ip6_forward_proxy_check(skb);
426 return ip6_input(skb);
427 else if (proxied < 0) {
428 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
429 IPSTATS_MIB_INDISCARDS);
434 if (!xfrm6_route_forward(skb)) {
435 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
436 IPSTATS_MIB_INDISCARDS);
441 /* IPv6 specs say nothing about it, but it is clear that we cannot
442 send redirects to source routed frames.
443 We don't send redirects to frames decapsulated from IPsec.
445 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
446 struct in6_addr *target = NULL;
447 struct inet_peer *peer;
451 * incoming and outgoing devices are the same
455 rt = (struct rt6_info *) dst;
456 if (rt->rt6i_flags & RTF_GATEWAY)
457 target = &rt->rt6i_gateway;
459 target = &hdr->daddr;
461 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
463 /* Limit redirects both by destination (here)
464 and by source (inside ndisc_send_redirect)
466 if (inet_peer_xrlim_allow(peer, 1*HZ))
467 ndisc_send_redirect(skb, target);
471 int addrtype = ipv6_addr_type(&hdr->saddr);
473 /* This check is security critical. */
474 if (addrtype == IPV6_ADDR_ANY ||
475 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
477 if (addrtype & IPV6_ADDR_LINKLOCAL) {
478 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
479 ICMPV6_NOT_NEIGHBOUR, 0);
484 mtu = ip6_dst_mtu_forward(dst);
485 if (mtu < IPV6_MIN_MTU)
488 if (ip6_pkt_too_big(skb, mtu)) {
489 /* Again, force OUTPUT device used as source address */
491 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
492 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
493 IPSTATS_MIB_INTOOBIGERRORS);
494 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
495 IPSTATS_MIB_FRAGFAILS);
500 if (skb_cow(skb, dst->dev->hard_header_len)) {
501 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
502 IPSTATS_MIB_OUTDISCARDS);
508 /* Mangling hops number delayed to point after skb COW */
512 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
513 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
514 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
518 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * Propagate per-packet metadata from @from to the fragment @to.
 *
 * Copied fields: pkt_type, priority, protocol and mark. The route is shared
 * by taking an extra reference (dst_clone) and attaching it with
 * skb_dst_set(). tc_index is copied only when CONFIG_NET_SCHED is enabled,
 * and the security mark is copied with skb_copy_secmark().
 */
524 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
526 to->pkt_type = from->pkt_type;
527 to->priority = from->priority;
528 to->protocol = from->protocol;
530 skb_dst_set(to, dst_clone(skb_dst(from)));
532 to->mark = from->mark;
534 #ifdef CONFIG_NET_SCHED
535 to->tc_index = from->tc_index;
538 skb_copy_secmark(to, from);
/*
 * Choose the Identification value for a fragment header.
 *
 * @fhdr: fragment header whose identification field is written
 * @rt:   route of the packet; only its destination address is used
 *
 * A static seed (ip6_idents_hashrnd) is filled by net_get_random_once().
 * The route's destination address is hashed with that seed
 * (__ipv6_addr_jhash) and ip_idents_reserve(hash, 1) reserves one
 * identifier for the resulting hash. The identifier is stored in network
 * byte order (htonl).
 */
541 static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
543 static u32 ip6_idents_hashrnd __read_mostly;
546 net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
548 hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
549 id = ip_idents_reserve(hash, 1);
550 fhdr->identification = htonl(id);
/*
 * Split @skb into IPv6 fragments and pass each one to @output.
 *
 * @skb:    packet to fragment; its dst supplies the MTU and output device
 * @output: callback used to transmit a finished fragment
 *
 * hlen, obtained from ip6_find_1stfragopt(), is the number of header bytes
 * that are replicated in front of the fragment header of every fragment;
 * the per-fragment payload budget is therefore
 * mtu - hlen - sizeof(struct frag_hdr).
 *
 * Refusal: when the packet exceeds the MTU without skb->ignore_df, or its
 * recorded frag_max_size is larger than the MTU, an ICMPv6 Packet Too Big
 * is generated (with skb->dev pointed at the output device) and
 * IPSTATS_MIB_FRAGFAILS is counted.
 *
 * Fast path (skb_has_frag_list): taken when the head and every frag_list
 * member already have a usable geometry - no larger than the budget,
 * multiples of 8 bytes except for the final one, at least hlen bytes of
 * headroom, and not shared. The headers are saved with kmemdup(), a
 * fragment header is inserted behind them in each buffer, and all
 * fragments reuse the identification chosen for the first (frag_id).
 * Buffers that fail the geometry test fall back to the slow path after
 * their destructor/truesize adjustments are undone (slow_path_clean).
 *
 * Slow path: a CHECKSUM_PARTIAL packet is resolved with
 * skb_checksum_help(); then new skbs are allocated (alloc_skb, GFP_ATOMIC)
 * with room for the data, hlen, the fragment header and the device's
 * link-layer head and tail room. Metadata is copied with
 * ip6_copy_metadata(), the headers with skb_copy_from_linear_data(), and
 * the payload block with skb_copy_bits(). Each fragment is charged to the
 * owning socket with skb_set_owner_w().
 *
 * IPSTATS_MIB_FRAGCREATES, IPSTATS_MIB_FRAGOKS and IPSTATS_MIB_FRAGFAILS
 * are updated on both paths.
 */
553 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
555 struct sk_buff *frag;
556 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
557 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
558 struct ipv6hdr *tmp_hdr;
560 unsigned int mtu, hlen, left, len;
563 int ptr, offset = 0, err=0;
564 u8 *prevhdr, nexthdr = 0;
565 struct net *net = dev_net(skb_dst(skb)->dev);
567 hlen = ip6_find_1stfragopt(skb, &prevhdr);
570 mtu = ip6_skb_dst_mtu(skb);
572 /* We must not fragment if the socket is set to force MTU discovery
573 * or if the skb it not generated by a local socket.
575 if (unlikely(!skb->ignore_df && skb->len > mtu) ||
576 (IP6CB(skb)->frag_max_size &&
577 IP6CB(skb)->frag_max_size > mtu)) {
578 if (skb->sk && dst_allfrag(skb_dst(skb)))
579 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
581 skb->dev = skb_dst(skb)->dev;
582 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
583 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
584 IPSTATS_MIB_FRAGFAILS);
589 if (np && np->frag_size < mtu) {
593 mtu -= hlen + sizeof(struct frag_hdr);
595 if (skb_has_frag_list(skb)) {
596 int first_len = skb_pagelen(skb);
597 struct sk_buff *frag2;
599 if (first_len - hlen > mtu ||
600 ((first_len - hlen) & 7) ||
604 skb_walk_frags(skb, frag) {
605 /* Correct geometry. */
606 if (frag->len > mtu ||
607 ((frag->len & 7) && frag->next) ||
608 skb_headroom(frag) < hlen)
609 goto slow_path_clean;
611 /* Partially cloned skb? */
612 if (skb_shared(frag))
613 goto slow_path_clean;
618 frag->destructor = sock_wfree;
620 skb->truesize -= frag->truesize;
625 frag = skb_shinfo(skb)->frag_list;
626 skb_frag_list_init(skb);
629 *prevhdr = NEXTHDR_FRAGMENT;
630 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
632 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
633 IPSTATS_MIB_FRAGFAILS);
637 __skb_pull(skb, hlen);
638 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
639 __skb_push(skb, hlen);
640 skb_reset_network_header(skb);
641 memcpy(skb_network_header(skb), tmp_hdr, hlen);
643 ipv6_select_ident(fh, rt);
644 fh->nexthdr = nexthdr;
646 fh->frag_off = htons(IP6_MF);
647 frag_id = fh->identification;
649 first_len = skb_pagelen(skb);
650 skb->data_len = first_len - skb_headlen(skb);
651 skb->len = first_len;
652 ipv6_hdr(skb)->payload_len = htons(first_len -
653 sizeof(struct ipv6hdr));
658 /* Prepare header of the next frame,
659 * before previous one went down. */
661 frag->ip_summed = CHECKSUM_NONE;
662 skb_reset_transport_header(frag);
663 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
664 __skb_push(frag, hlen);
665 skb_reset_network_header(frag);
666 memcpy(skb_network_header(frag), tmp_hdr,
668 offset += skb->len - hlen - sizeof(struct frag_hdr);
669 fh->nexthdr = nexthdr;
671 fh->frag_off = htons(offset);
672 if (frag->next != NULL)
673 fh->frag_off |= htons(IP6_MF);
674 fh->identification = frag_id;
675 ipv6_hdr(frag)->payload_len =
677 sizeof(struct ipv6hdr));
678 ip6_copy_metadata(frag, skb);
683 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
684 IPSTATS_MIB_FRAGCREATES);
697 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
698 IPSTATS_MIB_FRAGOKS);
709 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
710 IPSTATS_MIB_FRAGFAILS);
715 skb_walk_frags(skb, frag2) {
719 frag2->destructor = NULL;
720 skb->truesize += frag2->truesize;
725 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
726 skb_checksum_help(skb))
729 left = skb->len - hlen; /* Space per frame */
730 ptr = hlen; /* Where to start from */
733 * Fragment the datagram.
736 *prevhdr = NEXTHDR_FRAGMENT;
737 hroom = LL_RESERVED_SPACE(rt->dst.dev);
738 troom = rt->dst.dev->needed_tailroom;
741 * Keep copying data until we run out.
745 /* IF: it doesn't fit, use 'mtu' - the data space left */
748 /* IF: we are not sending up to and including the packet end
749 then align the next start on an eight byte boundary */
757 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
758 hroom + troom, GFP_ATOMIC)) == NULL) {
759 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
760 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
761 IPSTATS_MIB_FRAGFAILS);
767 * Set up data on packet
770 ip6_copy_metadata(frag, skb);
771 skb_reserve(frag, hroom);
772 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
773 skb_reset_network_header(frag);
774 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
775 frag->transport_header = (frag->network_header + hlen +
776 sizeof(struct frag_hdr));
779 * Charge the memory for the fragment to any owner
783 skb_set_owner_w(frag, skb->sk);
786 * Copy the packet header into the new buffer.
788 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
791 * Build fragment header.
793 fh->nexthdr = nexthdr;
796 ipv6_select_ident(fh, rt);
797 frag_id = fh->identification;
799 fh->identification = frag_id;
802 * Copy a block of the IP datagram.
804 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
808 fh->frag_off = htons(offset);
810 fh->frag_off |= htons(IP6_MF);
811 ipv6_hdr(frag)->payload_len = htons(frag->len -
812 sizeof(struct ipv6hdr));
818 * Put this fragment into the sending queue.
824 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
825 IPSTATS_MIB_FRAGCREATES);
827 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
828 IPSTATS_MIB_FRAGOKS);
833 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
834 IPSTATS_MIB_FRAGFAILS);
/*
 * Test whether a cached route key fails to cover a flow address.
 *
 * @rt_key:     destination or source key of the cached route
 * @fl_addr:    address taken from the flow being sent
 * @addr_cache: last address remembered for the socket, may be NULL
 *
 * Returns non-zero (route must not be reused) only when BOTH of these hold:
 *  - the key is not a /128 host entry equal to @fl_addr, and
 *  - there is no cached address, or the cached address differs from
 *    @fl_addr.
 * Returns 0 when either the host key or the cached address vouches for
 * @fl_addr.
 */
839 static inline int ip6_rt_check(const struct rt6key *rt_key,
840 const struct in6_addr *fl_addr,
841 const struct in6_addr *addr_cache)
843 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
844 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * Validate a socket's cached dst against the flow about to be sent.
 *
 * @sk:  socket owning the cache; supplies daddr_cache (and saddr_cache
 *       when CONFIG_IPV6_SUBTREES is enabled)
 * @dst: cached route to validate
 * @fl6: flow describing the packet
 *
 * A dst whose ops are not AF_INET6 is treated separately. For IPv6 routes
 * the cache is rejected when ip6_rt_check() fails for the destination key,
 * when it fails for the source key (CONFIG_IPV6_SUBTREES only), or when the
 * flow requests an output interface different from dst->dev.
 *
 * NOTE(review): what happens to a rejected dst and the returned value are
 * not visible here; presumably the dst is released and NULL is returned so
 * that the caller performs a fresh lookup - confirm.
 */
847 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
848 struct dst_entry *dst,
849 const struct flowi6 *fl6)
851 struct ipv6_pinfo *np = inet6_sk(sk);
857 if (dst->ops->family != AF_INET6) {
862 rt = (struct rt6_info *)dst;
863 /* Yes, checking route validity in not connected
864 * case is not very simple. Take into account,
865 * that we do not support routing by source, TOS,
866 * and MSG_DONTROUTE --ANK (980726)
868 * 1. ip6_rt_check(): If route was host route,
869 * check that cached destination is current.
870 * If it is network route, we still may
871 * check its validity using saved pointer
872 * to the last used address: daddr_cache.
873 * We do not want to save whole address now,
874 * (because main consumer of this service
875 * is tcp, which has not this problem),
876 * so that the last trick works only on connected
878 * 2. oif also should be the same.
880 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
881 #ifdef CONFIG_IPV6_SUBTREES
882 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
884 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * Common back end of the ip6_dst_lookup*() helpers.
 *
 * @sk:  socket the lookup is performed for; may be NULL as far as source
 *       address preferences are concerned (srcprefs falls back to 0)
 * @dst: in/out pointer to the route
 * @fl6: flow to look up; its saddr is filled in when unspecified
 *
 * The route comes from ip6_route_output(); a route carrying an error makes
 * the function leave through out_err_release. When the flow has no source
 * address, ip6_route_get_saddr() selects one for the route and destination.
 *
 * CONFIG_IPV6_OPTIMISTIC_DAD: if the next hop has a neighbour entry that
 * is not NUD_VALID and the flow's source address is flagged
 * IFA_F_OPTIMISTIC, the route is replaced by the result of a second lookup
 * on a copy of the flow whose destination is zeroed, i.e. the route of the
 * default router.
 *
 * An -ENETUNREACH outcome is accounted as IPSTATS_MIB_OUTNOROUTES.
 */
893 static int ip6_dst_lookup_tail(struct sock *sk,
894 struct dst_entry **dst, struct flowi6 *fl6)
896 struct net *net = sock_net(sk);
897 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
904 *dst = ip6_route_output(net, sk, fl6);
906 if ((err = (*dst)->error))
907 goto out_err_release;
909 if (ipv6_addr_any(&fl6->saddr)) {
910 struct rt6_info *rt = (struct rt6_info *) *dst;
911 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
912 sk ? inet6_sk(sk)->srcprefs : 0,
915 goto out_err_release;
918 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
920 * Here if the dst entry we've looked up
921 * has a neighbour entry that is in the INCOMPLETE
922 * state and the src address from the flow is
923 * marked as OPTIMISTIC, we release the found
924 * dst entry and replace it instead with the
925 * dst entry of the nexthop router
927 rt = (struct rt6_info *) *dst;
929 n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
930 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
931 rcu_read_unlock_bh();
934 struct inet6_ifaddr *ifp;
935 struct flowi6 fl_gw6;
938 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
941 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
947 * We need to get the dst entry for the
948 * default router instead
951 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
952 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
953 *dst = ip6_route_output(net, sk, &fl_gw6);
954 if ((err = (*dst)->error))
955 goto out_err_release;
963 if (err == -ENETUNREACH)
964 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
971 * ip6_dst_lookup - perform route lookup on flow
972 * @sk: socket which provides route info
973 * @dst: pointer to dst_entry * for result
974 * @fl6: flow to lookup
976 * This function performs a route lookup on the given flow.
978 * It returns zero on success, or a standard errno code on error.
/*
 * Implementation note: the lookup itself is done by ip6_dst_lookup_tail(),
 * whose return value is handed back to the caller unchanged. No IPsec
 * (xfrm) lookup is applied on this path. Exported to GPL modules.
 */
980 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
983 return ip6_dst_lookup_tail(sk, dst, fl6);
985 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
988 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
989 * @sk: socket which provides route info
990 * @fl6: flow to lookup
991 * @final_dst: final destination address for ipsec lookup
993 * This function performs a route lookup on the given flow.
995 * It returns a valid dst pointer on success, or a pointer encoded as an error code.
/*
 * Implementation note: the route is resolved with ip6_dst_lookup_tail()
 * starting from a NULL dst; an error from that step is returned as
 * ERR_PTR(err). The flow's destination is then overwritten with *final_dst
 * before the route is passed through xfrm_lookup(), so the IPsec policy
 * lookup sees the final destination rather than an intermediate hop.
 */
998 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
999 const struct in6_addr *final_dst)
1001 struct dst_entry *dst = NULL;
1004 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1006 return ERR_PTR(err);
1008 fl6->daddr = *final_dst;
1010 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1012 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1015 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1016 * @sk: socket which provides the dst cache and route info
1017 * @fl6: flow to lookup
1018 * @final_dst: final destination address for ipsec lookup
1020 * This function performs a route lookup on the given flow with the
1021 * possibility of using the cached route in the socket if it is valid.
1022 * It will take the socket dst lock when operating on the dst cache.
1023 * As a result, this function can only be used in process context.
1025 * It returns a valid dst pointer on success, or a pointer encoded as an error code.
/*
 * Implementation note: the socket's cached route is fetched with
 * sk_dst_check() using the stored dst_cookie and then vetted against the
 * flow by ip6_sk_dst_check(). The result is handed to
 * ip6_dst_lookup_tail(); an error from that step is returned as
 * ERR_PTR(err). As in ip6_dst_lookup_flow(), the flow's destination is
 * replaced by *final_dst before xfrm_lookup() runs.
 */
1028 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1029 const struct in6_addr *final_dst)
1031 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1034 dst = ip6_sk_dst_check(sk, dst, fl6);
1036 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1038 return ERR_PTR(err);
1040 fl6->daddr = *final_dst;
1042 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1044 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * Append payload for a device that offers UDP fragmentation offload: the
 * whole datagram is kept in a single skb and the device is told how to
 * cut it.
 *
 * @sk:            socket whose write queue holds the datagram
 * @getfrag:       callback that copies payload bytes from @from
 * @from:          opaque payload source passed to @getfrag
 * @length:        bytes to append, transport header included
 * @hh_len:        link-layer headroom to reserve
 * @fragheaderlen: length of the IPv6 header plus unfragmentable part
 * @transhdrlen:   transport header length
 * @mtu:           path MTU used to derive the segment size
 * @flags:         send flags; MSG_DONTWAIT is passed to the allocator
 * @rt:            route, used to pick the fragment identification
 *
 * With an empty write queue a new skb is allocated
 * (sock_alloc_send_skb), hh_len bytes are reserved, space for the IPv6 and
 * transport headers is put, the network and transport header offsets are
 * set, and the skb is queued. An already queued GSO skb is recognised by
 * skb_is_gso().
 *
 * The skb is marked CHECKSUM_PARTIAL and SKB_GSO_UDP. gso_size is the
 * largest multiple of 8 that fits in
 * mtu - fragheaderlen - sizeof(struct frag_hdr), and the fragment
 * identification from ipv6_select_ident() is stored in ip6_frag_id.
 *
 * Returns the result of skb_append_datato_frags(), which appends
 * length - transhdrlen payload bytes.
 */
1046 static inline int ip6_ufo_append_data(struct sock *sk,
1047 int getfrag(void *from, char *to, int offset, int len,
1048 int odd, struct sk_buff *skb),
1049 void *from, int length, int hh_len, int fragheaderlen,
1050 int transhdrlen, int mtu,unsigned int flags,
1051 struct rt6_info *rt)
1054 struct sk_buff *skb;
1055 struct frag_hdr fhdr;
1058 /* There is support for UDP large send offload by network
1059 * device, so create one single skb packet containing complete
1062 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1063 skb = sock_alloc_send_skb(sk,
1064 hh_len + fragheaderlen + transhdrlen + 20,
1065 (flags & MSG_DONTWAIT), &err);
1069 /* reserve space for Hardware header */
1070 skb_reserve(skb, hh_len);
1072 /* create space for UDP/IP header */
1073 skb_put(skb,fragheaderlen + transhdrlen);
1075 /* initialize network header pointer */
1076 skb_reset_network_header(skb);
1078 /* initialize protocol header pointer */
1079 skb->transport_header = skb->network_header + fragheaderlen;
1081 skb->protocol = htons(ETH_P_IPV6);
1084 __skb_queue_tail(&sk->sk_write_queue, skb);
1085 } else if (skb_is_gso(skb)) {
1089 skb->ip_summed = CHECKSUM_PARTIAL;
1090 /* Specify the length of each IPv6 datagram fragment.
1091 * It has to be a multiple of 8.
1093 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1094 sizeof(struct frag_hdr)) & ~7;
1095 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1096 ipv6_select_ident(&fhdr, rt);
1097 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1100 return skb_append_datato_frags(sk, skb, getfrag, from,
1101 (length - transhdrlen));
/*
 * Duplicate an IPv6 options header.
 *
 * Returns NULL when @src is NULL. Otherwise returns a kmemdup() copy of
 * (src->hdrlen + 1) * 8 bytes allocated with the given gfp flags; the copy
 * is NULL if the allocation fails, so callers must tell the two NULL cases
 * apart by checking @src themselves.
 */
1104 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1107 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Duplicate an IPv6 routing header.
 *
 * Returns NULL when @src is NULL. Otherwise returns a kmemdup() copy of
 * (src->hdrlen + 1) * 8 bytes allocated with the given gfp flags; the copy
 * is NULL if the allocation fails.
 */
1110 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1113 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Recompute the usable MTU and the maximum fragment length while
 * ip6_append_data() moves from one skb to the next.
 *
 * @mtu:           in/out, MTU to use for the fragment being started
 * @fragheaderlen: length of the headers replicated in every fragment
 * @skb:           current tail skb, or NULL - NOTE(review): its use is
 *                 not visible here
 * @rt:            route; only adjusted when it is not a DST_XFRM_TUNNEL
 * @orig_mtu:      MTU recorded when the cork was set up
 *
 * For the first fragment the route's header_len is reserved, so the MTU
 * becomes orig_mtu - rt->dst.header_len. For later fragments that header
 * space counts as data space. *maxfraglen is then set to the payload
 * rounded down to a multiple of 8, plus fragheaderlen, minus the size of
 * the fragment header.
 */
1116 static void ip6_append_data_mtu(unsigned int *mtu,
1118 unsigned int fragheaderlen,
1119 struct sk_buff *skb,
1120 struct rt6_info *rt,
1121 unsigned int orig_mtu)
1123 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1125 /* first fragment, reserve header_len */
1126 *mtu = orig_mtu - rt->dst.header_len;
1130 * this fragment is not first, the headers
1131 * space is regarded as data space.
1135 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1136 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * Queue payload on sk->sk_write_queue, shaping it into skbs that each
 * correspond to one future IPv6 fragment. The queued data is sent later by
 * ip6_push_pending_frames() or dropped by ip6_flush_pending_frames().
 *
 * @sk:          sending socket
 * @getfrag:     callback copying payload bytes out of @from
 * @from:        opaque payload source for @getfrag
 * @length:      number of bytes to append
 * @transhdrlen: transport header length
 * @hlimit:      hop limit, stored in the cork on the first call
 * @tclass:      traffic class, stored in the cork on the first call
 * @opt:         extension header options, deep-copied into the cork
 * @fl6:         flow, copied into the cork on the first call
 * @rt:          route, recorded as cork->dst on the first call
 * @flags:       send flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT are tested)
 * @dontfrag:    when set, UDP and RAW sockets get an rxpmtu notification
 *               instead of fragmentation for over-sized data
 *
 * First call (empty write queue): the options and each of dst0opt,
 * dst1opt, hopopt and srcrt are duplicated into np->cork.opt. The fragment
 * size is taken from the device MTU when np->pmtudisc >=
 * IPV6_PMTUDISC_PROBE and from the dst MTU otherwise, may be lowered to
 * np->frag_size, and is stored in cork->fragsize. IPCORK_ALLFRAG is set
 * when dst_allfrag() holds. Later calls reuse the corked route, flow and
 * fragment size.
 *
 * Size checks: with @dontfrag on UDP/RAW sockets, data beyond
 * mtu - headersize triggers ipv6_local_rxpmtu(); data beyond
 * maxnonfragsize - headersize triggers ipv6_local_error(EMSGSIZE).
 *
 * UFO: UDP sockets on NETIF_F_UFO devices use ip6_ufo_append_data() when
 * the data exceeds the MTU or the tail skb is already GSO.
 *
 * Main loop (while length > 0): the tail skb is filled up to mtu or
 * maxfraglen. When it has no room, a new skb is allocated
 * (sock_alloc_send_skb or sock_wmalloc) and any bytes of the previous skb
 * beyond maxfraglen ("fraggap") are moved into it with the checksums of
 * both skbs adjusted. Devices without NETIF_F_SG receive data in the
 * linear area via skb_put(); otherwise data is placed in page fragments
 * obtained from sk_page_frag(), with sk_wmem_alloc charged for the bytes
 * added.
 *
 * Error path: cork->length is reduced by the bytes not queued and
 * IPSTATS_MIB_OUTDISCARDS is counted.
 */
1140 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1141 int offset, int len, int odd, struct sk_buff *skb),
1142 void *from, int length, int transhdrlen,
1143 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1144 struct rt6_info *rt, unsigned int flags, int dontfrag)
1146 struct inet_sock *inet = inet_sk(sk);
1147 struct ipv6_pinfo *np = inet6_sk(sk);
1148 struct inet_cork *cork;
1149 struct sk_buff *skb, *skb_prev = NULL;
1150 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1159 if (flags&MSG_PROBE)
1161 cork = &inet->cork.base;
1162 if (skb_queue_empty(&sk->sk_write_queue)) {
1167 if (WARN_ON(np->cork.opt))
1170 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1171 if (unlikely(np->cork.opt == NULL))
1174 np->cork.opt->tot_len = opt->tot_len;
1175 np->cork.opt->opt_flen = opt->opt_flen;
1176 np->cork.opt->opt_nflen = opt->opt_nflen;
1178 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1180 if (opt->dst0opt && !np->cork.opt->dst0opt)
1183 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1185 if (opt->dst1opt && !np->cork.opt->dst1opt)
1188 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1190 if (opt->hopopt && !np->cork.opt->hopopt)
1193 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1195 if (opt->srcrt && !np->cork.opt->srcrt)
1198 /* need source address above miyazawa*/
1201 cork->dst = &rt->dst;
1202 inet->cork.fl.u.ip6 = *fl6;
1203 np->cork.hop_limit = hlimit;
1204 np->cork.tclass = tclass;
1205 if (rt->dst.flags & DST_XFRM_TUNNEL)
1206 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1207 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1209 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1210 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1211 if (np->frag_size < mtu) {
1213 mtu = np->frag_size;
1215 cork->fragsize = mtu;
1216 if (dst_allfrag(rt->dst.path))
1217 cork->flags |= IPCORK_ALLFRAG;
1219 exthdrlen = (opt ? opt->opt_flen : 0);
1220 length += exthdrlen;
1221 transhdrlen += exthdrlen;
1222 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1224 rt = (struct rt6_info *)cork->dst;
1225 fl6 = &inet->cork.fl.u.ip6;
1230 mtu = cork->fragsize;
1234 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1236 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1237 (opt ? opt->opt_nflen : 0);
1238 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1239 sizeof(struct frag_hdr);
1241 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1242 unsigned int maxnonfragsize, headersize;
1244 headersize = sizeof(struct ipv6hdr) +
1245 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1246 (dst_allfrag(&rt->dst) ?
1247 sizeof(struct frag_hdr) : 0) +
1248 rt->rt6i_nfheader_len;
1250 if (ip6_sk_ignore_df(sk))
1251 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1253 maxnonfragsize = mtu;
1255 /* dontfrag active */
1256 if ((cork->length + length > mtu - headersize) && dontfrag &&
1257 (sk->sk_protocol == IPPROTO_UDP ||
1258 sk->sk_protocol == IPPROTO_RAW)) {
1259 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1260 sizeof(struct ipv6hdr));
1264 if (cork->length + length > maxnonfragsize - headersize) {
1266 ipv6_local_error(sk, EMSGSIZE, fl6,
1268 sizeof(struct ipv6hdr));
1273 /* For UDP, check if TX timestamp is enabled */
1274 if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW)
1275 sock_tx_timestamp(sk, &tx_flags);
1278 * Let's try using as much space as possible.
1279 * Use MTU if total length of the message fits into the MTU.
1280 * Otherwise, we need to reserve fragment header and
1281 * fragment alignment (= 8-15 octects, in total).
1283 * Note that we may need to "move" the data from the tail of
1284 * of the buffer to the new fragment when we split
1287 * FIXME: It may be fragmented into multiple chunks
1288 * at once if non-fragmentable extension headers
1293 skb = skb_peek_tail(&sk->sk_write_queue);
1294 cork->length += length;
1295 if (((length > mtu) ||
1296 (skb && skb_is_gso(skb))) &&
1297 (sk->sk_protocol == IPPROTO_UDP) &&
1298 (rt->dst.dev->features & NETIF_F_UFO)) {
1299 err = ip6_ufo_append_data(sk, getfrag, from, length,
1300 hh_len, fragheaderlen,
1301 transhdrlen, mtu, flags, rt);
1310 while (length > 0) {
1311 /* Check if the remaining data fits into current packet. */
1312 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1314 copy = maxfraglen - skb->len;
1318 unsigned int datalen;
1319 unsigned int fraglen;
1320 unsigned int fraggap;
1321 unsigned int alloclen;
1323 /* There's no room in the current skb */
1325 fraggap = skb->len - maxfraglen;
1328 /* update mtu and maxfraglen if necessary */
1329 if (skb == NULL || skb_prev == NULL)
1330 ip6_append_data_mtu(&mtu, &maxfraglen,
1331 fragheaderlen, skb, rt,
1337 * If remaining data exceeds the mtu,
1338 * we know we need more fragment(s).
1340 datalen = length + fraggap;
1342 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1343 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1344 if ((flags & MSG_MORE) &&
1345 !(rt->dst.dev->features&NETIF_F_SG))
1348 alloclen = datalen + fragheaderlen;
1350 alloclen += dst_exthdrlen;
1352 if (datalen != length + fraggap) {
1354 * this is not the last fragment, the trailer
1355 * space is regarded as data space.
1357 datalen += rt->dst.trailer_len;
1360 alloclen += rt->dst.trailer_len;
1361 fraglen = datalen + fragheaderlen;
1364 * We just reserve space for fragment header.
1365 * Note: this may be overallocation if the message
1366 * (without MSG_MORE) fits into the MTU.
1368 alloclen += sizeof(struct frag_hdr);
1371 skb = sock_alloc_send_skb(sk,
1373 (flags & MSG_DONTWAIT), &err);
1376 if (atomic_read(&sk->sk_wmem_alloc) <=
1378 skb = sock_wmalloc(sk,
1379 alloclen + hh_len, 1,
1381 if (unlikely(skb == NULL))
1387 * Fill in the control structures
1389 skb->protocol = htons(ETH_P_IPV6);
1390 skb->ip_summed = CHECKSUM_NONE;
1392 /* reserve for fragmentation and ipsec header */
1393 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1396 /* Only the initial fragment is time stamped */
1397 skb_shinfo(skb)->tx_flags = tx_flags;
1401 * Find where to start putting bytes
1403 data = skb_put(skb, fraglen);
1404 skb_set_network_header(skb, exthdrlen);
1405 data += fragheaderlen;
1406 skb->transport_header = (skb->network_header +
1409 skb->csum = skb_copy_and_csum_bits(
1410 skb_prev, maxfraglen,
1411 data + transhdrlen, fraggap, 0);
1412 skb_prev->csum = csum_sub(skb_prev->csum,
1415 pskb_trim_unique(skb_prev, maxfraglen);
1417 copy = datalen - transhdrlen - fraggap;
1423 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1430 length -= datalen - fraggap;
1436 * Put the packet on the pending queue
1438 __skb_queue_tail(&sk->sk_write_queue, skb);
1445 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1449 if (getfrag(from, skb_put(skb, copy),
1450 offset, copy, off, skb) < 0) {
1451 __skb_trim(skb, off);
1456 int i = skb_shinfo(skb)->nr_frags;
1457 struct page_frag *pfrag = sk_page_frag(sk);
1460 if (!sk_page_frag_refill(sk, pfrag))
1463 if (!skb_can_coalesce(skb, i, pfrag->page,
1466 if (i == MAX_SKB_FRAGS)
1469 __skb_fill_page_desc(skb, i, pfrag->page,
1471 skb_shinfo(skb)->nr_frags = ++i;
1472 get_page(pfrag->page);
1474 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1476 page_address(pfrag->page) + pfrag->offset,
1477 offset, copy, skb->len, skb) < 0)
1480 pfrag->offset += copy;
1481 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1483 skb->data_len += copy;
1484 skb->truesize += copy;
1485 atomic_add(copy, &sk->sk_wmem_alloc);
1496 cork->length -= length;
1497 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1500 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * Tear down the cork state built up by ip6_append_data().
 *
 * The duplicated option headers (dst0opt, dst1opt, hopopt, srcrt) and the
 * option container itself are freed and np->cork.opt is reset to NULL.
 * If a route is corked, its reference is dropped with dst_release(), the
 * pointer is cleared and IPCORK_ALLFRAG is removed from the cork flags.
 * Finally the corked flow is zeroed.
 */
1502 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1505 kfree(np->cork.opt->dst0opt);
1506 kfree(np->cork.opt->dst1opt);
1507 kfree(np->cork.opt->hopopt);
1508 kfree(np->cork.opt->srcrt);
1509 kfree(np->cork.opt);
1510 np->cork.opt = NULL;
1513 if (inet->cork.base.dst) {
1514 dst_release(inet->cork.base.dst);
1515 inet->cork.base.dst = NULL;
1516 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1518 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * Turn everything queued on sk->sk_write_queue into one IPv6 packet and
 * send it.
 *
 * Assembly: the first queued skb becomes the head. Every following skb has
 * its network header pulled off and is chained onto the head's frag_list;
 * its len and truesize are added to the head (len also to data_len) and
 * its destructor is cleared, so only the head remains charged to the
 * socket.
 *
 * Header construction: skb->ignore_df is taken from ip6_sk_ignore_df() to
 * allow local fragmentation. Fragmentable and non-fragmentable extension
 * headers from the corked options are pushed (the latter may change
 * final_dst), then the IPv6 header is pushed and filled from the cork:
 * traffic class, flow label, hop limit, next header, source address and
 * final destination. Priority and mark come from the socket, and a
 * reference to the corked route is attached.
 *
 * Accounting: IPSTATS_MIB_OUT is updated with the packet length; for
 * ICMPv6 the per-type and total output message counters are updated too.
 *
 * Transmission uses ip6_local_out(); its result is passed through
 * net_xmit_errno() on one path. The cork is released with
 * ip6_cork_release(), and IPSTATS_MIB_OUTDISCARDS is counted on the error
 * path.
 */
1521 int ip6_push_pending_frames(struct sock *sk)
1523 struct sk_buff *skb, *tmp_skb;
1524 struct sk_buff **tail_skb;
1525 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1526 struct inet_sock *inet = inet_sk(sk);
1527 struct ipv6_pinfo *np = inet6_sk(sk);
1528 struct net *net = sock_net(sk);
1529 struct ipv6hdr *hdr;
1530 struct ipv6_txoptions *opt = np->cork.opt;
1531 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1532 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1533 unsigned char proto = fl6->flowi6_proto;
1536 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1538 tail_skb = &(skb_shinfo(skb)->frag_list);
1540 /* move skb->data to ip header from ext header */
1541 if (skb->data < skb_network_header(skb))
1542 __skb_pull(skb, skb_network_offset(skb));
1543 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1544 __skb_pull(tmp_skb, skb_network_header_len(skb));
1545 *tail_skb = tmp_skb;
1546 tail_skb = &(tmp_skb->next);
1547 skb->len += tmp_skb->len;
1548 skb->data_len += tmp_skb->len;
1549 skb->truesize += tmp_skb->truesize;
1550 tmp_skb->destructor = NULL;
1554 /* Allow local fragmentation. */
1555 skb->ignore_df = ip6_sk_ignore_df(sk);
1557 *final_dst = fl6->daddr;
1558 __skb_pull(skb, skb_network_header_len(skb));
1559 if (opt && opt->opt_flen)
1560 ipv6_push_frag_opts(skb, opt, &proto);
1561 if (opt && opt->opt_nflen)
1562 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1564 skb_push(skb, sizeof(struct ipv6hdr));
1565 skb_reset_network_header(skb);
1566 hdr = ipv6_hdr(skb);
1568 ip6_flow_hdr(hdr, np->cork.tclass,
1569 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1570 np->autoflowlabel));
1571 hdr->hop_limit = np->cork.hop_limit;
1572 hdr->nexthdr = proto;
1573 hdr->saddr = fl6->saddr;
1574 hdr->daddr = *final_dst;
1576 skb->priority = sk->sk_priority;
1577 skb->mark = sk->sk_mark;
1579 skb_dst_set(skb, dst_clone(&rt->dst));
1580 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1581 if (proto == IPPROTO_ICMPV6) {
1582 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1584 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1585 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1588 err = ip6_local_out(skb);
1591 err = net_xmit_errno(err);
1597 ip6_cork_release(inet, np);
1600 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1603 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * Throw away everything queued by ip6_append_data() without sending it.
 *
 * Queued skbs are removed from the tail of sk->sk_write_queue one at a
 * time, with IPSTATS_MIB_OUTDISCARDS counted against the interface of the
 * skb's dst. Once the queue is empty the cork state is released with
 * ip6_cork_release().
 */
1605 void ip6_flush_pending_frames(struct sock *sk)
1607 struct sk_buff *skb;
1609 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1611 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1612 IPSTATS_MIB_OUTDISCARDS);
1616 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1618 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);