/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

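/* Final hop of the output path: resolve (creating if necessary) the
 * neighbour entry for the route's nexthop and hand the skb to the
 * neighbour layer for transmission.  Multicast packets are looped back
 * to local listeners here when needed, and a lightweight tunnel
 * attached to the dst may take over transmission entirely.
 */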
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                ret = dst_neigh_output(dst, neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

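/* Choose between plain transmission and fragmentation: packets larger
 * than the path MTU that are not GSO, routes that require
 * fragmentation of all packets (dst_allfrag), and packets reassembled
 * by conntrack defrag (frag_max_size) go through ip6_fragment().
 */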
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

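/* Post-routing output entry point: drop everything if IPv6 is
 * administratively disabled on the egress device, otherwise traverse
 * the NF_INET_POST_ROUTING hook (skipped for rerouted skbs) on the way
 * to ip6_finish_output().
 */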
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel, fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

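/* Deliver a Router Alert packet to every raw socket that registered
 * for this alert value (ip6_ra_control(), IPV6_ROUTER_ALERT).  Earlier
 * matches receive clones; the last match consumes the skb itself.
 * Returns 1 if the packet was delivered to at least one socket.
 */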
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

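/* Classify a packet whose destination we proxy NDP for: unicast
 * neighbour discovery ICMPv6 goes to local input (1), link-local
 * destinations are refused (-1), anything else may be forwarded (0).
 */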
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined
                         * to the proxied address are passed to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

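/* The forwarding path: check that forwarding is allowed for this
 * packet, handle the Router Alert and proxy-NDP special cases, enforce
 * hop limit and path MTU, send redirects where appropriate, then
 * decrement hop_limit and run the NF_INET_FORWARD hook.
 */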
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT do any processing on RA packets, pushing
         *      them to user level AS IS without any warranty that the
         *      application will be able to interpret them. The reason
         *      is that we cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything. Defragmentation would
         *      also be a mistake: RA packets must not be fragmented,
         *      because there is no guarantee that different fragments
         *      will follow the same path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

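/* Copy the per-packet metadata (type, priority, dst, mark, conntrack
 * and security state) from the original skb onto a freshly built
 * fragment, so every fragment is routed and classified the same way.
 */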
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

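/* Fragment a packet that exceeds the path MTU.  The fast path reuses
 * an existing frag_list whose members are already correctly sized and
 * merely prepends IPv6/fragment headers to each; otherwise the slow
 * path allocates a new skb per fragment and copies the payload out in
 * 8-byte-aligned chunks.  Every fragment is handed to @output.
 */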
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0)        {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        skb->dev = skb_dst(skb)->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

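/* Validate a socket's cached dst against the flow: the cached route
 * must still match the flow's destination (and, with subtrees, source)
 * per ip6_rt_check(), and the outgoing interface must agree; otherwise
 * the dst is released so the caller falls back to a full lookup.
 */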
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we may still
         *    check its validity using a saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif should also be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

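/* Common tail of the route lookup helpers below: pick a source address
 * for the flow if it does not have one yet (including the
 * source-specific-routing retry described inline), do the actual
 * routing lookup, and, with optimistic DAD, redirect to the default
 * router's dst when the nexthop neighbour is unresolved and our
 * source address is still optimistic.
 */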
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
            (!*dst || !(*dst)->error)) {
                err = l3mdev_get_saddr6(net, sk, fl6);
                if (err)
                        goto out_err;
        }

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
out_err:
        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to do the lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (!fl6->flowi6_oif)
                fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

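/* UDP fragmentation offload path for append_data: rather than building
 * fragments in software, accumulate the whole datagram into one GSO
 * skb (gso_size is the largest 8-byte multiple that fits the MTU after
 * the fragment header) and let the device or the GSO layer segment it
 * at transmit time.
 */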
static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)

{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by network
         * device, so create one single skb packet containing complete
         * udp datagram
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

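/* Recompute mtu/maxfraglen while appending data: the first fragment
 * must reserve dst->header_len, later fragments may use that room for
 * payload.  Skipped for DST_XFRM_TUNNEL routes, whose MTU is derived
 * differently in ip6_setup_cork().
 */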
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

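/* Set up the cork for a corked send: duplicate the tx options so the
 * caller's copy may be freed, pin the route, and record the hop limit,
 * traffic class and fragment size that __ip6_append_data() will work
 * against.
 */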
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

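/* Workhorse behind ip6_append_data() and ip6_make_skb(): append
 * @length bytes from @from (copied via @getfrag) onto @queue, growing
 * the tail skb or allocating new ones sized against the corked MTU,
 * having first chosen between UFO, CHECKSUM_PARTIAL and plain software
 * fragmentation.
 */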
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                ipv6_local_error(sk, EMSGSIZE, fl6,
                                 mtu - headersize +
                                 sizeof(struct ipv6hdr));
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length < mtu - headersize &&
            !(flags & MSG_MORE) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

1374         cork->length += length;
1375         if (((length > mtu) ||
1376              (skb && skb_is_gso(skb))) &&
1377             (sk->sk_protocol == IPPROTO_UDP) &&
1378             (rt->dst.dev->features & NETIF_F_UFO) &&
1379             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1380                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1381                                           hh_len, fragheaderlen, exthdrlen,
1382                                           transhdrlen, mtu, flags, fl6);
1383                 if (err)
1384                         goto error;
1385                 return 0;
1386         }
1387
1388         if (!skb)
1389                 goto alloc_new_skb;
1390
1391         while (length > 0) {
1392                 /* Check if the remaining data fits into current packet. */
1393                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1394                 if (copy < length)
1395                         copy = maxfraglen - skb->len;
1396
1397                 if (copy <= 0) {
1398                         char *data;
1399                         unsigned int datalen;
1400                         unsigned int fraglen;
1401                         unsigned int fraggap;
1402                         unsigned int alloclen;
1403 alloc_new_skb:
1404                         /* There's no room in the current skb */
1405                         if (skb)
1406                                 fraggap = skb->len - maxfraglen;
1407                         else
1408                                 fraggap = 0;
1409                         /* update mtu and maxfraglen if necessary */
1410                         if (!skb || !skb_prev)
1411                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1412                                                     fragheaderlen, skb, rt,
1413                                                     orig_mtu);
1414
1415                         skb_prev = skb;
1416
1417                         /*
1418                          * If remaining data exceeds the mtu,
1419                          * we know we need more fragment(s).
1420                          */
1421                         datalen = length + fraggap;
1422
1423                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1424                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
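                        /* Without scatter-gather a later append can only
                         * extend this skb's linear area, so reserve a
                         * full MTU up front while MSG_MORE promises more
                         * data.
                         */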
1425                         if ((flags & MSG_MORE) &&
1426                             !(rt->dst.dev->features&NETIF_F_SG))
1427                                 alloclen = mtu;
1428                         else
1429                                 alloclen = datalen + fragheaderlen;
1430
1431                         alloclen += dst_exthdrlen;
1432
1433                         if (datalen != length + fraggap) {
1434                                 /*
1435                                  * this is not the last fragment, the trailer
1436                                  * space is regarded as data space.
1437                                  */
1438                                 datalen += rt->dst.trailer_len;
1439                         }
1440
1441                         alloclen += rt->dst.trailer_len;
1442                         fraglen = datalen + fragheaderlen;
1443
1444                         /*
1445          * We just reserve space for the fragment header.
1446                          * Note: this may be overallocation if the message
1447                          * (without MSG_MORE) fits into the MTU.
1448                          */
1449                         alloclen += sizeof(struct frag_hdr);
1450
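                        /* The first skb of the datagram (transhdrlen !=
                         * 0) may sleep for send-buffer space unless
                         * MSG_DONTWAIT; follow-up skbs are allocated
                         * opportunistically, capped at twice sk_sndbuf
                         * so one corked datagram cannot pin unbounded
                         * memory.
                         */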
1451                         if (transhdrlen) {
1452                                 skb = sock_alloc_send_skb(sk,
1453                                                 alloclen + hh_len,
1454                                                 (flags & MSG_DONTWAIT), &err);
1455                         } else {
1456                                 skb = NULL;
1457                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1458                                     2 * sk->sk_sndbuf)
1459                                         skb = sock_wmalloc(sk,
1460                                                            alloclen + hh_len, 1,
1461                                                            sk->sk_allocation);
1462                                 if (unlikely(!skb))
1463                                         err = -ENOBUFS;
1464                         }
1465                         if (!skb)
1466                                 goto error;
1467                         /*
1468                          *      Fill in the control structures
1469                          */
1470                         skb->protocol = htons(ETH_P_IPV6);
1471                         skb->ip_summed = csummode;
1472                         skb->csum = 0;
1473                         /* reserve for fragmentation and ipsec header */
1474                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1475                                     dst_exthdrlen);
1476
1477                         /* Only the initial fragment is time stamped */
1478                         skb_shinfo(skb)->tx_flags = tx_flags;
1479                         tx_flags = 0;
1480                         skb_shinfo(skb)->tskey = tskey;
1481                         tskey = 0;
1482
1483                         /*
1484                          *      Find where to start putting bytes
1485                          */
1486                         data = skb_put(skb, fraglen);
1487                         skb_set_network_header(skb, exthdrlen);
1488                         data += fragheaderlen;
1489                         skb->transport_header = (skb->network_header +
1490                                                  fragheaderlen);
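                        /* If the previous skb overran maxfraglen, move
                         * the overhanging bytes (fraggap) into this
                         * fragment, trim the previous skb back, and fix
                         * up both checksums.
                         */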
1491                         if (fraggap) {
1492                                 skb->csum = skb_copy_and_csum_bits(
1493                                         skb_prev, maxfraglen,
1494                                         data + transhdrlen, fraggap, 0);
1495                                 skb_prev->csum = csum_sub(skb_prev->csum,
1496                                                           skb->csum);
1497                                 data += fraggap;
1498                                 pskb_trim_unique(skb_prev, maxfraglen);
1499                         }
1500                         copy = datalen - transhdrlen - fraggap;
1501
1502                         if (copy < 0) {
1503                                 err = -EINVAL;
1504                                 kfree_skb(skb);
1505                                 goto error;
1506                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1507                                 err = -EFAULT;
1508                                 kfree_skb(skb);
1509                                 goto error;
1510                         }
1511
1512                         offset += copy;
1513                         length -= datalen - fraggap;
1514                         transhdrlen = 0;
1515                         exthdrlen = 0;
1516                         dst_exthdrlen = 0;
1517
1518                         /*
1519                          * Put the packet on the pending queue
1520                          */
1521                         __skb_queue_tail(queue, skb);
1522                         continue;
1523                 }
1524
1525                 if (copy > length)
1526                         copy = length;
1527
1528                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1529                         unsigned int off;
1530
1531                         off = skb->len;
1532                         if (getfrag(from, skb_put(skb, copy),
1533                                                 offset, copy, off, skb) < 0) {
1534                                 __skb_trim(skb, off);
1535                                 err = -EFAULT;
1536                                 goto error;
1537                         }
1538                 } else {
1539                         int i = skb_shinfo(skb)->nr_frags;
1540
1541                         err = -ENOMEM;
1542                         if (!sk_page_frag_refill(sk, pfrag))
1543                                 goto error;
1544
1545                         if (!skb_can_coalesce(skb, i, pfrag->page,
1546                                               pfrag->offset)) {
1547                                 err = -EMSGSIZE;
1548                                 if (i == MAX_SKB_FRAGS)
1549                                         goto error;
1550
1551                                 __skb_fill_page_desc(skb, i, pfrag->page,
1552                                                      pfrag->offset, 0);
1553                                 skb_shinfo(skb)->nr_frags = ++i;
1554                                 get_page(pfrag->page);
1555                         }
1556                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1557                         if (getfrag(from,
1558                                     page_address(pfrag->page) + pfrag->offset,
1559                                     offset, copy, skb->len, skb) < 0)
1560                                 goto error_efault;
1561
1562                         pfrag->offset += copy;
1563                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1564                         skb->len += copy;
1565                         skb->data_len += copy;
1566                         skb->truesize += copy;
1567                         atomic_add(copy, &sk->sk_wmem_alloc);
1568                 }
1569                 offset += copy;
1570                 length -= copy;
1571         }
1572
1573         return 0;
1574
1575 error_efault:
1576         err = -EFAULT;
1577 error:
1578         cork->length -= length;
1579         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1580         return err;
1581 }
1582
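/*
 *	Append data to an existing, or newly corked, datagram on
 *	sk->sk_write_queue.  Repeated calls with MSG_MORE accumulate data
 *	into the same datagram, which is finally emitted by
 *	ip6_push_pending_frames() or dropped by
 *	ip6_flush_pending_frames().
 */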
1583 int ip6_append_data(struct sock *sk,
1584                     int getfrag(void *from, char *to, int offset, int len,
1585                                 int odd, struct sk_buff *skb),
1586                     void *from, int length, int transhdrlen,
1587                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1588                     struct rt6_info *rt, unsigned int flags,
1589                     const struct sockcm_cookie *sockc)
1590 {
1591         struct inet_sock *inet = inet_sk(sk);
1592         struct ipv6_pinfo *np = inet6_sk(sk);
1593         int exthdrlen;
1594         int err;
1595
1596         if (flags&MSG_PROBE)
1597                 return 0;
1598         if (skb_queue_empty(&sk->sk_write_queue)) {
1599                 /*
1600                  * setup for corking
1601                  */
1602                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1603                                      ipc6, rt, fl6);
1604                 if (err)
1605                         return err;
1606
1607                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1608                 length += exthdrlen;
1609                 transhdrlen += exthdrlen;
1610         } else {
1611                 fl6 = &inet->cork.fl.u.ip6;
1612                 transhdrlen = 0;
1613         }
1614
1615         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1616                                  &np->cork, sk_page_frag(sk), getfrag,
1617                                  from, length, transhdrlen, flags, ipc6, sockc);
1618 }
1619 EXPORT_SYMBOL_GPL(ip6_append_data);
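/*
 * Typical calling pattern (a sketch modelled on datagram users such as
 * l2tp_ip6; locking and error handling abbreviated):
 *
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *			      transhdrlen, &ipc6, &fl6, rt,
 *			      msg->msg_flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */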
1620
1621 static void ip6_cork_release(struct inet_cork_full *cork,
1622                              struct inet6_cork *v6_cork)
1623 {
1624         if (v6_cork->opt) {
1625                 kfree(v6_cork->opt->dst0opt);
1626                 kfree(v6_cork->opt->dst1opt);
1627                 kfree(v6_cork->opt->hopopt);
1628                 kfree(v6_cork->opt->srcrt);
1629                 kfree(v6_cork->opt);
1630                 v6_cork->opt = NULL;
1631         }
1632
1633         if (cork->base.dst) {
1634                 dst_release(cork->base.dst);
1635                 cork->base.dst = NULL;
1636                 cork->base.flags &= ~IPCORK_ALLFRAG;
1637         }
1638         memset(&cork->fl, 0, sizeof(cork->fl));
1639 }
1640
1641 struct sk_buff *__ip6_make_skb(struct sock *sk,
1642                                struct sk_buff_head *queue,
1643                                struct inet_cork_full *cork,
1644                                struct inet6_cork *v6_cork)
1645 {
1646         struct sk_buff *skb, *tmp_skb;
1647         struct sk_buff **tail_skb;
1648         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1649         struct ipv6_pinfo *np = inet6_sk(sk);
1650         struct net *net = sock_net(sk);
1651         struct ipv6hdr *hdr;
1652         struct ipv6_txoptions *opt = v6_cork->opt;
1653         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1654         struct flowi6 *fl6 = &cork->fl.u.ip6;
1655         unsigned char proto = fl6->flowi6_proto;
1656
1657         skb = __skb_dequeue(queue);
1658         if (!skb)
1659                 goto out;
1660         tail_skb = &(skb_shinfo(skb)->frag_list);
1661
1662         /* move skb->data up to the IP header, past any extension header room */
1663         if (skb->data < skb_network_header(skb))
1664                 __skb_pull(skb, skb_network_offset(skb));
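        /* Chain every remaining queued skb onto the first one via
         * frag_list; if the packet later needs on-the-wire
         * fragmentation, ip6_fragment() can split it on exactly these
         * boundaries without copying payload.
         */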
1665         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1666                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1667                 *tail_skb = tmp_skb;
1668                 tail_skb = &(tmp_skb->next);
1669                 skb->len += tmp_skb->len;
1670                 skb->data_len += tmp_skb->len;
1671                 skb->truesize += tmp_skb->truesize;
1672                 tmp_skb->destructor = NULL;
1673                 tmp_skb->sk = NULL;
1674         }
1675
1676         /* Allow local fragmentation. */
1677         skb->ignore_df = ip6_sk_ignore_df(sk);
1678
1679         *final_dst = fl6->daddr;
1680         __skb_pull(skb, skb_network_header_len(skb));
1681         if (opt && opt->opt_flen)
1682                 ipv6_push_frag_opts(skb, opt, &proto);
1683         if (opt && opt->opt_nflen)
1684                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1685
1686         skb_push(skb, sizeof(struct ipv6hdr));
1687         skb_reset_network_header(skb);
1688         hdr = ipv6_hdr(skb);
1689
1690         ip6_flow_hdr(hdr, v6_cork->tclass,
1691                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1692                                         np->autoflowlabel, fl6));
1693         hdr->hop_limit = v6_cork->hop_limit;
1694         hdr->nexthdr = proto;
1695         hdr->saddr = fl6->saddr;
1696         hdr->daddr = *final_dst;
1697
1698         skb->priority = sk->sk_priority;
1699         skb->mark = sk->sk_mark;
1700
1701         skb_dst_set(skb, dst_clone(&rt->dst));
1702         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1703         if (proto == IPPROTO_ICMPV6) {
1704                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1705
1706                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1707                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1708         }
1709
1710         ip6_cork_release(cork, v6_cork);
1711 out:
1712         return skb;
1713 }
1714
1715 int ip6_send_skb(struct sk_buff *skb)
1716 {
1717         struct net *net = sock_net(skb->sk);
1718         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1719         int err;
1720
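        /* ip6_local_out() can also return a positive NET_XMIT_* code
         * from the queueing layer; net_xmit_errno() treats NET_XMIT_CN
         * (congestion notified) as success and maps the other codes to
         * -ENOBUFS.
         */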
1721         err = ip6_local_out(net, skb->sk, skb);
1722         if (err) {
1723                 if (err > 0)
1724                         err = net_xmit_errno(err);
1725                 if (err)
1726                         IP6_INC_STATS(net, rt->rt6i_idev,
1727                                       IPSTATS_MIB_OUTDISCARDS);
1728         }
1729
1730         return err;
1731 }
1732
1733 int ip6_push_pending_frames(struct sock *sk)
1734 {
1735         struct sk_buff *skb;
1736
1737         skb = ip6_finish_skb(sk);
1738         if (!skb)
1739                 return 0;
1740
1741         return ip6_send_skb(skb);
1742 }
1743 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1744
1745 static void __ip6_flush_pending_frames(struct sock *sk,
1746                                        struct sk_buff_head *queue,
1747                                        struct inet_cork_full *cork,
1748                                        struct inet6_cork *v6_cork)
1749 {
1750         struct sk_buff *skb;
1751
1752         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1753                 if (skb_dst(skb))
1754                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1755                                       IPSTATS_MIB_OUTDISCARDS);
1756                 kfree_skb(skb);
1757         }
1758
1759         ip6_cork_release(cork, v6_cork);
1760 }
1761
1762 void ip6_flush_pending_frames(struct sock *sk)
1763 {
1764         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1765                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1766 }
1767 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1768
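/*
 * Unlike the ip6_append_data() path, ip6_make_skb() assembles the whole
 * datagram on a private queue in one call, so no corking state is left
 * on the socket; the caller hands the finished skb to ip6_send_skb().
 * This is what makes the lockless UDPv6 transmit path possible.
 */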
1769 struct sk_buff *ip6_make_skb(struct sock *sk,
1770                              int getfrag(void *from, char *to, int offset,
1771                                          int len, int odd, struct sk_buff *skb),
1772                              void *from, int length, int transhdrlen,
1773                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1774                              struct rt6_info *rt, unsigned int flags,
1775                              const struct sockcm_cookie *sockc)
1776 {
1777         struct inet_cork_full cork;
1778         struct inet6_cork v6_cork;
1779         struct sk_buff_head queue;
1780         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1781         int err;
1782
1783         if (flags & MSG_PROBE)
1784                 return NULL;
1785
1786         __skb_queue_head_init(&queue);
1787
1788         cork.base.flags = 0;
1789         cork.base.addr = 0;
1790         cork.base.opt = NULL;
1791         v6_cork.opt = NULL;
1792         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1793         if (err)
1794                 return ERR_PTR(err);
1795
1796         if (ipc6->dontfrag < 0)
1797                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1798
1799         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1800                                 &current->task_frag, getfrag, from,
1801                                 length + exthdrlen, transhdrlen + exthdrlen,
1802                                 flags, ipc6, sockc);
1803         if (err) {
1804                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1805                 return ERR_PTR(err);
1806         }
1807
1808         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1809 }