datapath-windows: Refactor software offloads and mss
[cascardo/ovs.git] / datapath-windows / ovsext / Vxlan.c
1 /*
2  * Copyright (c) 2014, 2016 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18
19 #include "Atomic.h"
20 #include "Debug.h"
21 #include "Flow.h"
22 #include "Flow.h"
23 #include "IpHelper.h"
24 #include "NetProto.h"
25 #include "Offload.h"
26 #include "PacketIO.h"
27 #include "PacketParser.h"
28 #include "Switch.h"
29 #include "User.h"
30 #include "Vport.h"
31 #include "Vxlan.h"
32
33 #pragma warning( push )
34 #pragma warning( disable:4127 )
35
36
37 #ifdef OVS_DBG_MOD
38 #undef OVS_DBG_MOD
39 #endif
40 #define OVS_DBG_MOD OVS_DBG_VXLAN
41
/* Helper macro to check if a VXLAN ID is valid (non-zero, at most 24 bits). */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)

/*
 * Conversions between the 64-bit tunnel ID and the VNI: the VNI occupies
 * bits 40..63 of the tunnel ID. Both expansions are fully parenthesized so
 * they can be used safely inside larger expressions.
 */
#define VXLAN_TUNNELID_TO_VNI(_tID) ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)

/* Value stored in the IP 'frag_off' field when "don't fragment" is set. */
#define IP_DF_NBO 0x0040
/* TTL used for the outer IP header when the tunnel key does not supply one. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64
/* Value of the VXLAN header 'instanceID' field when a VNI is present. */
#define VXLAN_DEFAULT_INSTANCE_ID 1
50
51 /* Move to a header file */
52 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
53
54 /*
55  *----------------------------------------------------------------------------
56  * This function verifies if the VXLAN tunnel already exists, in order to
57  * avoid sending a duplicate request to the WFP base filtering engine.
58  *----------------------------------------------------------------------------
59  */
60 static BOOLEAN
61 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
62                          UINT16 udpPortDest)
63 {
64     for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
65         PLIST_ENTRY head, link, next;
66
67         head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
68         LIST_FORALL_SAFE(head, link, next) {
69             POVS_VPORT_ENTRY vport = NULL;
70             POVS_VXLAN_VPORT vxlanPort = NULL;
71             vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
72             vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
73             if (vxlanPort) {
74                 if ((udpPortDest == vxlanPort->dstPort)) {
75                     /* The VXLAN tunnel was already created. */
76                     return TRUE;
77                 }
78             }
79         }
80     }
81
82     return FALSE;
83 }
84
85 /*
86  *----------------------------------------------------------------------------
87  * This function allocates and initializes the OVS_VXLAN_VPORT. The function
88  * also creates a WFP tunnel filter for the necessary destination port. The
89  * tunnel filter create request is passed to the tunnel filter threads that
90  * will complete the request at a later time when IRQL is lowered to
91  * PASSIVE_LEVEL.
92  *
93  * udpDestPort: the vxlan is set as payload to a udp frame. If the destination
94  * port of an udp frame is udpDestPort, we understand it to be vxlan.
95  *----------------------------------------------------------------------------
96  */
97 NTSTATUS
98 OvsInitVxlanTunnel(PIRP irp,
99                    POVS_VPORT_ENTRY vport,
100                    UINT16 udpDestPort,
101                    PFNTunnelVportPendingOp callback,
102                    PVOID tunnelContext)
103 {
104     NTSTATUS status = STATUS_SUCCESS;
105     POVS_VXLAN_VPORT vxlanPort = NULL;
106
107     vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
108                                          OVS_VXLAN_POOL_TAG);
109     if (vxlanPort == NULL) {
110         return STATUS_INSUFFICIENT_RESOURCES;
111     }
112
113     RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
114     vxlanPort->dstPort = udpDestPort;
115     vport->priv = (PVOID)vxlanPort;
116
117     if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
118         status = OvsTunnelFilterCreate(irp,
119                                        udpDestPort,
120                                        &vxlanPort->filterID,
121                                        callback,
122                                        tunnelContext);
123     } else {
124         status = STATUS_OBJECT_NAME_EXISTS;
125     }
126
127     return status;
128 }
129
130 /*
131  *----------------------------------------------------------------------------
132  * This function releases the OVS_VXLAN_VPORT. The function also deletes the
133  * WFP tunnel filter previously created. The tunnel filter delete request is
134  * passed to the tunnel filter threads that will complete the request at a
135  * later time when IRQL is lowered to PASSIVE_LEVEL.
136  *----------------------------------------------------------------------------
137  */
138 NTSTATUS
139 OvsCleanupVxlanTunnel(PIRP irp,
140                       POVS_VPORT_ENTRY vport,
141                       PFNTunnelVportPendingOp callback,
142                       PVOID tunnelContext)
143 {
144     NTSTATUS status = STATUS_SUCCESS;
145     POVS_VXLAN_VPORT vxlanPort = NULL;
146
147     if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
148         vport->priv == NULL) {
149         return STATUS_SUCCESS;
150     }
151
152     vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
153
154     if (vxlanPort->filterID != 0) {
155         status = OvsTunnelFilterDelete(irp,
156                                        vxlanPort->filterID,
157                                        callback,
158                                        tunnelContext);
159     } else {
160         OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
161         vport->priv = NULL;
162     }
163
164     return status;
165 }
166
167
168 /*
169  *----------------------------------------------------------------------------
170  * OvsDoEncapVxlan
171  *     Encapsulates the packet.
172  *----------------------------------------------------------------------------
173  */
174 static __inline NDIS_STATUS
175 OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
176                 PNET_BUFFER_LIST curNbl,
177                 OvsIPv4TunnelKey *tunKey,
178                 POVS_FWD_INFO fwdInfo,
179                 POVS_PACKET_HDR_INFO layers,
180                 POVS_SWITCH_CONTEXT switchContext,
181                 PNET_BUFFER_LIST *newNbl)
182 {
183     NDIS_STATUS status;
184     PNET_BUFFER curNb;
185     PMDL curMdl;
186     PUINT8 bufferStart;
187     EthHdr *ethHdr;
188     IPHdr *ipHdr;
189     UDPHdr *udpHdr;
190     VXLANHdr *vxlanHdr;
191     POVS_VXLAN_VPORT vportVxlan;
192     UINT32 headRoom = OvsGetVxlanTunHdrSize();
193     UINT32 packetLength;
194     ULONG mss = 0;
195
196     /*
197      * XXX: the assumption currently is that the NBL is owned by OVS, and
198      * headroom has already been allocated as part of allocating the NBL and
199      * MDL.
200      */
201     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
202     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
203
204     if (layers->isTcp) {
205         mss = OVSGetTcpMSS(curNbl);
206
207         OVS_LOG_TRACE("MSS %u packet len %u", mss,
208                       packetLength);
209         if (mss) {
210             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
211             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
212                                        mss, headRoom);
213             if (*newNbl == NULL) {
214                 OVS_LOG_ERROR("Unable to segment NBL");
215                 return NDIS_STATUS_FAILURE;
216             }
217             /* Clear out LSO flags after this point */
218             NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) = 0;
219         }
220     }
221
222     vportVxlan = (POVS_VXLAN_VPORT) GetOvsVportPriv(vport);
223     ASSERT(vportVxlan);
224
225     /* If we didn't split the packet above, make a copy now */
226     if (*newNbl == NULL) {
227         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
228                                     FALSE /*NBL info*/);
229         if (*newNbl == NULL) {
230             OVS_LOG_ERROR("Unable to copy NBL");
231             return NDIS_STATUS_FAILURE;
232         }
233         NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
234         csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
235                                               TcpIpChecksumNetBufferListInfo);
236         status = OvsApplySWChecksumOnNB(layers, *newNbl, &csumInfo);
237
238         if (status != NDIS_STATUS_SUCCESS) {
239             goto ret_error;
240         }
241     }
242
243     curNbl = *newNbl;
244     for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
245             curNb = curNb->Next) {
246         status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
247         if (status != NDIS_STATUS_SUCCESS) {
248             goto ret_error;
249         }
250
251         curMdl = NET_BUFFER_CURRENT_MDL(curNb);
252         bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
253         if (!bufferStart) {
254             status = NDIS_STATUS_RESOURCES;
255             goto ret_error;
256         }
257
258         bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
259         if (NET_BUFFER_NEXT_NB(curNb)) {
260             OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
261                           NET_BUFFER_DATA_LENGTH(curNb->Next));
262         }
263
264         /* L2 header */
265         ethHdr = (EthHdr *)bufferStart;
266         ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
267                (PCHAR)&fwdInfo->srcMacAddr);
268         NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
269                        sizeof ethHdr->Destination + sizeof ethHdr->Source);
270         ethHdr->Type = htons(ETH_TYPE_IPV4);
271
272         /* IP header */
273         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
274
275         ipHdr->ihl = sizeof *ipHdr / 4;
276         ipHdr->version = IPPROTO_IPV4;
277         ipHdr->tos = tunKey->tos;
278         ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
279         ipHdr->id = (uint16)atomic_add64(&vportVxlan->ipId,
280                                          NET_BUFFER_DATA_LENGTH(curNb));
281         ipHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
282                           IP_DF_NBO : 0;
283         ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
284         ipHdr->protocol = IPPROTO_UDP;
285         ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
286         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
287         ipHdr->saddr = fwdInfo->srcIpAddr;
288         ipHdr->daddr = fwdInfo->dstIpAddr;
289
290         ipHdr->check = 0;
291         ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
292
293         /* UDP header */
294         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
295         udpHdr->source = htons(tunKey->flow_hash | MAXINT16);
296         udpHdr->dest = htons(vportVxlan->dstPort);
297         udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
298                             sizeof *udpHdr + sizeof *vxlanHdr);
299         udpHdr->check = 0;
300
301         /* VXLAN header */
302         vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
303         vxlanHdr->flags1 = 0;
304         vxlanHdr->locallyReplicate = 0;
305         vxlanHdr->flags2 = 0;
306         vxlanHdr->reserved1 = 0;
307         if (tunKey->flags | OVS_TNL_F_KEY) {
308             vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
309             vxlanHdr->instanceID = 1;
310         }
311         vxlanHdr->reserved2 = 0;
312     }
313     return STATUS_SUCCESS;
314
315 ret_error:
316     OvsCompleteNBL(switchContext, *newNbl, TRUE);
317     *newNbl = NULL;
318     return status;
319 }
320
321
322 /*
323  *----------------------------------------------------------------------------
324  * OvsEncapVxlan --
325  *     Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
326  *     enqueues a callback that does encapsulatation after resolution.
327  *----------------------------------------------------------------------------
328  */
329 NDIS_STATUS
330 OvsEncapVxlan(POVS_VPORT_ENTRY vport,
331               PNET_BUFFER_LIST curNbl,
332               OvsIPv4TunnelKey *tunKey,
333               POVS_SWITCH_CONTEXT switchContext,
334               POVS_PACKET_HDR_INFO layers,
335               PNET_BUFFER_LIST *newNbl)
336 {
337     NTSTATUS status;
338     OVS_FWD_INFO fwdInfo;
339
340     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
341     if (status != STATUS_SUCCESS) {
342         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
343         // return NDIS_STATUS_PENDING;
344         /*
345          * XXX: Don't know if the completionList will make any sense when
346          * accessed in the callback. Make sure the caveats are known.
347          *
348          * XXX: This code will work once we are able to grab locks in the
349          * callback.
350          */
351         return NDIS_STATUS_FAILURE;
352     }
353
354     return OvsDoEncapVxlan(vport, curNbl, tunKey, &fwdInfo, layers,
355                            switchContext, newNbl);
356 }
357
358 /*
359  *----------------------------------------------------------------------------
360  * OvsCalculateUDPChecksum
361  *     Calculate UDP checksum
362  *----------------------------------------------------------------------------
363  */
364 static __inline NDIS_STATUS
365 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
366                         PNET_BUFFER curNb,
367                         IPHdr *ipHdr,
368                         UDPHdr *udpHdr,
369                         UINT32 packetLength)
370 {
371     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
372     UINT16 checkSum;
373
374     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
375
376     /* Next check if UDP checksum has been calculated. */
377     if (!csumInfo.Receive.UdpChecksumSucceeded) {
378         UINT32 l4Payload;
379
380         checkSum = udpHdr->check;
381
382         l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
383         udpHdr->check = 0;
384         udpHdr->check =
385             IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
386                              (UINT32 *)&ipHdr->daddr,
387                              IPPROTO_UDP, (UINT16)l4Payload);
388         udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
389             sizeof(EthHdr) + ipHdr->ihl * 4);
390         if (checkSum != udpHdr->check) {
391             OVS_LOG_TRACE("UDP checksum incorrect.");
392             return NDIS_STATUS_INVALID_PACKET;
393         }
394     }
395
396     csumInfo.Receive.UdpChecksumSucceeded = 1;
397     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
398     return NDIS_STATUS_SUCCESS;
399 }
400
401 /*
402  *----------------------------------------------------------------------------
403  * OvsDecapVxlan
404  *     Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
405  *----------------------------------------------------------------------------
406  */
407 NDIS_STATUS
408 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
409               PNET_BUFFER_LIST curNbl,
410               OvsIPv4TunnelKey *tunKey,
411               PNET_BUFFER_LIST *newNbl)
412 {
413     PNET_BUFFER curNb;
414     PMDL curMdl;
415     EthHdr *ethHdr;
416     IPHdr *ipHdr;
417     UDPHdr *udpHdr;
418     VXLANHdr *vxlanHdr;
419     UINT32 tunnelSize = 0, packetLength = 0;
420     PUINT8 bufferStart;
421     NDIS_STATUS status;
422
423     /* Check the length of the UDP payload */
424     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
425     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
426     tunnelSize = OvsGetVxlanTunHdrSize();
427     if (packetLength <= tunnelSize) {
428         return NDIS_STATUS_INVALID_LENGTH;
429     }
430
431     /*
432      * Create a copy of the NBL so that we have all the headers in one MDL.
433      */
434     *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
435                                 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
436                                 TRUE /*copy NBL info */);
437
438     if (*newNbl == NULL) {
439         return NDIS_STATUS_RESOURCES;
440     }
441
442     /* XXX: Handle VLAN header. */
443     curNbl = *newNbl;
444     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
445     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
446     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
447                   NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
448     if (!bufferStart) {
449         status = NDIS_STATUS_RESOURCES;
450         goto dropNbl;
451     }
452
453     ethHdr = (EthHdr *)bufferStart;
454     /* XXX: Handle IP options. */
455     ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
456     tunKey->src = ipHdr->saddr;
457     tunKey->dst = ipHdr->daddr;
458     tunKey->tos = ipHdr->tos;
459     tunKey->ttl = ipHdr->ttl;
460     tunKey->pad = 0;
461     udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
462
463     /* Validate if NIC has indicated checksum failure. */
464     status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
465     if (status != NDIS_STATUS_SUCCESS) {
466         goto dropNbl;
467     }
468
469     /* Calculate and verify UDP checksum if NIC didn't do it. */
470     if (udpHdr->check != 0) {
471         status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
472         if (status != NDIS_STATUS_SUCCESS) {
473             goto dropNbl;
474         }
475     }
476
477     vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
478     if (vxlanHdr->instanceID) {
479         tunKey->flags = OVS_TNL_F_KEY;
480         tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
481     } else {
482         tunKey->flags = 0;
483         tunKey->tunnelId = 0;
484     }
485
486     /* Clear out the receive flag for the inner packet. */
487     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
488     NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
489     return NDIS_STATUS_SUCCESS;
490
491 dropNbl:
492     OvsCompleteNBL(switchContext, *newNbl, TRUE);
493     *newNbl = NULL;
494     return status;
495 }
496
497
498 NDIS_STATUS
499 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
500                    OvsIPv4TunnelKey *tunnelKey)
501 {
502     NDIS_STATUS status = NDIS_STATUS_FAILURE;
503     UDPHdr udpStorage;
504     const UDPHdr *udp;
505     VXLANHdr *VxlanHeader;
506     VXLANHdr  VxlanHeaderBuffer;
507     struct IPHdr ip_storage;
508     const struct IPHdr *nh;
509     OVS_PACKET_HDR_INFO layers;
510
511     layers.value = 0;
512
513     do {
514         nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
515         if (nh) {
516             layers.l4Offset = layers.l3Offset + nh->ihl * 4;
517         } else {
518             break;
519         }
520
521         /* make sure it's a VXLAN packet */
522         udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
523         if (udp) {
524             layers.l7Offset = layers.l4Offset + sizeof *udp;
525         } else {
526             break;
527         }
528
529         VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
530                                                     sizeof(*VxlanHeader),
531                                                     layers.l7Offset,
532                                                     &VxlanHeaderBuffer);
533
534         if (VxlanHeader) {
535             tunnelKey->src = nh->saddr;
536             tunnelKey->dst = nh->daddr;
537             tunnelKey->ttl = nh->ttl;
538             tunnelKey->tos = nh->tos;
539             if (VxlanHeader->instanceID) {
540                 tunnelKey->flags = OVS_TNL_F_KEY;
541                 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
542             } else {
543                 tunnelKey->flags = 0;
544                 tunnelKey->tunnelId = 0;
545             }
546         } else {
547             break;
548         }
549         status = NDIS_STATUS_SUCCESS;
550
551     } while(FALSE);
552
553     return status;
554 }
555
556 #pragma warning( pop )