datapath-windows: Removed memory barrier and master lock
[cascardo/ovs.git] / datapath-windows / ovsext / Vxlan.c
1 /*
2  * Copyright (c) 2014 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18 #include "NetProto.h"
19 #include "Switch.h"
20 #include "Vport.h"
21 #include "Flow.h"
22 #include "Vxlan.h"
23 #include "IpHelper.h"
24 #include "Checksum.h"
25 #include "User.h"
26 #include "PacketIO.h"
27 #include "Flow.h"
28 #include "PacketParser.h"
29
30 #pragma warning( push )
31 #pragma warning( disable:4127 )
32
33
34 #ifdef OVS_DBG_MOD
35 #undef OVS_DBG_MOD
36 #endif
37 #define OVS_DBG_MOD OVS_DBG_VXLAN
38 #include "Debug.h"
39
/* Helper macro to check if a VXLAN ID is valid: non-zero and at most 24 bits. */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)
/*
 * The VNI is carried in the upper 24 bits of the 64-bit tunnel ID. The whole
 * expansion is parenthesized so the macro can be used safely inside larger
 * expressions (e.g. as a sizeof operand).
 */
#define VXLAN_TUNNELID_TO_VNI(_tID)   ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)
/* Value stored as-is (no byte swap) into the outer IP header's frag_off. */
#define IP_DF_NBO 0x0040
/* TTL used for the outer IP header when the tunnel key does not supply one. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64
#define VXLAN_DEFAULT_INSTANCE_ID 1
48
49 /* Move to a header file */
50 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
51
52 /*
53  *----------------------------------------------------------------------------
54  * This function verifies if the VXLAN tunnel already exists, in order to
55  * avoid sending a duplicate request to the WFP base filtering engine.
56  *----------------------------------------------------------------------------
57  */
58 static BOOLEAN
59 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
60                          UINT16 udpPortDest)
61 {
62     for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
63         PLIST_ENTRY head, link, next;
64
65         head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
66         LIST_FORALL_SAFE(head, link, next) {
67             POVS_VPORT_ENTRY vport = NULL;
68             POVS_VXLAN_VPORT vxlanPort = NULL;
69             vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
70             vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
71             if (vxlanPort) {
72                 if ((udpPortDest == vxlanPort->dstPort)) {
73                     /* The VXLAN tunnel was already created. */
74                     return TRUE;
75                 }
76             }
77         }
78     }
79
80     return FALSE;
81 }
82
83 /*
84  *----------------------------------------------------------------------------
85  * This function allocates and initializes the OVS_VXLAN_VPORT. The function
86  * also creates a WFP tunnel filter for the necessary destination port. The
87  * tunnel filter create request is passed to the tunnel filter threads that
88  * will complete the request at a later time when IRQL is lowered to
89  * PASSIVE_LEVEL.
90  *
91  * udpDestPort: the vxlan is set as payload to a udp frame. If the destination
92  * port of an udp frame is udpDestPort, we understand it to be vxlan.
93  *----------------------------------------------------------------------------
94  */
95 NTSTATUS
96 OvsInitVxlanTunnel(PIRP irp,
97                    POVS_VPORT_ENTRY vport,
98                    UINT16 udpDestPort,
99                    PFNTunnelVportPendingOp callback,
100                    PVOID tunnelContext)
101 {
102     NTSTATUS status = STATUS_SUCCESS;
103     POVS_VXLAN_VPORT vxlanPort = NULL;
104
105     vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
106                                          OVS_VXLAN_POOL_TAG);
107     if (vxlanPort == NULL) {
108         return STATUS_INSUFFICIENT_RESOURCES;
109     }
110
111     RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
112     vxlanPort->dstPort = udpDestPort;
113     vport->priv = (PVOID)vxlanPort;
114
115     if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
116         status = OvsTunelFilterCreate(irp,
117                                       udpDestPort,
118                                       &vxlanPort->filterID,
119                                       callback,
120                                       tunnelContext);
121     } else {
122         status = STATUS_OBJECT_NAME_EXISTS;
123     }
124
125     return status;
126 }
127
128 /*
129  *----------------------------------------------------------------------------
130  * This function releases the OVS_VXLAN_VPORT. The function also deletes the
131  * WFP tunnel filter previously created. The tunnel filter delete request is
132  * passed to the tunnel filter threads that will complete the request at a
133  * later time when IRQL is lowered to PASSIVE_LEVEL.
134  *----------------------------------------------------------------------------
135  */
136 NTSTATUS
137 OvsCleanupVxlanTunnel(PIRP irp,
138                       POVS_VPORT_ENTRY vport,
139                       PFNTunnelVportPendingOp callback,
140                       PVOID tunnelContext)
141 {
142     NTSTATUS status = STATUS_SUCCESS;
143     POVS_VXLAN_VPORT vxlanPort = NULL;
144
145     if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
146         vport->priv == NULL) {
147         return STATUS_SUCCESS;
148     }
149
150     vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
151
152     if (vxlanPort->filterID != 0) {
153         status = OvsTunelFilterDelete(irp,
154                                       vxlanPort->filterID,
155                                       callback,
156                                       tunnelContext);
157     }
158
159     OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
160     vport->priv = NULL;
161
162     return status;
163 }
164
165
166 /*
167  *----------------------------------------------------------------------------
168  * OvsDoEncapVxlan
169  *     Encapsulates the packet.
170  *----------------------------------------------------------------------------
171  */
172 static __inline NDIS_STATUS
173 OvsDoEncapVxlan(PNET_BUFFER_LIST curNbl,
174                 OvsIPv4TunnelKey *tunKey,
175                 POVS_FWD_INFO fwdInfo,
176                 POVS_PACKET_HDR_INFO layers,
177                 POVS_SWITCH_CONTEXT switchContext,
178                 PNET_BUFFER_LIST *newNbl)
179 {
180     NDIS_STATUS status;
181     PNET_BUFFER curNb;
182     PMDL curMdl;
183     PUINT8 bufferStart;
184     EthHdr *ethHdr;
185     IPHdr *ipHdr;
186     UDPHdr *udpHdr;
187     VXLANHdr *vxlanHdr;
188     UINT32 headRoom = OvsGetVxlanTunHdrSize();
189     UINT32 packetLength;
190
191     /*
192      * XXX: the assumption currently is that the NBL is owned by OVS, and
193      * headroom has already been allocated as part of allocating the NBL and
194      * MDL.
195      */
196     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
197     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
198     if (layers->isTcp) {
199         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
200
201         tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
202                 TcpLargeSendNetBufferListInfo);
203         OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
204         if (tsoInfo.LsoV1Transmit.MSS) {
205             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
206             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
207                         tsoInfo.LsoV1Transmit.MSS, headRoom);
208             if (*newNbl == NULL) {
209                 OVS_LOG_ERROR("Unable to segment NBL");
210                 return NDIS_STATUS_FAILURE;
211             }
212         }
213     }
214     /* If we didn't split the packet above, make a copy now */
215     if (*newNbl == NULL) {
216         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
217                                     FALSE /*NBL info*/);
218         if (*newNbl == NULL) {
219             OVS_LOG_ERROR("Unable to copy NBL");
220             return NDIS_STATUS_FAILURE;
221         }
222     }
223
224     curNbl = *newNbl;
225     for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
226             curNb = curNb->Next) {
227         status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
228         if (status != NDIS_STATUS_SUCCESS) {
229             goto ret_error;
230         }
231
232         curMdl = NET_BUFFER_CURRENT_MDL(curNb);
233         bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
234         if (!bufferStart) {
235             status = NDIS_STATUS_RESOURCES;
236             goto ret_error;
237         }
238
239         bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
240         if (NET_BUFFER_NEXT_NB(curNb)) {
241             OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
242                           NET_BUFFER_DATA_LENGTH(curNb->Next));
243         }
244
245         /* L2 header */
246         ethHdr = (EthHdr *)bufferStart;
247         NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
248                        sizeof ethHdr->Destination + sizeof ethHdr->Source);
249         ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
250                (PCHAR)&fwdInfo->srcMacAddr);
251         ethHdr->Type = htons(ETH_TYPE_IPV4);
252
253         // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
254         // should we use those values instead? or will they end up being
255         // uninitialized;
256         /* IP header */
257         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
258
259         ipHdr->ihl = sizeof *ipHdr / 4;
260         ipHdr->version = IPV4;
261         ipHdr->tos = 0;
262         ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
263         ipHdr->id = 0;
264         ipHdr->frag_off = IP_DF_NBO;
265         ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
266         ipHdr->protocol = IPPROTO_UDP;
267         ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
268         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
269         ipHdr->saddr = fwdInfo->srcIpAddr;
270         ipHdr->daddr = fwdInfo->dstIpAddr;
271         ipHdr->check = 0;
272         ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
273
274         /* UDP header */
275         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
276         udpHdr->source = htons(tunKey->flow_hash | 32768);
277         udpHdr->dest = htons(tunKey->dst_port);
278         udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
279                             sizeof *udpHdr + sizeof *vxlanHdr);
280         udpHdr->check = 0;
281
282         /* VXLAN header */
283         vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
284         vxlanHdr->flags1 = 0;
285         vxlanHdr->locallyReplicate = 0;
286         vxlanHdr->flags2 = 0;
287         vxlanHdr->reserved1 = 0;
288         if (tunKey->flags | OVS_TNL_F_KEY) {
289             vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
290             vxlanHdr->instanceID = 1;
291         }
292         vxlanHdr->reserved2 = 0;
293     }
294     return STATUS_SUCCESS;
295
296 ret_error:
297     OvsCompleteNBL(switchContext, *newNbl, TRUE);
298     *newNbl = NULL;
299     return status;
300 }
301
302
303 /*
304  *----------------------------------------------------------------------------
305  * OvsEncapVxlan --
306  *     Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
307  *     enqueues a callback that does encapsulatation after resolution.
308  *----------------------------------------------------------------------------
309  */
310 NDIS_STATUS
311 OvsEncapVxlan(PNET_BUFFER_LIST curNbl,
312               OvsIPv4TunnelKey *tunKey,
313               POVS_SWITCH_CONTEXT switchContext,
314               VOID *completionList,
315               POVS_PACKET_HDR_INFO layers,
316               PNET_BUFFER_LIST *newNbl)
317 {
318     NTSTATUS status;
319     OVS_FWD_INFO fwdInfo;
320     UNREFERENCED_PARAMETER(completionList);
321
322     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
323     if (status != STATUS_SUCCESS) {
324         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
325         // return NDIS_STATUS_PENDING;
326         /*
327          * XXX: Don't know if the completionList will make any sense when
328          * accessed in the callback. Make sure the caveats are known.
329          *
330          * XXX: This code will work once we are able to grab locks in the
331          * callback.
332          */
333         return NDIS_STATUS_FAILURE;
334     }
335
336     return OvsDoEncapVxlan(curNbl, tunKey, &fwdInfo, layers,
337                            switchContext, newNbl);
338 }
339
340
341 /*
342  *----------------------------------------------------------------------------
343  * OvsIpHlprCbVxlan --
344  *     Callback function for IP helper.
345  *     XXX: not used currently
346  *----------------------------------------------------------------------------
347  */
348 static VOID
349 OvsIpHlprCbVxlan(PNET_BUFFER_LIST curNbl,
350                  UINT32 inPort,
351                  OvsIPv4TunnelKey *tunKey,
352                  PVOID cbData1,
353                  PVOID cbData2,
354                  NTSTATUS result,
355                  POVS_FWD_INFO fwdInfo)
356 {
357     OVS_PACKET_HDR_INFO layers;
358     OvsFlowKey key;
359     NDIS_STATUS status;
360     UNREFERENCED_PARAMETER(inPort);
361
362     status = OvsExtractFlow(curNbl, inPort, &key, &layers, NULL);
363     if (result == STATUS_SUCCESS) {
364         status = OvsDoEncapVxlan(curNbl, tunKey, fwdInfo, &layers,
365                 (POVS_SWITCH_CONTEXT)cbData1, NULL);
366     } else {
367         status = NDIS_STATUS_FAILURE;
368     }
369
370     if (status != NDIS_STATUS_SUCCESS) {
371         // XXX: Free up the NBL;
372         return;
373     }
374
375     OvsLookupFlowOutput((POVS_SWITCH_CONTEXT)cbData1, cbData2, curNbl);
376 }
377
378 /*
379  *----------------------------------------------------------------------------
380  * OvsCalculateUDPChecksum
381  *     Calculate UDP checksum
382  *----------------------------------------------------------------------------
383  */
384 static __inline NDIS_STATUS
385 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
386                         PNET_BUFFER curNb,
387                         IPHdr *ipHdr,
388                         UDPHdr *udpHdr,
389                         UINT32 packetLength)
390 {
391     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
392     UINT16 checkSum;
393
394     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
395
396     /* Next check if UDP checksum has been calculated. */
397     if (!csumInfo.Receive.UdpChecksumSucceeded) {
398         UINT32 l4Payload;
399
400         checkSum = udpHdr->check;
401
402         l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
403         udpHdr->check = 0;
404         udpHdr->check =
405             IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
406                              (UINT32 *)&ipHdr->daddr,
407                              IPPROTO_UDP, (UINT16)l4Payload);
408         udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
409             sizeof(EthHdr) + ipHdr->ihl * 4);
410         if (checkSum != udpHdr->check) {
411             OVS_LOG_TRACE("UDP checksum incorrect.");
412             return NDIS_STATUS_INVALID_PACKET;
413         }
414     }
415
416     csumInfo.Receive.UdpChecksumSucceeded = 1;
417     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
418     return NDIS_STATUS_SUCCESS;
419 }
420
421 /*
422  *----------------------------------------------------------------------------
423  * OvsDoDecapVxlan
424  *     Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
425  *----------------------------------------------------------------------------
426  */
427 NDIS_STATUS
428 OvsDoDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
429                 PNET_BUFFER_LIST curNbl,
430                 OvsIPv4TunnelKey *tunKey,
431                 PNET_BUFFER_LIST *newNbl)
432 {
433     PNET_BUFFER curNb;
434     PMDL curMdl;
435     EthHdr *ethHdr;
436     IPHdr *ipHdr;
437     UDPHdr *udpHdr;
438     VXLANHdr *vxlanHdr;
439     UINT32 tunnelSize = 0, packetLength = 0;
440     PUINT8 bufferStart;
441     NDIS_STATUS status;
442
443     /* Check the the length of the UDP payload */
444     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
445     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
446     tunnelSize = OvsGetVxlanTunHdrSize();
447     if (packetLength <= tunnelSize) {
448         return NDIS_STATUS_INVALID_LENGTH;
449     }
450
451     /*
452      * Create a copy of the NBL so that we have all the headers in one MDL.
453      */
454     *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
455                                 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
456                                 TRUE /*copy NBL info */);
457
458     if (*newNbl == NULL) {
459         return NDIS_STATUS_RESOURCES;
460     }
461
462     /* XXX: Handle VLAN header. */
463     curNbl = *newNbl;
464     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
465     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
466     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
467                   NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
468     if (!bufferStart) {
469         status = NDIS_STATUS_RESOURCES;
470         goto dropNbl;
471     }
472
473     ethHdr = (EthHdr *)bufferStart;
474     /* XXX: Handle IP options. */
475     ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
476     tunKey->src = ipHdr->saddr;
477     tunKey->dst = ipHdr->daddr;
478     tunKey->tos = ipHdr->tos;
479     tunKey->ttl = ipHdr->ttl;
480     tunKey->pad = 0;
481     udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
482
483     /* Validate if NIC has indicated checksum failure. */
484     status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
485     if (status != NDIS_STATUS_SUCCESS) {
486         goto dropNbl;
487     }
488
489     /* Calculate and verify UDP checksum if NIC didn't do it. */
490     if (udpHdr->check != 0) {
491         status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
492         if (status != NDIS_STATUS_SUCCESS) {
493             goto dropNbl;
494         }
495     }
496
497     vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
498     if (vxlanHdr->instanceID) {
499         tunKey->flags = OVS_TNL_F_KEY;
500         tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
501     } else {
502         tunKey->flags = 0;
503         tunKey->tunnelId = 0;
504     }
505
506     /* Clear out the receive flag for the inner packet. */
507     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
508     NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
509     return NDIS_STATUS_SUCCESS;
510
511 dropNbl:
512     OvsCompleteNBL(switchContext, *newNbl, TRUE);
513     *newNbl = NULL;
514     return status;
515 }
516
517
518 NDIS_STATUS
519 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
520                    OvsIPv4TunnelKey *tunnelKey)
521 {
522     NDIS_STATUS status = NDIS_STATUS_FAILURE;
523     UDPHdr udpStorage;
524     const UDPHdr *udp;
525     VXLANHdr *VxlanHeader;
526     VXLANHdr  VxlanHeaderBuffer;
527     struct IPHdr ip_storage;
528     const struct IPHdr *nh;
529     OVS_PACKET_HDR_INFO layers;
530
531     layers.value = 0;
532
533     do {
534         nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
535         if (nh) {
536             layers.l4Offset = layers.l3Offset + nh->ihl * 4;
537         } else {
538             break;
539         }
540
541         /* make sure it's a VXLAN packet */
542         udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
543         if (udp) {
544             layers.l7Offset = layers.l4Offset + sizeof *udp;
545         } else {
546             break;
547         }
548
549         VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
550                                                     sizeof(*VxlanHeader),
551                                                     layers.l7Offset,
552                                                     &VxlanHeaderBuffer);
553
554         if (VxlanHeader) {
555             tunnelKey->src = nh->saddr;
556             tunnelKey->dst = nh->daddr;
557             tunnelKey->ttl = nh->ttl;
558             tunnelKey->tos = nh->tos;
559             if (VxlanHeader->instanceID) {
560                 tunnelKey->flags = OVS_TNL_F_KEY;
561                 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
562             } else {
563                 tunnelKey->flags = 0;
564                 tunnelKey->tunnelId = 0;
565             }
566         } else {
567             break;
568         }
569         status = NDIS_STATUS_SUCCESS;
570
571     } while(FALSE);
572
573     return status;
574 }
575
576 #pragma warning( pop )