datapath-windows: Added specific pool tag for vxlan code
[cascardo/ovs.git] / datapath-windows / ovsext / Vxlan.c
1 /*
2  * Copyright (c) 2014 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18 #include "NetProto.h"
19 #include "Switch.h"
20 #include "Vport.h"
21 #include "Flow.h"
22 #include "Vxlan.h"
23 #include "IpHelper.h"
24 #include "Checksum.h"
25 #include "User.h"
26 #include "PacketIO.h"
27 #include "Flow.h"
28 #include "PacketParser.h"
29
30 #pragma warning( push )
31 #pragma warning( disable:4127 )
32
33
34 #ifdef OVS_DBG_MOD
35 #undef OVS_DBG_MOD
36 #endif
37 #define OVS_DBG_MOD OVS_DBG_VXLAN
38 #include "Debug.h"
39
/*
 * VXLAN helper macros and defaults.
 *
 * VXLAN_ID_IS_VALID      - true when 'vxlanID' is non-zero and fits in 24
 *                          bits. The argument is evaluated twice, so do not
 *                          pass an expression with side effects.
 * VXLAN_TUNNELID_TO_VNI  - extracts the VNI from a 64-bit tunnel id by
 *                          dropping the low 40 bits.
 * VXLAN_VNI_TO_TUNNELID  - inverse of the above: places the VNI in bits 40
 *                          and up of a 64-bit tunnel id.
 *
 * Every expansion is fully parenthesized so the macros can be used as
 * operands of any operator (including sizeof).
 */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)
#define VXLAN_TUNNELID_TO_VNI(_tID)   ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)
/*
 * Value stored directly into the IPv4 frag_off field of the outer header;
 * the IPv4 "don't fragment" flag (0x4000) as it reads on a little-endian
 * host once laid out in network byte order.
 */
#define IP_DF_NBO 0x0040
/* Outer IPv4 TTL used when the tunnel key does not carry one. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64
#define VXLAN_DEFAULT_INSTANCE_ID 1
48
49 /* Move to a header file */
50 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
51
52 /*
53  * udpDestPort: the vxlan is set as payload to a udp frame. If the destination
54  * port of an udp frame is udpDestPort, we understand it to be vxlan.
55  */
56 NTSTATUS
57 OvsInitVxlanTunnel(POVS_VPORT_ENTRY vport,
58                    UINT16 udpDestPort)
59 {
60     POVS_VXLAN_VPORT vxlanPort;
61
62     vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
63                                          OVS_VXLAN_POOL_TAG);
64     if (vxlanPort == NULL) {
65         return STATUS_INSUFFICIENT_RESOURCES;
66     }
67
68     RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
69     vxlanPort->dstPort = udpDestPort;
70     /*
71      * since we are installing the WFP filter before the port is created
72      * We need to check if it is the same number
73      * XXX should be removed later
74      */
75     ASSERT(vxlanPort->dstPort == VXLAN_UDP_PORT);
76     vport->priv = (PVOID)vxlanPort;
77
78     return STATUS_SUCCESS;
79 }
80
81
82 VOID
83 OvsCleanupVxlanTunnel(POVS_VPORT_ENTRY vport)
84 {
85     if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
86         vport->priv == NULL) {
87         return;
88     }
89
90     OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
91     vport->priv = NULL;
92 }
93
94
95 /*
96  *----------------------------------------------------------------------------
97  * OvsDoEncapVxlan
98  *     Encapsulates the packet.
99  *----------------------------------------------------------------------------
100  */
101 static __inline NDIS_STATUS
102 OvsDoEncapVxlan(PNET_BUFFER_LIST curNbl,
103                 OvsIPv4TunnelKey *tunKey,
104                 POVS_FWD_INFO fwdInfo,
105                 POVS_PACKET_HDR_INFO layers,
106                 POVS_SWITCH_CONTEXT switchContext,
107                 PNET_BUFFER_LIST *newNbl)
108 {
109     NDIS_STATUS status;
110     PNET_BUFFER curNb;
111     PMDL curMdl;
112     PUINT8 bufferStart;
113     EthHdr *ethHdr;
114     IPHdr *ipHdr;
115     UDPHdr *udpHdr;
116     VXLANHdr *vxlanHdr;
117     UINT32 headRoom = OvsGetVxlanTunHdrSize();
118     UINT32 packetLength;
119
120     /*
121      * XXX: the assumption currently is that the NBL is owned by OVS, and
122      * headroom has already been allocated as part of allocating the NBL and
123      * MDL.
124      */
125     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
126     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
127     if (layers->isTcp) {
128         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
129
130         tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
131                 TcpLargeSendNetBufferListInfo);
132         OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
133         if (tsoInfo.LsoV1Transmit.MSS) {
134             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
135             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
136                         tsoInfo.LsoV1Transmit.MSS, headRoom);
137             if (*newNbl == NULL) {
138                 OVS_LOG_ERROR("Unable to segment NBL");
139                 return NDIS_STATUS_FAILURE;
140             }
141         }
142     }
143     /* If we didn't split the packet above, make a copy now */
144     if (*newNbl == NULL) {
145         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
146                                     FALSE /*NBL info*/);
147         if (*newNbl == NULL) {
148             OVS_LOG_ERROR("Unable to copy NBL");
149             return NDIS_STATUS_FAILURE;
150         }
151     }
152
153     curNbl = *newNbl;
154     for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
155             curNb = curNb->Next) {
156         status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
157         if (status != NDIS_STATUS_SUCCESS) {
158             goto ret_error;
159         }
160
161         curMdl = NET_BUFFER_CURRENT_MDL(curNb);
162         bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
163         if (!bufferStart) {
164             status = NDIS_STATUS_RESOURCES;
165             goto ret_error;
166         }
167
168         bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
169         if (NET_BUFFER_NEXT_NB(curNb)) {
170             OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
171                           NET_BUFFER_DATA_LENGTH(curNb->Next));
172         }
173
174         /* L2 header */
175         ethHdr = (EthHdr *)bufferStart;
176         NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
177                        sizeof ethHdr->Destination + sizeof ethHdr->Source);
178         ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
179                (PCHAR)&fwdInfo->srcMacAddr);
180         ethHdr->Type = htons(ETH_TYPE_IPV4);
181
182         // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
183         // should we use those values instead? or will they end up being
184         // uninitialized;
185         /* IP header */
186         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
187
188         ipHdr->ihl = sizeof *ipHdr / 4;
189         ipHdr->version = IPV4;
190         ipHdr->tos = 0;
191         ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
192         ipHdr->id = 0;
193         ipHdr->frag_off = IP_DF_NBO;
194         ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
195         ipHdr->protocol = IPPROTO_UDP;
196         ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
197         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
198         ipHdr->saddr = fwdInfo->srcIpAddr;
199         ipHdr->daddr = fwdInfo->dstIpAddr;
200         ipHdr->check = 0;
201         ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
202
203         /* UDP header */
204         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
205         udpHdr->source = htons(tunKey->flow_hash | 32768);
206         udpHdr->dest = VXLAN_UDP_PORT_NBO;
207         udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
208                             sizeof *udpHdr + sizeof *vxlanHdr);
209         udpHdr->check = 0;
210
211         /* VXLAN header */
212         vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
213         vxlanHdr->flags1 = 0;
214         vxlanHdr->locallyReplicate = 0;
215         vxlanHdr->flags2 = 0;
216         vxlanHdr->reserved1 = 0;
217         if (tunKey->flags | OVS_TNL_F_KEY) {
218             vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
219             vxlanHdr->instanceID = 1;
220         }
221         vxlanHdr->reserved2 = 0;
222     }
223     return STATUS_SUCCESS;
224
225 ret_error:
226     OvsCompleteNBL(switchContext, *newNbl, TRUE);
227     *newNbl = NULL;
228     return status;
229 }
230
231
232 /*
233  *----------------------------------------------------------------------------
234  * OvsEncapVxlan --
235  *     Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
236  *     enqueues a callback that does encapsulatation after resolution.
237  *----------------------------------------------------------------------------
238  */
239 NDIS_STATUS
240 OvsEncapVxlan(PNET_BUFFER_LIST curNbl,
241               OvsIPv4TunnelKey *tunKey,
242               POVS_SWITCH_CONTEXT switchContext,
243               VOID *completionList,
244               POVS_PACKET_HDR_INFO layers,
245               PNET_BUFFER_LIST *newNbl)
246 {
247     NTSTATUS status;
248     OVS_FWD_INFO fwdInfo;
249     UNREFERENCED_PARAMETER(completionList);
250
251     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
252     if (status != STATUS_SUCCESS) {
253         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
254         // return NDIS_STATUS_PENDING;
255         /*
256          * XXX: Don't know if the completionList will make any sense when
257          * accessed in the callback. Make sure the caveats are known.
258          *
259          * XXX: This code will work once we are able to grab locks in the
260          * callback.
261          */
262         return NDIS_STATUS_FAILURE;
263     }
264
265     return OvsDoEncapVxlan(curNbl, tunKey, &fwdInfo, layers,
266                            switchContext, newNbl);
267 }
268
269
270 /*
271  *----------------------------------------------------------------------------
272  * OvsIpHlprCbVxlan --
273  *     Callback function for IP helper.
274  *     XXX: not used currently
275  *----------------------------------------------------------------------------
276  */
277 static VOID
278 OvsIpHlprCbVxlan(PNET_BUFFER_LIST curNbl,
279                  UINT32 inPort,
280                  OvsIPv4TunnelKey *tunKey,
281                  PVOID cbData1,
282                  PVOID cbData2,
283                  NTSTATUS result,
284                  POVS_FWD_INFO fwdInfo)
285 {
286     OVS_PACKET_HDR_INFO layers;
287     OvsFlowKey key;
288     NDIS_STATUS status;
289     UNREFERENCED_PARAMETER(inPort);
290
291     status = OvsExtractFlow(curNbl, inPort, &key, &layers, NULL);
292     if (result == STATUS_SUCCESS) {
293         status = OvsDoEncapVxlan(curNbl, tunKey, fwdInfo, &layers,
294                 (POVS_SWITCH_CONTEXT)cbData1, NULL);
295     } else {
296         status = NDIS_STATUS_FAILURE;
297     }
298
299     if (status != NDIS_STATUS_SUCCESS) {
300         // XXX: Free up the NBL;
301         return;
302     }
303
304     OvsLookupFlowOutput((POVS_SWITCH_CONTEXT)cbData1, cbData2, curNbl);
305 }
306
307 /*
308  *----------------------------------------------------------------------------
309  * OvsCalculateUDPChecksum
310  *     Calculate UDP checksum
311  *----------------------------------------------------------------------------
312  */
313 static __inline NDIS_STATUS
314 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
315                         PNET_BUFFER curNb,
316                         IPHdr *ipHdr,
317                         UDPHdr *udpHdr,
318                         UINT32 packetLength)
319 {
320     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
321     UINT16 checkSum;
322
323     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
324
325     /* Next check if UDP checksum has been calculated. */
326     if (!csumInfo.Receive.UdpChecksumSucceeded) {
327         UINT32 l4Payload;
328
329         checkSum = udpHdr->check;
330
331         l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
332         udpHdr->check = 0;
333         udpHdr->check =
334             IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
335                              (UINT32 *)&ipHdr->daddr,
336                              IPPROTO_UDP, (UINT16)l4Payload);
337         udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
338             sizeof(EthHdr) + ipHdr->ihl * 4);
339         if (checkSum != udpHdr->check) {
340             OVS_LOG_TRACE("UDP checksum incorrect.");
341             return NDIS_STATUS_INVALID_PACKET;
342         }
343     }
344
345     csumInfo.Receive.UdpChecksumSucceeded = 1;
346     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
347     return NDIS_STATUS_SUCCESS;
348 }
349
350 /*
351  *----------------------------------------------------------------------------
352  * OvsDoDecapVxlan
353  *     Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
354  *----------------------------------------------------------------------------
355  */
356 NDIS_STATUS
357 OvsDoDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
358                 PNET_BUFFER_LIST curNbl,
359                 OvsIPv4TunnelKey *tunKey,
360                 PNET_BUFFER_LIST *newNbl)
361 {
362     PNET_BUFFER curNb;
363     PMDL curMdl;
364     EthHdr *ethHdr;
365     IPHdr *ipHdr;
366     UDPHdr *udpHdr;
367     VXLANHdr *vxlanHdr;
368     UINT32 tunnelSize = 0, packetLength = 0;
369     PUINT8 bufferStart;
370     NDIS_STATUS status;
371
372     /* Check the the length of the UDP payload */
373     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
374     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
375     tunnelSize = OvsGetVxlanTunHdrSize();
376     if (packetLength <= tunnelSize) {
377         return NDIS_STATUS_INVALID_LENGTH;
378     }
379
380     /*
381      * Create a copy of the NBL so that we have all the headers in one MDL.
382      */
383     *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
384                                 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
385                                 TRUE /*copy NBL info */);
386
387     if (*newNbl == NULL) {
388         return NDIS_STATUS_RESOURCES;
389     }
390
391     /* XXX: Handle VLAN header. */
392     curNbl = *newNbl;
393     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
394     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
395     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
396                   NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
397     if (!bufferStart) {
398         status = NDIS_STATUS_RESOURCES;
399         goto dropNbl;
400     }
401
402     ethHdr = (EthHdr *)bufferStart;
403     /* XXX: Handle IP options. */
404     ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
405     tunKey->src = ipHdr->saddr;
406     tunKey->dst = ipHdr->daddr;
407     tunKey->tos = ipHdr->tos;
408     tunKey->ttl = ipHdr->ttl;
409     tunKey->pad = 0;
410     udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
411
412     /* Validate if NIC has indicated checksum failure. */
413     status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
414     if (status != NDIS_STATUS_SUCCESS) {
415         goto dropNbl;
416     }
417
418     /* Calculate and verify UDP checksum if NIC didn't do it. */
419     if (udpHdr->check != 0) {
420         status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
421         if (status != NDIS_STATUS_SUCCESS) {
422             goto dropNbl;
423         }
424     }
425
426     vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
427     if (vxlanHdr->instanceID) {
428         tunKey->flags = OVS_TNL_F_KEY;
429         tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
430     } else {
431         tunKey->flags = 0;
432         tunKey->tunnelId = 0;
433     }
434
435     /* Clear out the receive flag for the inner packet. */
436     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
437     NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
438     return NDIS_STATUS_SUCCESS;
439
440 dropNbl:
441     OvsCompleteNBL(switchContext, *newNbl, TRUE);
442     *newNbl = NULL;
443     return status;
444 }
445
446
447 NDIS_STATUS
448 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
449                    OvsIPv4TunnelKey *tunnelKey)
450 {
451     NDIS_STATUS status = NDIS_STATUS_FAILURE;
452     UDPHdr udpStorage;
453     const UDPHdr *udp;
454     VXLANHdr *VxlanHeader;
455     VXLANHdr  VxlanHeaderBuffer;
456     struct IPHdr ip_storage;
457     const struct IPHdr *nh;
458     OVS_PACKET_HDR_INFO layers;
459
460     layers.value = 0;
461
462     do {
463         nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
464         if (nh) {
465             layers.l4Offset = layers.l3Offset + nh->ihl * 4;
466         } else {
467             break;
468         }
469
470         /* make sure it's a VXLAN packet */
471         udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
472         if (udp) {
473             layers.l7Offset = layers.l4Offset + sizeof *udp;
474         } else {
475             break;
476         }
477
478         /* XXX Should be tested against the dynamic port # in the VXLAN vport */
479         ASSERT(udp->dest == RtlUshortByteSwap(VXLAN_UDP_PORT));
480
481         VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
482                                                     sizeof(*VxlanHeader),
483                                                     layers.l7Offset,
484                                                     &VxlanHeaderBuffer);
485
486         if (VxlanHeader) {
487             tunnelKey->src = nh->saddr;
488             tunnelKey->dst = nh->daddr;
489             tunnelKey->ttl = nh->ttl;
490             tunnelKey->tos = nh->tos;
491             if (VxlanHeader->instanceID) {
492                 tunnelKey->flags = OVS_TNL_F_KEY;
493                 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
494             } else {
495                 tunnelKey->flags = 0;
496                 tunnelKey->tunnelId = 0;
497             }
498         } else {
499             break;
500         }
501         status = NDIS_STATUS_SUCCESS;
502
503     } while(FALSE);
504
505     return status;
506 }
507
508 #pragma warning( pop )