db1909e1575c6784e7a59baacfdf896c40af4f31
[cascardo/ovs.git] / datapath-windows / ovsext / OvsVxlan.c
1 /*
2  * Copyright (c) 2014 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18 #include "OvsNetProto.h"
19 #include "OvsSwitch.h"
20 #include "OvsVport.h"
21 #include "OvsFlow.h"
22 #include "OvsVxlan.h"
23 #include "OvsIpHelper.h"
24 #include "OvsChecksum.h"
25 #include "OvsUser.h"
26 #include "OvsPacketIO.h"
27 #include "OvsFlow.h"
28 #include "OvsPacketParser.h"
29 #include "OvsChecksum.h"
30
31 #pragma warning( push )
32 #pragma warning( disable:4127 )
33
34
35 #ifdef OVS_DBG_MOD
36 #undef OVS_DBG_MOD
37 #endif
38 #define OVS_DBG_MOD OVS_DBG_VXLAN
39 #include "OvsDebug.h"
40
/* Helper macro to check if a VXLAN ID is valid: non-zero and at most 24 bits
 * wide.  Note that 'vxlanID' is evaluated twice, so it must not have side
 * effects. */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)

/*
 * Conversions between the 64-bit tunnel ID kept in the tunnel key and the
 * VNI carried in the VXLAN header: the VNI lives in the bits above bit 39
 * of the tunnel ID.  Both expansions are fully parenthesized so they can be
 * used safely inside larger expressions (e.g. as an operand of 'sizeof').
 */
#define VXLAN_TUNNELID_TO_VNI(_tID) ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)

/* Value stored directly into the IP 'frag_off' field by the encap path;
 * per its name, the "don't fragment" flag in network byte order. */
#define IP_DF_NBO 0x0040

/* TTL used for the outer IP header when the tunnel key does not supply one. */
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64

/* Value of the VXLAN header 'instanceID' field for encapsulated packets. */
#define VXLAN_DEFAULT_INSTANCE_ID 1
49
50 /* Move to a header file */
51 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
52
53 NTSTATUS
54 OvsInitVxlanTunnel(POVS_VPORT_ENTRY vport,
55                    POVS_VPORT_ADD_REQUEST addReq)
56 {
57     POVS_VXLAN_VPORT vxlanPort;
58     NTSTATUS status = STATUS_SUCCESS;
59
60     ASSERT(addReq->type == OVSWIN_VPORT_TYPE_VXLAN);
61
62     vxlanPort = OvsAllocateMemory(sizeof (*vxlanPort));
63     if (vxlanPort == NULL) {
64         status =  STATUS_INSUFFICIENT_RESOURCES;
65     } else {
66         RtlZeroMemory(vxlanPort, sizeof (*vxlanPort));
67         vxlanPort->dstPort = addReq->dstPort;
68         /*
69          * since we are installing the WFP filter before the port is created
70          * We need to check if it is the same number
71          * XXX should be removed later
72          */
73         ASSERT(vxlanPort->dstPort == VXLAN_UDP_PORT);
74         vport->priv = (PVOID)vxlanPort;
75     }
76     return status;
77 }
78
79
80 VOID
81 OvsCleanupVxlanTunnel(POVS_VPORT_ENTRY vport)
82 {
83     if (vport->ovsType != OVSWIN_VPORT_TYPE_VXLAN ||
84         vport->priv == NULL) {
85         return;
86     }
87
88     OvsFreeMemory(vport->priv);
89     vport->priv = NULL;
90 }
91
92
93 /*
94  *----------------------------------------------------------------------------
95  * OvsDoEncapVxlan
96  *     Encapsulates the packet.
97  *----------------------------------------------------------------------------
98  */
99 static __inline NDIS_STATUS
100 OvsDoEncapVxlan(PNET_BUFFER_LIST curNbl,
101                 OvsIPv4TunnelKey *tunKey,
102                 POVS_FWD_INFO fwdInfo,
103                 POVS_PACKET_HDR_INFO layers,
104                 POVS_SWITCH_CONTEXT switchContext,
105                 PNET_BUFFER_LIST *newNbl)
106 {
107     NDIS_STATUS status;
108     PNET_BUFFER curNb;
109     PMDL curMdl;
110     PUINT8 bufferStart;
111     EthHdr *ethHdr;
112     IPHdr *ipHdr;
113     UDPHdr *udpHdr;
114     VXLANHdr *vxlanHdr;
115     UINT32 headRoom = OvsGetVxlanTunHdrSize();
116     UINT32 packetLength;
117
118     /*
119      * XXX: the assumption currently is that the NBL is owned by OVS, and
120      * headroom has already been allocated as part of allocating the NBL and
121      * MDL.
122      */
123     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
124     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
125     if (layers->isTcp) {
126         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
127
128         tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
129                 TcpLargeSendNetBufferListInfo);
130         OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
131         if (tsoInfo.LsoV1Transmit.MSS) {
132             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
133             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
134                         tsoInfo.LsoV1Transmit.MSS, headRoom);
135             if (*newNbl == NULL) {
136                 OVS_LOG_ERROR("Unable to segment NBL");
137                 return NDIS_STATUS_FAILURE;
138             }
139         }
140     }
141     /* If we didn't split the packet above, make a copy now */
142     if (*newNbl == NULL) {
143         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
144                                     FALSE /*NBL info*/);
145         if (*newNbl == NULL) {
146             OVS_LOG_ERROR("Unable to copy NBL");
147             return NDIS_STATUS_FAILURE;
148         }
149     }
150
151     curNbl = *newNbl;
152     for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
153             curNb = curNb->Next) {
154         status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
155         if (status != NDIS_STATUS_SUCCESS) {
156             goto ret_error;
157         }
158
159         curMdl = NET_BUFFER_CURRENT_MDL(curNb);
160         bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
161         if (!bufferStart) {
162             status = NDIS_STATUS_RESOURCES;
163             goto ret_error;
164         }
165
166         bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
167         if (NET_BUFFER_NEXT_NB(curNb)) {
168             OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
169                           NET_BUFFER_DATA_LENGTH(curNb->Next));
170         }
171
172         /* L2 header */
173         ethHdr = (EthHdr *)bufferStart;
174         NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
175                        sizeof ethHdr->Destination + sizeof ethHdr->Source);
176         ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
177                (PCHAR)&fwdInfo->srcMacAddr);
178         ethHdr->Type = htons(ETH_TYPE_IPV4);
179
180         // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
181         // should we use those values instead? or will they end up being
182         // uninitialized;
183         /* IP header */
184         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
185
186         ipHdr->ihl = sizeof *ipHdr / 4;
187         ipHdr->version = IPV4;
188         ipHdr->tos = 0;
189         ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
190         ipHdr->id = 0;
191         ipHdr->frag_off = IP_DF_NBO;
192         ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
193         ipHdr->protocol = IPPROTO_UDP;
194         ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
195         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
196         ipHdr->saddr = fwdInfo->srcIpAddr;
197         ipHdr->daddr = fwdInfo->dstIpAddr;
198         ipHdr->check = 0;
199         ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
200
201         /* UDP header */
202         udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
203         udpHdr->source = htons(tunKey->flow_hash | 32768);
204         udpHdr->dest = VXLAN_UDP_PORT_NBO;
205         udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
206                             sizeof *udpHdr + sizeof *vxlanHdr);
207         udpHdr->check = 0;
208
209         /* VXLAN header */
210         vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
211         vxlanHdr->flags1 = 0;
212         vxlanHdr->locallyReplicate = 0;
213         vxlanHdr->flags2 = 0;
214         vxlanHdr->reserved1 = 0;
215         if (tunKey->flags | OVS_TNL_F_KEY) {
216             vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
217             vxlanHdr->instanceID = 1;
218         }
219         vxlanHdr->reserved2 = 0;
220     }
221     return STATUS_SUCCESS;
222
223 ret_error:
224     OvsCompleteNBL(switchContext, *newNbl, TRUE);
225     *newNbl = NULL;
226     return status;
227 }
228
229
230 /*
231  *----------------------------------------------------------------------------
232  * OvsEncapVxlan --
233  *     Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
234  *     enqueues a callback that does encapsulatation after resolution.
235  *----------------------------------------------------------------------------
236  */
237 NDIS_STATUS
238 OvsEncapVxlan(PNET_BUFFER_LIST curNbl,
239               OvsIPv4TunnelKey *tunKey,
240               POVS_SWITCH_CONTEXT switchContext,
241               VOID *completionList,
242               POVS_PACKET_HDR_INFO layers,
243               PNET_BUFFER_LIST *newNbl)
244 {
245     NTSTATUS status;
246     OVS_FWD_INFO fwdInfo;
247     UNREFERENCED_PARAMETER(completionList);
248
249     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
250     if (status != STATUS_SUCCESS) {
251         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
252         // return NDIS_STATUS_PENDING;
253         /*
254          * XXX: Don't know if the completionList will make any sense when
255          * accessed in the callback. Make sure the caveats are known.
256          *
257          * XXX: This code will work once we are able to grab locks in the
258          * callback.
259          */
260         return NDIS_STATUS_FAILURE;
261     }
262
263     return OvsDoEncapVxlan(curNbl, tunKey, &fwdInfo, layers,
264                            switchContext, newNbl);
265 }
266
267
268 /*
269  *----------------------------------------------------------------------------
270  * OvsIpHlprCbVxlan --
271  *     Callback function for IP helper.
272  *     XXX: not used currently
273  *----------------------------------------------------------------------------
274  */
275 static VOID
276 OvsIpHlprCbVxlan(PNET_BUFFER_LIST curNbl,
277                  UINT32 inPort,
278                  OvsIPv4TunnelKey *tunKey,
279                  PVOID cbData1,
280                  PVOID cbData2,
281                  NTSTATUS result,
282                  POVS_FWD_INFO fwdInfo)
283 {
284     OVS_PACKET_HDR_INFO layers;
285     OvsFlowKey key;
286     NDIS_STATUS status;
287     UNREFERENCED_PARAMETER(inPort);
288
289     status = OvsExtractFlow(curNbl, inPort, &key, &layers, NULL);
290     if (result == STATUS_SUCCESS) {
291         status = OvsDoEncapVxlan(curNbl, tunKey, fwdInfo, &layers,
292                 (POVS_SWITCH_CONTEXT)cbData1, NULL);
293     } else {
294         status = NDIS_STATUS_FAILURE;
295     }
296
297     if (status != NDIS_STATUS_SUCCESS) {
298         // XXX: Free up the NBL;
299         return;
300     }
301
302     OvsLookupFlowOutput((POVS_SWITCH_CONTEXT)cbData1, cbData2, curNbl);
303 }
304
305 /*
306  *----------------------------------------------------------------------------
307  * OvsCalculateUDPChecksum
308  *     Calculate UDP checksum
309  *----------------------------------------------------------------------------
310  */
311 static __inline NDIS_STATUS
312 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
313                         PNET_BUFFER curNb,
314                         IPHdr *ipHdr,
315                         UDPHdr *udpHdr,
316                         UINT32 packetLength)
317 {
318     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
319     UINT16 checkSum;
320
321     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
322
323     /* Next check if UDP checksum has been calculated. */
324     if (!csumInfo.Receive.UdpChecksumSucceeded) {
325         UINT32 l4Payload;
326
327         checkSum = udpHdr->check;
328
329         l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
330         udpHdr->check = 0;
331         udpHdr->check =
332             IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
333                              (UINT32 *)&ipHdr->daddr,
334                              IPPROTO_UDP, (UINT16)l4Payload);
335         udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
336             sizeof(EthHdr) + ipHdr->ihl * 4);
337         if (checkSum != udpHdr->check) {
338             OVS_LOG_TRACE("UDP checksum incorrect.");
339             return NDIS_STATUS_INVALID_PACKET;
340         }
341     }
342
343     csumInfo.Receive.UdpChecksumSucceeded = 1;
344     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
345     return NDIS_STATUS_SUCCESS;
346 }
347
348 /*
349  *----------------------------------------------------------------------------
350  * OvsDoDecapVxlan
351  *     Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
352  *----------------------------------------------------------------------------
353  */
354 NDIS_STATUS
355 OvsDoDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
356                 PNET_BUFFER_LIST curNbl,
357                 OvsIPv4TunnelKey *tunKey,
358                 PNET_BUFFER_LIST *newNbl)
359 {
360     PNET_BUFFER curNb;
361     PMDL curMdl;
362     EthHdr *ethHdr;
363     IPHdr *ipHdr;
364     UDPHdr *udpHdr;
365     VXLANHdr *vxlanHdr;
366     UINT32 tunnelSize = 0, packetLength = 0;
367     PUINT8 bufferStart;
368     NDIS_STATUS status;
369
370     /* Check the the length of the UDP payload */
371     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
372     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
373     tunnelSize = OvsGetVxlanTunHdrSize();
374     if (packetLength <= tunnelSize) {
375         return NDIS_STATUS_INVALID_LENGTH;
376     }
377
378     /*
379      * Create a copy of the NBL so that we have all the headers in one MDL.
380      */
381     *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
382                                 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
383                                 TRUE /*copy NBL info */);
384
385     if (*newNbl == NULL) {
386         return NDIS_STATUS_RESOURCES;
387     }
388
389     /* XXX: Handle VLAN header. */
390     curNbl = *newNbl;
391     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
392     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
393     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
394                   NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
395     if (!bufferStart) {
396         status = NDIS_STATUS_RESOURCES;
397         goto dropNbl;
398     }
399
400     ethHdr = (EthHdr *)bufferStart;
401     /* XXX: Handle IP options. */
402     ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
403     tunKey->src = ipHdr->saddr;
404     tunKey->dst = ipHdr->daddr;
405     tunKey->tos = ipHdr->tos;
406     tunKey->ttl = ipHdr->ttl;
407     tunKey->pad = 0;
408     udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
409
410     /* Validate if NIC has indicated checksum failure. */
411     status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
412     if (status != NDIS_STATUS_SUCCESS) {
413         goto dropNbl;
414     }
415
416     /* Calculate and verify UDP checksum if NIC didn't do it. */
417     if (udpHdr->check != 0) {
418         status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
419         if (status != NDIS_STATUS_SUCCESS) {
420             goto dropNbl;
421         }
422     }
423
424     vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
425     if (vxlanHdr->instanceID) {
426         tunKey->flags = OVS_TNL_F_KEY;
427         tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
428     } else {
429         tunKey->flags = 0;
430         tunKey->tunnelId = 0;
431     }
432
433     /* Clear out the receive flag for the inner packet. */
434     NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
435     NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
436     return NDIS_STATUS_SUCCESS;
437
438 dropNbl:
439     OvsCompleteNBL(switchContext, *newNbl, TRUE);
440     *newNbl = NULL;
441     return status;
442 }
443
444
445 NDIS_STATUS
446 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
447                    OvsIPv4TunnelKey *tunnelKey)
448 {
449     NDIS_STATUS status = NDIS_STATUS_FAILURE;
450     UDPHdr udpStorage;
451     const UDPHdr *udp;
452     VXLANHdr *VxlanHeader;
453     VXLANHdr  VxlanHeaderBuffer;
454     struct IPHdr ip_storage;
455     const struct IPHdr *nh;
456     OVS_PACKET_HDR_INFO layers;
457
458     layers.value = 0;
459
460     do {
461         nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
462         if (nh) {
463             layers.l4Offset = layers.l3Offset + nh->ihl * 4;
464         } else {
465             break;
466         }
467
468         /* make sure it's a VXLAN packet */
469         udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
470         if (udp) {
471             layers.l7Offset = layers.l4Offset + sizeof *udp;
472         } else {
473             break;
474         }
475
476         /* XXX Should be tested against the dynamic port # in the VXLAN vport */
477         ASSERT(udp->dest == RtlUshortByteSwap(VXLAN_UDP_PORT));
478
479         VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
480                                                     sizeof(*VxlanHeader),
481                                                     layers.l7Offset,
482                                                     &VxlanHeaderBuffer);
483
484         if (VxlanHeader) {
485             tunnelKey->src = nh->saddr;
486             tunnelKey->dst = nh->daddr;
487             tunnelKey->ttl = nh->ttl;
488             tunnelKey->tos = nh->tos;
489             if (VxlanHeader->instanceID) {
490                 tunnelKey->flags = OVS_TNL_F_KEY;
491                 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
492             } else {
493                 tunnelKey->flags = 0;
494                 tunnelKey->tunnelId = 0;
495             }
496         } else {
497             break;
498         }
499         status = NDIS_STATUS_SUCCESS;
500
501     } while(FALSE);
502
503     return status;
504 }
505
506 #pragma warning( pop )