2 * Copyright (c) 2014, 2016 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
27 #include "PacketParser.h"
/* Save the compiler warning state, then silence MSVC warning C4127
 * ("conditional expression is constant") for the rest of this file. */
33 #pragma warning( push )
34 #pragma warning( disable:4127 )
/* Debug module selector for this file; presumably read by the OVS_LOG_*
 * macros used below -- confirm against the debug header. */
40 #define OVS_DBG_MOD OVS_DBG_VXLAN
42 /* Helper macro to check if a VXLAN ID is valid. */
/* Valid means non-zero and no larger than 0xffffff (fits in 24 bits). */
43 #define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)
/* The two macros below convert between the 64-bit tunnel ID and the VNI by
 * shifting 40 bits: right to extract the VNI, left to build the tunnel ID. */
44 #define VXLAN_TUNNELID_TO_VNI(_tID) (UINT32)(((UINT64)(_tID)) >> 40)
45 #define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)
/* NOTE(review): by its name, the IPv4 "don't fragment" flag already in
 * network byte order -- confirm at the frag_off assignment. */
46 #define IP_DF_NBO 0x0040
/* TTL written into the outer IP header when the tunnel key's ttl is 0. */
47 #define VXLAN_DEFAULT_TTL 64
/* NOTE(review): the next two constants are not referenced in the code
 * visible here (the encap path writes a literal 1 as instanceID). */
48 #define VXLAN_MULTICAST_TTL 64
49 #define VXLAN_DEFAULT_INSTANCE_ID 1
51 /* Move to a header file */
52 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
55 *----------------------------------------------------------------------------
56 * This function verifies if the VXLAN tunnel already exists, in order to
57 * avoid sending a duplicate request to the WFP base filtering engine.
58 *----------------------------------------------------------------------------
/* Scans every bucket of switchContext->portNoHashArray and compares
 * udpPortDest with the dstPort kept in each entry's private data. */
61 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
64 for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
65 PLIST_ENTRY head, link, next;
/* Pick the list head of the bucket selected by the masked hash value. */
67 head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
68 LIST_FORALL_SAFE(head, link, next) {
69 POVS_VPORT_ENTRY vport = NULL;
70 POVS_VXLAN_VPORT vxlanPort = NULL;
/* Recover the vport entry from its portNoLink list linkage. */
71 vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
/* NOTE(review): priv is treated as VXLAN state for every entry in the
 * bucket; confirm it is checked for NULL (and for vport type) before the
 * dstPort dereference below. */
72 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
74 if ((udpPortDest == vxlanPort->dstPort)) {
75 /* The VXLAN tunnel was already created. */
86 *----------------------------------------------------------------------------
87 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
88 * also creates a WFP tunnel filter for the necessary destination port. The
89 * tunnel filter create request is passed to the tunnel filter threads that
90 * will complete the request at a later time when IRQL is lowered to PASSIVE_LEVEL.
93 * udpDestPort: VXLAN is carried as the payload of a UDP frame. A UDP frame
94 * whose destination port is udpDestPort is treated as VXLAN.
95 *----------------------------------------------------------------------------
/* Allocates the per-port VXLAN state, attaches it to 'vport', and asks for a
 * tunnel filter on the UDP destination port unless one already exists. */
98 OvsInitVxlanTunnel(PIRP irp,
99 POVS_VPORT_ENTRY vport,
101 PFNTunnelVportPendingOp callback,
104 NTSTATUS status = STATUS_SUCCESS;
105 POVS_VXLAN_VPORT vxlanPort = NULL;
/* Allocate the private state; a NULL result is reported as
 * STATUS_INSUFFICIENT_RESOURCES. */
107 vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
109 if (vxlanPort == NULL) {
110 return STATUS_INSUFFICIENT_RESOURCES;
/* Zero the structure, record the UDP destination port, and publish the
 * state through vport->priv. */
113 RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
114 vxlanPort->dstPort = udpDestPort;
115 vport->priv = (PVOID)vxlanPort;
/* Create the filter only if no existing vport already uses this port; the
 * created filter's ID is written to vxlanPort->filterID. */
117 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
118 status = OvsTunnelFilterCreate(irp,
120 &vxlanPort->filterID,
/* Presumably the "filter already exists" branch -- confirm. */
124 status = STATUS_OBJECT_NAME_EXISTS;
131 *----------------------------------------------------------------------------
132 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
133 * WFP tunnel filter previously created. The tunnel filter delete request is
134 * passed to the tunnel filter threads that will complete the request at a
135 * later time when IRQL is lowered to PASSIVE_LEVEL.
136 *----------------------------------------------------------------------------
/* Releases the VXLAN private state of 'vport' and, when a filter ID was
 * recorded, requests deletion of the associated tunnel filter. */
139 OvsCleanupVxlanTunnel(PIRP irp,
140 POVS_VPORT_ENTRY vport,
141 PFNTunnelVportPendingOp callback,
144 NTSTATUS status = STATUS_SUCCESS;
145 POVS_VXLAN_VPORT vxlanPort = NULL;
/* Nothing to clean up for non-VXLAN vports or vports without private
 * state; this is reported as success. */
147 if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
148 vport->priv == NULL) {
149 return STATUS_SUCCESS;
152 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
/* A filterID of 0 means no filter is recorded on this port, so no delete
 * request is issued. */
154 if (vxlanPort->filterID != 0) {
155 status = OvsTunnelFilterDelete(irp,
/* Free the state with the VXLAN pool tag.
 * NOTE(review): confirm vport->priv is reset to NULL afterwards so the
 * freed pointer cannot be reused. */
160 OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
169 *----------------------------------------------------------------------------
171 * Encapsulates the packet.
172 *----------------------------------------------------------------------------
/* Builds the outer Ethernet/IPv4/UDP/VXLAN headers in front of the packet.
 * The result is returned through '*newNbl', which is either the segmented
 * NBL or a partial copy of 'curNbl'. */
174 static __inline NDIS_STATUS
175 OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
176 PNET_BUFFER_LIST curNbl,
177 OvsIPv4TunnelKey *tunKey,
178 POVS_FWD_INFO fwdInfo,
179 POVS_PACKET_HDR_INFO layers,
180 POVS_SWITCH_CONTEXT switchContext,
181 PNET_BUFFER_LIST *newNbl)
191 POVS_VXLAN_VPORT vportVxlan;
/* headRoom is the size of the tunnel header that is prepended below. */
192 UINT32 headRoom = OvsGetVxlanTunHdrSize();
197 * XXX: the assumption currently is that the NBL is owned by OVS, and
198 * headroom has already been allocated as part of allocating the NBL and
201 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
202 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
/* Read the TCP MSS from the NBL; it feeds the segmentation call below. */
205 mss = OVSGetTcpMSS(curNbl);
207 OVS_LOG_TRACE("MSS %u packet len %u", mss,
210 OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
/* Segment the packet; a NULL result is a hard failure. */
211 *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
213 if (*newNbl == NULL) {
214 OVS_LOG_ERROR("Unable to segment NBL");
215 return NDIS_STATUS_FAILURE;
217 /* Clear out LSO flags after this point */
218 NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) = 0;
/* Per-port VXLAN state: supplies the IP ID counter and the UDP dst port. */
222 vportVxlan = (POVS_VXLAN_VPORT) GetOvsVportPriv(vport);
225 /* If we didn't split the packet above, make a copy now */
226 if (*newNbl == NULL) {
227 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
229 if (*newNbl == NULL) {
230 OVS_LOG_ERROR("Unable to copy NBL");
231 return NDIS_STATUS_FAILURE;
/* Apply software checksums to the copy, driven by the checksum info
 * carried on the original NBL. */
233 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
234 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
235 TcpIpChecksumNetBufferListInfo);
236 status = OvsApplySWChecksumOnNB(layers, *newNbl, &csumInfo);
238 if (status != NDIS_STATUS_SUCCESS) {
/* For every net buffer: retreat the data start by headRoom to make room
 * for the outer headers, then fill those headers in place. */
244 for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
245 curNb = curNb->Next) {
246 status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
247 if (status != NDIS_STATUS_SUCCESS) {
251 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
/* The mapping is obtained first and the MDL offset is added afterwards, so
 * a failed (NULL) mapping can be told apart from a valid address. */
252 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
254 status = NDIS_STATUS_RESOURCES;
258 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
259 if (NET_BUFFER_NEXT_NB(curNb)) {
260 OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
261 NET_BUFFER_DATA_LENGTH(curNb->Next));
/* Outer Ethernet header. Destination and source MAC are copied with one
 * move, which relies on srcMacAddr immediately following dstMacAddr in
 * fwdInfo; the ASSERT checks exactly that layout. */
265 ethHdr = (EthHdr *)bufferStart;
266 ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
267 (PCHAR)&fwdInfo->srcMacAddr);
268 NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
269 sizeof ethHdr->Destination + sizeof ethHdr->Source);
270 ethHdr->Type = htons(ETH_TYPE_IPV4);
/* Outer IPv4 header, placed directly after the Ethernet header. */
273 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
/* Header length in 32-bit words (no IP options). */
275 ipHdr->ihl = sizeof *ipHdr / 4;
/* NOTE(review): relies on IPPROTO_IPV4 having the value 4 -- confirm. */
276 ipHdr->version = IPPROTO_IPV4;
277 ipHdr->tos = tunKey->tos;
/* Total length is everything in the net buffer after the Ethernet header. */
278 ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
/* IP ID comes from a per-port counter that is advanced atomically by the
 * length of this net buffer. */
279 ipHdr->id = (uint16)atomic_add64(&vportVxlan->ipId,
280 NET_BUFFER_DATA_LENGTH(curNb));
281 ipHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
283 ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
284 ipHdr->protocol = IPPROTO_UDP;
/* The addresses written come from fwdInfo; the ASSERTs check that they
 * agree with the tunnel key (a source of 0 in the key is tolerated). */
285 ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
286 ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
287 ipHdr->saddr = fwdInfo->srcIpAddr;
288 ipHdr->daddr = fwdInfo->dstIpAddr;
/* IP header checksum computed over the header only. */
291 ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
/* Outer UDP header, directly after the IP header. */
294 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
/* NOTE(review): OR-ing with MAXINT16 forces every bit of that mask to 1,
 * so very little of flow_hash survives in the source port -- confirm this
 * is the intended way to pick the port. */
295 udpHdr->source = htons(tunKey->flow_hash | MAXINT16);
296 udpHdr->dest = htons(vportVxlan->dstPort);
/* UDP length = net buffer length without the tunnel headers, plus the UDP
 * and VXLAN headers themselves. */
297 udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
298 sizeof *udpHdr + sizeof *vxlanHdr);
/* VXLAN header, directly after the UDP header. */
302 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
303 vxlanHdr->flags1 = 0;
304 vxlanHdr->locallyReplicate = 0;
305 vxlanHdr->flags2 = 0;
306 vxlanHdr->reserved1 = 0;
/* NOTE(review): '|' makes this condition true for any flags value, so the
 * VNI and instance bit are always written. If '&' was intended, the
 * non-key case also needs vxlanID/instanceID initialized -- confirm before
 * changing. */
307 if (tunKey->flags | OVS_TNL_F_KEY) {
308 vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
309 vxlanHdr->instanceID = 1;
311 vxlanHdr->reserved2 = 0;
/* NOTE(review): returns the NTSTATUS constant from a function declared
 * NDIS_STATUS; other paths here use NDIS_STATUS_* values. */
313 return STATUS_SUCCESS;
/* Completes the NBL stored in *newNbl; presumably the failure path --
 * confirm. */
316 OvsCompleteNBL(switchContext, *newNbl, TRUE);
323 *----------------------------------------------------------------------------
325 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
326 * enqueues a callback that does encapsulation after resolution.
327 *----------------------------------------------------------------------------
/* Looks up forwarding information for the tunnel destination and, when it
 * is available, hands the packet to OvsDoEncapVxlan. */
330 OvsEncapVxlan(POVS_VPORT_ENTRY vport,
331 PNET_BUFFER_LIST curNbl,
332 OvsIPv4TunnelKey *tunKey,
333 POVS_SWITCH_CONTEXT switchContext,
334 POVS_PACKET_HDR_INFO layers,
335 PNET_BUFFER_LIST *newNbl)
338 OVS_FWD_INFO fwdInfo;
/* Resolve forwarding info for the tunnel destination address. */
340 status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
341 if (status != STATUS_SUCCESS) {
/* Lookup failed: issue an IP helper request for this tunnel key. Every
 * other argument is NULL/0, and the packet is failed below rather than
 * returned as pending. */
342 OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
343 // return NDIS_STATUS_PENDING;
345 * XXX: Don't know if the completionList will make any sense when
346 * accessed in the callback. Make sure the caveats are known.
348 * XXX: This code will work once we are able to grab locks in the
351 return NDIS_STATUS_FAILURE;
/* Forwarding info is available: build the outer headers now. */
354 return OvsDoEncapVxlan(vport, curNbl, tunKey, &fwdInfo, layers,
355 switchContext, newNbl);
359 *----------------------------------------------------------------------------
360 * OvsCalculateUDPChecksum
361 * Calculate UDP checksum
362 *----------------------------------------------------------------------------
/* Verifies the UDP checksum in software when the NBL's checksum info does
 * not already mark it as succeeded, then records success on the NBL. */
364 static __inline NDIS_STATUS
365 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
371 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
/* Fetch the checksum info currently attached to the NBL. */
374 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
376 /* Next check if UDP checksum has been calculated. */
377 if (!csumInfo.Receive.UdpChecksumSucceeded) {
/* Remember the checksum carried in the packet for the comparison below. */
380 checkSum = udpHdr->check;
/* L4 length = packet length minus the Ethernet header and the IP header
 * (ihl counts 32-bit words). */
382 l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
/* Pseudo-header checksum over the addresses, protocol and L4 length. */
385 IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
386 (UINT32 *)&ipHdr->daddr,
387 IPPROTO_UDP, (UINT16)l4Payload);
/* Recompute the checksum over the L4 bytes, starting right after the
 * Ethernet and IP headers; the result overwrites udpHdr->check. */
388 udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
389 sizeof(EthHdr) + ipHdr->ihl * 4);
/* A mismatch with the value carried in the packet rejects the packet. */
390 if (checkSum != udpHdr->check) {
391 OVS_LOG_TRACE("UDP checksum incorrect.");
392 return NDIS_STATUS_INVALID_PACKET;
/* Mark the UDP checksum as succeeded and store the info back on the NBL. */
396 csumInfo.Receive.UdpChecksumSucceeded = 1;
397 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
398 return NDIS_STATUS_SUCCESS;
402 *----------------------------------------------------------------------------
404 * Decapsulates the tunnel header in 'curNbl' and stores the result in 'tunKey'.
405 *----------------------------------------------------------------------------
/* Parses the outer Ethernet/IPv4/UDP/VXLAN headers, fills 'tunKey' from
 * them, and advances the data start past the tunnel header. */
408 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
409 PNET_BUFFER_LIST curNbl,
410 OvsIPv4TunnelKey *tunKey,
411 PNET_BUFFER_LIST *newNbl)
419 UINT32 tunnelSize = 0, packetLength = 0;
423 /* Check the length of the UDP payload */
424 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
425 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
426 tunnelSize = OvsGetVxlanTunHdrSize();
/* A packet no longer than the tunnel header has no inner payload. */
427 if (packetLength <= tunnelSize) {
428 return NDIS_STATUS_INVALID_LENGTH;
432 * Create a copy of the NBL so that we have all the headers in one MDL.
/* The copy covers the tunnel header plus OVS_DEFAULT_COPY_SIZE bytes. */
434 *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
435 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
436 TRUE /*copy NBL info */);
438 if (*newNbl == NULL) {
439 return NDIS_STATUS_RESOURCES;
442 /* XXX: Handle VLAN header. */
/* NOTE(review): the headers below are read through 'curNbl'; confirm that
 * it refers to the copy made above by this point. */
444 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
445 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
/* NOTE(review): the MDL offset is added to the mapping result in the same
 * expression, so a NULL return from MmGetSystemAddressForMdlSafe becomes a
 * non-NULL pointer whenever the offset is non-zero. The encap path adds the
 * offset in a separate statement -- confirm the NULL test here still works. */
446 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
447 NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
449 status = NDIS_STATUS_RESOURCES;
453 ethHdr = (EthHdr *)bufferStart;
454 /* XXX: Handle IP options. */
/* The IP header is taken to start right after an untagged Ethernet header. */
455 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
456 tunKey->src = ipHdr->saddr;
457 tunKey->dst = ipHdr->daddr;
458 tunKey->tos = ipHdr->tos;
459 tunKey->ttl = ipHdr->ttl;
/* The UDP header is taken to follow a fixed-size IP header (no options). */
461 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
463 /* Validate if NIC has indicated checksum failure. */
/* The second argument tells the validator whether the checksum field is 0. */
464 status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
465 if (status != NDIS_STATUS_SUCCESS) {
469 /* Calculate and verify UDP checksum if NIC didn't do it. */
470 if (udpHdr->check != 0) {
471 status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
472 if (status != NDIS_STATUS_SUCCESS) {
/* The VXLAN header follows the UDP header. A set instanceID marks the
 * tunnel key as carrying an ID, which is derived from the VNI. */
477 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
478 if (vxlanHdr->instanceID) {
479 tunKey->flags = OVS_TNL_F_KEY;
480 tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
/* Presumably the "no instance ID" branch -- confirm. */
483 tunKey->tunnelId = 0;
486 /* Clear out the receive flag for the inner packet. */
487 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
/* Drop the outer headers by advancing the data start by tunnelSize. */
488 NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
489 return NDIS_STATUS_SUCCESS;
/* Completes the copy stored in *newNbl; presumably the failure path --
 * confirm. */
492 OvsCompleteNBL(switchContext, *newNbl, TRUE);
/* Extracts the tunnel key from a VXLAN packet using the packet accessor
 * helpers and leaves the packet data itself untouched in the visible code.
 * 'status' starts as NDIS_STATUS_FAILURE and becomes NDIS_STATUS_SUCCESS
 * only after the key has been filled in. */
499 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
500 OvsIPv4TunnelKey *tunnelKey)
502 NDIS_STATUS status = NDIS_STATUS_FAILURE;
505 VXLANHdr *VxlanHeader;
506 VXLANHdr VxlanHeaderBuffer;
507 struct IPHdr ip_storage;
508 const struct IPHdr *nh;
509 OVS_PACKET_HDR_INFO layers;
/* Locate the IP header at the L3 offset, with ip_storage as scratch space.
 * NOTE(review): nh, udp and VxlanHeader are dereferenced below; confirm
 * each helper result is checked for NULL first. */
514 nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
/* L4 starts after the IP header, whose length is ihl 32-bit words. */
516 layers.l4Offset = layers.l3Offset + nh->ihl * 4;
521 /* make sure it's a VXLAN packet */
522 udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
/* The VXLAN header (L7 here) starts right after the UDP header. */
524 layers.l7Offset = layers.l4Offset + sizeof *udp;
529 VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
530 sizeof(*VxlanHeader),
/* Outer IP fields go straight into the tunnel key. */
535 tunnelKey->src = nh->saddr;
536 tunnelKey->dst = nh->daddr;
537 tunnelKey->ttl = nh->ttl;
538 tunnelKey->tos = nh->tos;
/* A set instanceID means the VNI is meaningful: flag the key and derive the
 * tunnel ID from it; the lines after that clear both fields (presumably
 * the else branch -- confirm). */
539 if (VxlanHeader->instanceID) {
540 tunnelKey->flags = OVS_TNL_F_KEY;
541 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
543 tunnelKey->flags = 0;
544 tunnelKey->tunnelId = 0;
549 status = NDIS_STATUS_SUCCESS;
/* Restore the warning state saved by the matching push near the top of the
 * file, re-enabling warning 4127 for code that follows. */
556 #pragma warning( pop )