2 * Copyright (c) 2014 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
28 #include "PacketParser.h"
30 #pragma warning( push )
31 #pragma warning( disable:4127 )
37 #define OVS_DBG_MOD OVS_DBG_VXLAN
/*
 * VXLAN helper macros.
 *
 * VXLAN_ID_IS_VALID      - true when the id is in the range 1..0xffffff.
 *                          Evaluates its argument twice; do not pass an
 *                          expression with side effects.
 * VXLAN_TUNNELID_TO_VNI  - extracts the VNI from a 64-bit tunnel id by
 *                          shifting it right by 40 bits. The whole expansion
 *                          is parenthesized so that it behaves as a single
 *                          primary expression (e.g. under sizeof).
 * VXLAN_VNI_TO_TUNNELID  - inverse of the above: places the VNI in the upper
 *                          bits of a 64-bit tunnel id.
 * IP_DF_NBO              - value stored in the IPv4 frag_off field of the
 *                          outer header (presumably the Don't Fragment flag
 *                          in network byte order on a little-endian host).
 * VXLAN_DEFAULT_TTL      - outer IPv4 TTL used when the tunnel key carries 0.
 */
#define VXLAN_ID_IS_VALID(vxlanID) (0 < (vxlanID) && (vxlanID) <= 0xffffff)
#define VXLAN_TUNNELID_TO_VNI(_tID) ((UINT32)(((UINT64)(_tID)) >> 40))
#define VXLAN_VNI_TO_TUNNELID(_vni) (((UINT64)(_vni)) << 40)
#define IP_DF_NBO 0x0040
#define VXLAN_DEFAULT_TTL 64
#define VXLAN_MULTICAST_TTL 64
#define VXLAN_DEFAULT_INSTANCE_ID 1
49 /* Move to a header file */
50 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
53 *----------------------------------------------------------------------------
54 * This function verifies if the VXLAN tunnel already exists, in order to
55 * avoid sending a duplicate request to the WFP base filtering engine.
56 *----------------------------------------------------------------------------
59 OvsIsTunnelFilterCreated(POVS_SWITCH_CONTEXT switchContext,
62 for (UINT hash = 0; hash < OVS_MAX_VPORT_ARRAY_SIZE; hash++) {
63 PLIST_ENTRY head, link, next;
65 head = &(switchContext->portNoHashArray[hash & OVS_VPORT_MASK]);
66 LIST_FORALL_SAFE(head, link, next) {
67 POVS_VPORT_ENTRY vport = NULL;
68 POVS_VXLAN_VPORT vxlanPort = NULL;
69 vport = CONTAINING_RECORD(link, OVS_VPORT_ENTRY, portNoLink);
70 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
72 if ((udpPortDest == vxlanPort->dstPort)) {
73 /* The VXLAN tunnel was already created. */
84 *----------------------------------------------------------------------------
85 * This function allocates and initializes the OVS_VXLAN_VPORT. The function
86 * also creates a WFP tunnel filter for the necessary destination port. The
87 * tunnel filter create request is passed to the tunnel filter threads that
88 * will complete the request at a later time when IRQL is lowered to
91 * udpDestPort: the vxlan is set as payload to a udp frame. If the destination
92 * port of an udp frame is udpDestPort, we understand it to be vxlan.
93 *----------------------------------------------------------------------------
96 OvsInitVxlanTunnel(PIRP irp,
97 POVS_VPORT_ENTRY vport,
99 PFNTunnelVportPendingOp callback,
102 NTSTATUS status = STATUS_SUCCESS;
103 POVS_VXLAN_VPORT vxlanPort = NULL;
105 vxlanPort = OvsAllocateMemoryWithTag(sizeof (*vxlanPort),
107 if (vxlanPort == NULL) {
108 return STATUS_INSUFFICIENT_RESOURCES;
111 RtlZeroMemory(vxlanPort, sizeof(*vxlanPort));
112 vxlanPort->dstPort = udpDestPort;
113 vport->priv = (PVOID)vxlanPort;
115 if (!OvsIsTunnelFilterCreated(gOvsSwitchContext, udpDestPort)) {
116 status = OvsTunelFilterCreate(irp,
118 &vxlanPort->filterID,
122 status = STATUS_OBJECT_NAME_EXISTS;
129 *----------------------------------------------------------------------------
130 * This function releases the OVS_VXLAN_VPORT. The function also deletes the
131 * WFP tunnel filter previously created. The tunnel filter delete request is
132 * passed to the tunnel filter threads that will complete the request at a
133 * later time when IRQL is lowered to PASSIVE_LEVEL.
134 *----------------------------------------------------------------------------
137 OvsCleanupVxlanTunnel(PIRP irp,
138 POVS_VPORT_ENTRY vport,
139 PFNTunnelVportPendingOp callback,
142 NTSTATUS status = STATUS_SUCCESS;
143 POVS_VXLAN_VPORT vxlanPort = NULL;
145 if (vport->ovsType != OVS_VPORT_TYPE_VXLAN ||
146 vport->priv == NULL) {
147 return STATUS_SUCCESS;
150 vxlanPort = (POVS_VXLAN_VPORT)vport->priv;
152 if (vxlanPort->filterID != 0) {
153 status = OvsTunelFilterDelete(irp,
159 OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
167 *----------------------------------------------------------------------------
169 * Encapsulates the packet.
170 *----------------------------------------------------------------------------
172 static __inline NDIS_STATUS
173 OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
174 PNET_BUFFER_LIST curNbl,
175 OvsIPv4TunnelKey *tunKey,
176 POVS_FWD_INFO fwdInfo,
177 POVS_PACKET_HDR_INFO layers,
178 POVS_SWITCH_CONTEXT switchContext,
179 PNET_BUFFER_LIST *newNbl)
189 POVS_VXLAN_VPORT vportVxlan;
190 UINT32 headRoom = OvsGetVxlanTunHdrSize();
194 * XXX: the assumption currently is that the NBL is owned by OVS, and
195 * headroom has already been allocated as part of allocating the NBL and
198 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
199 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
201 NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
203 tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
204 TcpLargeSendNetBufferListInfo);
205 OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
206 if (tsoInfo.LsoV1Transmit.MSS) {
207 OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
208 *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
209 tsoInfo.LsoV1Transmit.MSS, headRoom);
210 if (*newNbl == NULL) {
211 OVS_LOG_ERROR("Unable to segment NBL");
212 return NDIS_STATUS_FAILURE;
217 vportVxlan = (POVS_VXLAN_VPORT) GetOvsVportPriv(vport);
220 /* If we didn't split the packet above, make a copy now */
221 if (*newNbl == NULL) {
222 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
224 if (*newNbl == NULL) {
225 OVS_LOG_ERROR("Unable to copy NBL");
226 return NDIS_STATUS_FAILURE;
231 for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
232 curNb = curNb->Next) {
233 status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
234 if (status != NDIS_STATUS_SUCCESS) {
238 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
239 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
241 status = NDIS_STATUS_RESOURCES;
245 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
246 if (NET_BUFFER_NEXT_NB(curNb)) {
247 OVS_LOG_TRACE("nb length %u next %u", NET_BUFFER_DATA_LENGTH(curNb),
248 NET_BUFFER_DATA_LENGTH(curNb->Next));
252 ethHdr = (EthHdr *)bufferStart;
253 ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
254 (PCHAR)&fwdInfo->srcMacAddr);
255 NdisMoveMemory(ethHdr->Destination, fwdInfo->dstMacAddr,
256 sizeof ethHdr->Destination + sizeof ethHdr->Source);
257 ethHdr->Type = htons(ETH_TYPE_IPV4);
259 // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
260 // should we use those values instead? or will they end up being
263 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
265 ipHdr->ihl = sizeof *ipHdr / 4;
266 ipHdr->version = IPV4;
268 ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) - sizeof *ethHdr);
270 ipHdr->frag_off = IP_DF_NBO;
271 ipHdr->ttl = tunKey->ttl ? tunKey->ttl : VXLAN_DEFAULT_TTL;
272 ipHdr->protocol = IPPROTO_UDP;
273 ASSERT(tunKey->dst == fwdInfo->dstIpAddr);
274 ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
275 ipHdr->saddr = fwdInfo->srcIpAddr;
276 ipHdr->daddr = fwdInfo->dstIpAddr;
278 ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
281 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
282 udpHdr->source = htons(tunKey->flow_hash | 32768);
283 udpHdr->dest = htons(vportVxlan->dstPort);
284 udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
285 sizeof *udpHdr + sizeof *vxlanHdr);
289 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
290 vxlanHdr->flags1 = 0;
291 vxlanHdr->locallyReplicate = 0;
292 vxlanHdr->flags2 = 0;
293 vxlanHdr->reserved1 = 0;
294 if (tunKey->flags | OVS_TNL_F_KEY) {
295 vxlanHdr->vxlanID = VXLAN_TUNNELID_TO_VNI(tunKey->tunnelId);
296 vxlanHdr->instanceID = 1;
298 vxlanHdr->reserved2 = 0;
300 return STATUS_SUCCESS;
303 OvsCompleteNBL(switchContext, *newNbl, TRUE);
310 *----------------------------------------------------------------------------
312 * Encapsulates the packet if L2/L3 for destination resolves. Otherwise,
313 * enqueues a callback that does encapsulatation after resolution.
314 *----------------------------------------------------------------------------
317 OvsEncapVxlan(POVS_VPORT_ENTRY vport,
318 PNET_BUFFER_LIST curNbl,
319 OvsIPv4TunnelKey *tunKey,
320 POVS_SWITCH_CONTEXT switchContext,
321 POVS_PACKET_HDR_INFO layers,
322 PNET_BUFFER_LIST *newNbl)
325 OVS_FWD_INFO fwdInfo;
327 status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
328 if (status != STATUS_SUCCESS) {
329 OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
330 // return NDIS_STATUS_PENDING;
332 * XXX: Don't know if the completionList will make any sense when
333 * accessed in the callback. Make sure the caveats are known.
335 * XXX: This code will work once we are able to grab locks in the
338 return NDIS_STATUS_FAILURE;
341 return OvsDoEncapVxlan(vport, curNbl, tunKey, &fwdInfo, layers,
342 switchContext, newNbl);
346 *----------------------------------------------------------------------------
347 * OvsCalculateUDPChecksum
348 * Calculate UDP checksum
349 *----------------------------------------------------------------------------
351 static __inline NDIS_STATUS
352 OvsCalculateUDPChecksum(PNET_BUFFER_LIST curNbl,
358 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
361 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo);
363 /* Next check if UDP checksum has been calculated. */
364 if (!csumInfo.Receive.UdpChecksumSucceeded) {
367 checkSum = udpHdr->check;
369 l4Payload = packetLength - sizeof(EthHdr) - ipHdr->ihl * 4;
372 IPPseudoChecksum((UINT32 *)&ipHdr->saddr,
373 (UINT32 *)&ipHdr->daddr,
374 IPPROTO_UDP, (UINT16)l4Payload);
375 udpHdr->check = CalculateChecksumNB(curNb, (UINT16)l4Payload,
376 sizeof(EthHdr) + ipHdr->ihl * 4);
377 if (checkSum != udpHdr->check) {
378 OVS_LOG_TRACE("UDP checksum incorrect.");
379 return NDIS_STATUS_INVALID_PACKET;
383 csumInfo.Receive.UdpChecksumSucceeded = 1;
384 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
385 return NDIS_STATUS_SUCCESS;
389 *----------------------------------------------------------------------------
391 * Decapsulates to tunnel header in 'curNbl' and puts into 'tunKey'.
392 *----------------------------------------------------------------------------
395 OvsDecapVxlan(POVS_SWITCH_CONTEXT switchContext,
396 PNET_BUFFER_LIST curNbl,
397 OvsIPv4TunnelKey *tunKey,
398 PNET_BUFFER_LIST *newNbl)
406 UINT32 tunnelSize = 0, packetLength = 0;
410 /* Check the the length of the UDP payload */
411 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
412 packetLength = NET_BUFFER_DATA_LENGTH(curNb);
413 tunnelSize = OvsGetVxlanTunHdrSize();
414 if (packetLength <= tunnelSize) {
415 return NDIS_STATUS_INVALID_LENGTH;
419 * Create a copy of the NBL so that we have all the headers in one MDL.
421 *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
422 tunnelSize + OVS_DEFAULT_COPY_SIZE, 0,
423 TRUE /*copy NBL info */);
425 if (*newNbl == NULL) {
426 return NDIS_STATUS_RESOURCES;
429 /* XXX: Handle VLAN header. */
431 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
432 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
433 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority) +
434 NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
436 status = NDIS_STATUS_RESOURCES;
440 ethHdr = (EthHdr *)bufferStart;
441 /* XXX: Handle IP options. */
442 ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
443 tunKey->src = ipHdr->saddr;
444 tunKey->dst = ipHdr->daddr;
445 tunKey->tos = ipHdr->tos;
446 tunKey->ttl = ipHdr->ttl;
448 udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
450 /* Validate if NIC has indicated checksum failure. */
451 status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
452 if (status != NDIS_STATUS_SUCCESS) {
456 /* Calculate and verify UDP checksum if NIC didn't do it. */
457 if (udpHdr->check != 0) {
458 status = OvsCalculateUDPChecksum(curNbl, curNb, ipHdr, udpHdr, packetLength);
459 if (status != NDIS_STATUS_SUCCESS) {
464 vxlanHdr = (VXLANHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
465 if (vxlanHdr->instanceID) {
466 tunKey->flags = OVS_TNL_F_KEY;
467 tunKey->tunnelId = VXLAN_VNI_TO_TUNNELID(vxlanHdr->vxlanID);
470 tunKey->tunnelId = 0;
473 /* Clear out the receive flag for the inner packet. */
474 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
475 NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
476 return NDIS_STATUS_SUCCESS;
479 OvsCompleteNBL(switchContext, *newNbl, TRUE);
486 OvsSlowPathDecapVxlan(const PNET_BUFFER_LIST packet,
487 OvsIPv4TunnelKey *tunnelKey)
489 NDIS_STATUS status = NDIS_STATUS_FAILURE;
492 VXLANHdr *VxlanHeader;
493 VXLANHdr VxlanHeaderBuffer;
494 struct IPHdr ip_storage;
495 const struct IPHdr *nh;
496 OVS_PACKET_HDR_INFO layers;
501 nh = OvsGetIp(packet, layers.l3Offset, &ip_storage);
503 layers.l4Offset = layers.l3Offset + nh->ihl * 4;
508 /* make sure it's a VXLAN packet */
509 udp = OvsGetUdp(packet, layers.l4Offset, &udpStorage);
511 layers.l7Offset = layers.l4Offset + sizeof *udp;
516 VxlanHeader = (VXLANHdr *)OvsGetPacketBytes(packet,
517 sizeof(*VxlanHeader),
522 tunnelKey->src = nh->saddr;
523 tunnelKey->dst = nh->daddr;
524 tunnelKey->ttl = nh->ttl;
525 tunnelKey->tos = nh->tos;
526 if (VxlanHeader->instanceID) {
527 tunnelKey->flags = OVS_TNL_F_KEY;
528 tunnelKey->tunnelId = VXLAN_VNI_TO_TUNNELID(VxlanHeader->vxlanID);
530 tunnelKey->flags = 0;
531 tunnelKey->tunnelId = 0;
536 status = NDIS_STATUS_SUCCESS;
543 #pragma warning( pop )