2 * Copyright (c) 2015 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
28 #include "PacketParser.h"
/* Selects OVS_DBG_STT as this file's debug module; presumably read by the OVS_LOG_* macros -- confirm. */
35 #define OVS_DBG_MOD OVS_DBG_STT
/*
 * Prototype for OvsDoEncapStt, declared ahead of its definition so that
 * OvsEncapStt can call it.
 * NOTE(review): 'const POVS_FWD_INFO fwdInfo' makes the pointer itself const,
 * not the structure it points to; tunKey, by contrast, is pointer-to-const.
 */
39 OvsDoEncapStt(POVS_VPORT_ENTRY vport, PNET_BUFFER_LIST curNbl,
40 const OvsIPv4TunnelKey *tunKey,
41 const POVS_FWD_INFO fwdInfo,
42 POVS_PACKET_HDR_INFO layers,
43 POVS_SWITCH_CONTEXT switchContext,
44 PNET_BUFFER_LIST *newNbl);
47 * --------------------------------------------------------------------------
49 * Initialize STT tunnel module.
50 * --------------------------------------------------------------------------
/*
 * Allocates the per-vport STT state (OVS_STT_VPORT), zeroes it, records
 * tcpDestPort as the destination port and attaches it to vport->priv.
 *
 * Returns STATUS_INSUFFICIENT_RESOURCES on the out-of-memory path (after
 * logging an error) and STATUS_SUCCESS once vport->priv has been set.
 */
53 OvsInitSttTunnel(POVS_VPORT_ENTRY vport,
56 POVS_STT_VPORT sttPort;
58 sttPort = (POVS_STT_VPORT) OvsAllocateMemoryWithTag(sizeof(*sttPort),
/* Out-of-memory path: log and report the failure to the caller. */
61 OVS_LOG_ERROR("Insufficient memory, can't allocate STT_VPORT");
62 return STATUS_INSUFFICIENT_RESOURCES;
/* Start from an all-zero state so every field not set below is 0. */
65 RtlZeroMemory(sttPort, sizeof(*sttPort));
66 sttPort->dstPort = tcpDestPort;
/* The vport keeps the STT state as its opaque private pointer. */
67 vport->priv = (PVOID) sttPort;
68 return STATUS_SUCCESS;
72 * --------------------------------------------------------------------------
73 * OvsCleanupSttTunnel --
74 * Cleanup STT Tunnel module.
75 * --------------------------------------------------------------------------
/*
 * Releases the STT private data attached to vport.
 * The guard below singles out vports that are not of type
 * OVS_VPORT_TYPE_STT or that carry no private data.
 * NOTE(review): presumably those cases return without freeing -- confirm.
 */
78 OvsCleanupSttTunnel(POVS_VPORT_ENTRY vport)
80 if (vport->ovsType != OVS_VPORT_TYPE_STT ||
81 vport->priv == NULL) {
/* Free vport->priv under the STT pool tag. */
85 OvsFreeMemoryWithTag(vport->priv, OVS_STT_POOL_TAG);
90 * --------------------------------------------------------------------------
92 * Encapsulates a packet with an STT header.
93 * --------------------------------------------------------------------------
/*
 * Entry point for STT encapsulation.
 *
 * Looks up forwarding information for the tunnel destination (tunKey->dst).
 * If the lookup fails, a request is handed to OvsFwdIPHelperRequest and
 * NDIS_STATUS_FAILURE is returned. Otherwise the work is delegated to
 * OvsDoEncapStt, whose status is stored in 'status'.
 */
96 OvsEncapStt(POVS_VPORT_ENTRY vport,
97 PNET_BUFFER_LIST curNbl,
98 OvsIPv4TunnelKey *tunKey,
99 POVS_SWITCH_CONTEXT switchContext,
100 POVS_PACKET_HDR_INFO layers,
101 PNET_BUFFER_LIST *newNbl)
103 OVS_FWD_INFO fwdInfo;
/*
 * NOTE(review): switchContext is marked unreferenced here but is passed to
 * OvsDoEncapStt below, so the marker looks stale.
 */
106 UNREFERENCED_PARAMETER(switchContext);
107 status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
108 if (status != STATUS_SUCCESS) {
109 OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
111 * XXX This case where the ARP table is not populated is
112 * currently not handled
114 return NDIS_STATUS_FAILURE;
/* Forwarding info is available: build the encapsulated packet. */
117 status = OvsDoEncapStt(vport, curNbl, tunKey, &fwdInfo, layers,
118 switchContext, newNbl);
123 * --------------------------------------------------------------------------
125 * Internal utility function which actually does the STT encap.
126 * --------------------------------------------------------------------------
/*
 * Builds the STT-encapsulated copy of curNbl.
 *
 * Visible steps:
 *   - fail with NDIS_STATUS_FAILURE if the NBL requests large-send offload;
 *   - copy the NBL into *newNbl via OvsPartialCopyNBL (headRoom is passed
 *     as one of its arguments);
 *   - compute the inner IPv4 header checksum in software when its offload
 *     was requested, and note whether the inner L4 checksum is to be
 *     flagged as verified or partial in the STT header;
 *   - fail if the inner frame does not fit in the external MTU minus the
 *     tunnel header size;
 *   - retreat the data start by headRoom and fill in the outer Ethernet,
 *     IPv4, TCP and STT headers;
 *   - request IP and TCP checksum offload for the outer headers.
 *
 * Returns STATUS_SUCCESS once the headers are written; the earlier exits
 * return or set NDIS_STATUS_FAILURE / NDIS_STATUS_RESOURCES.
 */
129 OvsDoEncapStt(POVS_VPORT_ENTRY vport,
130 PNET_BUFFER_LIST curNbl,
131 const OvsIPv4TunnelKey *tunKey,
132 const POVS_FWD_INFO fwdInfo,
133 POVS_PACKET_HDR_INFO layers,
134 POVS_SWITCH_CONTEXT switchContext,
135 PNET_BUFFER_LIST *newNbl)
137 NDIS_STATUS status = NDIS_STATUS_SUCCESS;
145 UINT32 innerFrameLen, ipTotalLen;
146 POVS_STT_VPORT vportStt;
/* Total size of the tunnel headers that will be prepended. */
147 UINT32 headRoom = OvsGetSttTunHdrSize();
/*
 * NOTE(review): layers is marked unreferenced here but is dereferenced
 * further down (isIPv4, l3Offset, isUdp), so the marker looks stale.
 */
151 UNREFERENCED_PARAMETER(layers);
153 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
155 /* Verify if inner checksum is verified */
156 BOOLEAN innerChecksumVerified = FALSE;
157 BOOLEAN innerPartialChecksum = FALSE;
/* A non-zero MSS means large-send offload was requested: not supported. */
160 NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
162 lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
163 TcpLargeSendNetBufferListInfo);
164 if (lsoInfo.LsoV1Transmit.MSS) {
165 /* XXX We don't handle LSO yet */
166 OVS_LOG_ERROR("LSO on STT is not supported");
167 return NDIS_STATUS_FAILURE;
/* Per-vport STT state: supplies dstPort and the ipId / ackNo counters. */
171 vportStt = (POVS_STT_VPORT) GetOvsVportPriv(vport);
/* Checksum offload flags requested for the original (inner) packet. */
174 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
175 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
176 TcpIpChecksumNetBufferListInfo);
/*
 * Work on a copy of the NBL.
 * NOTE(review): headRoom is presumably reserved in front of the copied
 * data by OvsPartialCopyNBL -- confirm against its definition.
 */
177 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
178 FALSE /*copy NblInfo*/);
179 if (*newNbl == NULL) {
180 OVS_LOG_ERROR("Unable to copy NBL");
181 return NDIS_STATUS_FAILURE;
/* Locate the start of the packet data inside the copy. */
184 curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
185 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
186 bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
188 bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
/*
 * IPv4 header checksum offload was requested for the inner packet:
 * compute it here in software over the inner IP header.
 */
190 if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) {
191 IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
192 ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
/*
 * L4 checksum offload not requested: the inner checksum is flagged as
 * verified; the other branch flags it as partial.
 */
195 if(!csumInfo.Transmit.TcpChecksum) {
196 innerChecksumVerified = TRUE;
198 innerPartialChecksum = TRUE;
200 } else if (layers->isUdp) {
201 if(!csumInfo.Transmit.UdpChecksum) {
202 innerChecksumVerified = TRUE;
204 innerPartialChecksum = TRUE;
209 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
210 /* NB Chain should be split before */
211 ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
/* Length of the frame being encapsulated, before headers are added. */
213 innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
215 * External port can't be removed as we hold the dispatch lock
216 * We also check if the external port was removed beforecalling
217 * port encapsulation functions
/*
 * NOTE(review): if OvsGetExternalMtu() returns an unsigned value smaller
 * than headRoom, the subtraction would wrap -- confirm its return type
 * and range.
 */
219 if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) {
220 OVS_LOG_ERROR("Packet too large (size %d, mtu %d). Can't encapsulate",
221 innerFrameLen, OvsGetExternalMtu(switchContext));
222 status = NDIS_STATUS_FAILURE;
/* Move the data start back by headRoom bytes to make room for headers. */
226 status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
227 if (status != NDIS_STATUS_SUCCESS) {
228 ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
229 OVS_LOG_ERROR("Unable to NdisRetreatNetBufferDataStart(headroom)");
234 * Make sure that the headroom for the tunnel header is continguous in
237 curMdl = NET_BUFFER_CURRENT_MDL(curNb);
238 ASSERT((int) (MmGetMdlByteCount(curMdl) - NET_BUFFER_CURRENT_MDL_OFFSET(curNb))
241 buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
243 ASSERT(!"MmGetSystemAddressForMdlSafe failed");
244 OVS_LOG_ERROR("MmGetSystemAddressForMdlSafe failed");
245 status = NDIS_STATUS_RESOURCES;
/* Outer headers are laid out back to back: Ethernet, IPv4, TCP, STT. */
249 buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
250 outerEthHdr = (EthHdr *)buf;
251 outerIpHdr = (IPHdr *) (outerEthHdr + 1);
252 outerTcpHdr = (TCPHdr *) (outerIpHdr + 1);
253 sttHdr = (SttHdr *) (outerTcpHdr + 1);
/*
 * dstMacAddr must be immediately followed by srcMacAddr in fwdInfo: the
 * single copy below fills both the Destination and Source MAC fields.
 */
256 ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
257 (PCHAR)&fwdInfo->srcMacAddr);
258 NdisMoveMemory(outerEthHdr->Destination, fwdInfo->dstMacAddr,
259 sizeof outerEthHdr->Destination + sizeof outerEthHdr->Source);
260 outerEthHdr->Type = htons(ETH_TYPE_IPV4);
/* Outer IPv4 header: fixed-size header, TOS taken from the tunnel key. */
263 outerIpHdr->ihl = sizeof(IPHdr) >> 2;
264 outerIpHdr->version = IPPROTO_IPV4;
265 outerIpHdr->tos = tunKey->tos;
267 ipTotalLen = sizeof(IPHdr) + sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
268 outerIpHdr->tot_len = htons(ipTotalLen);
/*
 * NOTE(review): this check runs after htons() has already narrowed
 * ipTotalLen into tot_len, so it only documents the expectation.
 */
269 ASSERT(ipTotalLen < 65536);
/* IP id: per-vport counter advanced by the inner frame length. */
271 outerIpHdr->id = (uint16) atomic_add64(&vportStt->ipId, innerFrameLen);
272 outerIpHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
/* Fall back to a TTL of 64 when the tunnel key does not specify one. */
274 outerIpHdr->ttl = tunKey->ttl? tunKey->ttl : 64;
275 outerIpHdr->protocol = IPPROTO_TCP;
276 outerIpHdr->check = 0;
277 outerIpHdr->saddr = fwdInfo->srcIpAddr;
278 outerIpHdr->daddr = tunKey->dst;
/* Outer TCP header: start from zero, then fill the fields STT uses. */
281 RtlZeroMemory(outerTcpHdr, sizeof *outerTcpHdr);
/* Source port: flow hash with the top bit forced on (value >= 32768). */
282 outerTcpHdr->source = htons(tunKey->flow_hash | 32768);
283 outerTcpHdr->dest = htons(vportStt->dstPort);
/* seq is built from the STT frame length (header + inner frame). */
284 outerTcpHdr->seq = htonl((STT_HDR_LEN + innerFrameLen) <<
/* ack_seq comes from a per-vport counter incremented for each packet. */
286 outerTcpHdr->ack_seq = htonl(atomic_inc64(&vportStt->ackNo));
287 outerTcpHdr->doff = sizeof(TCPHdr) >> 2;
288 outerTcpHdr->psh = 1;
289 outerTcpHdr->ack = 1;
290 outerTcpHdr->window = (uint16) ~0;
292 /* Calculate pseudo header chksum */
293 tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
294 ASSERT(tcpChksumLen < 65535);
295 outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr,(uint32 *) &tunKey->dst,
296 IPPROTO_TCP, (uint16) tcpChksumLen);
299 /* XXX need to peek into the inner packet, hard code for now */
300 sttHdr->flags = STT_PROTO_IPV4;
/* Tell the receiver how far the inner L4 checksum can be trusted. */
301 if (innerChecksumVerified) {
302 sttHdr->flags |= STT_CSUM_VERIFIED;
303 } else if (innerPartialChecksum) {
304 sttHdr->flags |= STT_CSUM_PARTIAL;
306 sttHdr->l4Offset = 0;
308 sttHdr->reserved = 0;
309 /* XXX Used for large TCP packets. Not sure how it is used, clarify */
/* The STT key carries the tunnel id from the tunnel key. */
312 sttHdr->key = tunKey->tunnelId;
313 /* Zero out stt padding */
314 *(uint16 *)(sttHdr + 1) = 0;
316 /* Offload IP and TCP checksum */
318 csumInfo.Transmit.IpHeaderChecksum = 1;
319 csumInfo.Transmit.TcpChecksum = 1;
320 csumInfo.Transmit.IsIPv4 = 1;
321 csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr +
/*
 * Store the offload request in the NBL's checksum info slot.
 * NOTE(review): this writes to curNbl while the headers were built from
 * curNb -- confirm that curNbl refers to the copy at this point.
 */
323 NET_BUFFER_LIST_INFO(curNbl,
324 TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
326 return STATUS_SUCCESS;
/*
 * Completes *newNbl. It follows the success return, so it is presumably
 * the shared error exit for the failures above -- confirm.
 */
329 OvsCompleteNBL(switchContext, *newNbl, TRUE);
335 *----------------------------------------------------------------------------
336 * OvsCalculateTCPChecksum
337 * Calculate TCP checksum
338 *----------------------------------------------------------------------------
/*
 * Validates the TCP checksum of the packet in curNb.
 *
 * Returns NDIS_STATUS_SUCCESS immediately when the NBL's checksum info
 * already has Receive.TcpChecksumSucceeded set. Otherwise, for an IPv4
 * frame, the checksum found in the packet is saved, recomputed in software
 * and compared; a mismatch yields NDIS_STATUS_INVALID_PACKET. Frames that
 * are not IPv4 are rejected with NDIS_STATUS_INVALID_PACKET after logging.
 * On a match, TcpChecksumSucceeded is set in the NBL's checksum info.
 *
 * NOTE(review): tcp->check in the packet is overwritten with the recomputed
 * value, including on the mismatch path.
 */
340 static __inline NDIS_STATUS
341 OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
343 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
344 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
345 TcpIpChecksumNetBufferListInfo);
348 /* Check if TCP Checksum has been calculated by NIC */
349 if (csumInfo.Receive.TcpChecksumSucceeded) {
350 return NDIS_STATUS_SUCCESS;
/*
 * NOTE(review): the pointer returned by NdisGetDataBuffer is dereferenced
 * below; no NULL check is visible here -- confirm one is not needed.
 */
353 EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
356 if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV4)) {
/* The IP header follows the Ethernet header directly. */
357 IPHdr *ip = (IPHdr *)((PCHAR)eth + sizeof *eth);
/* L4 length = IP total length minus the IP header length (ihl * 4). */
358 UINT32 l4Payload = ntohs(ip->tot_len) - ip->ihl * 4;
359 TCPHdr *tcp = (TCPHdr *)((PCHAR)ip + ip->ihl * 4);
/* Remember the checksum carried in the packet for the comparison below. */
360 checkSum = tcp->check;
/* Seed with the pseudo-header sum, then checksum the whole L4 segment. */
363 tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
364 IPPROTO_TCP, (UINT16)l4Payload);
365 tcp->check = CalculateChecksumNB(curNb, (UINT16)(l4Payload),
366 sizeof(EthHdr) + ip->ihl * 4);
367 if (checkSum != tcp->check) {
368 return NDIS_STATUS_INVALID_PACKET;
/* Anything other than IPv4 is rejected. */
371 OVS_LOG_ERROR("IPv6 on STT is not supported");
372 return NDIS_STATUS_INVALID_PACKET;
/* Record the successful verification in the NBL's checksum info. */
375 csumInfo.Receive.TcpChecksumSucceeded = 1;
376 NET_BUFFER_LIST_INFO(curNbl,
377 TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
378 return NDIS_STATUS_SUCCESS;
382 * --------------------------------------------------------------------------
384 * Decapsulates an STT packet.
385 * --------------------------------------------------------------------------
388 OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
389 PNET_BUFFER_LIST curNbl,
390 OvsIPv4TunnelKey *tunKey,
391 PNET_BUFFER_LIST *newNbl)
393 NDIS_STATUS status = NDIS_STATUS_FAILURE;
396 char *ipBuf[sizeof(IPHdr)];
398 char *sttBuf[STT_HDR_LEN];
399 UINT32 advanceCnt, hdrLen;
400 NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
402 curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
403 ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
405 if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) {
406 OVS_LOG_ERROR("Packet length received is less than the tunnel header:"
407 " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb), OvsGetSttTunHdrSize());
408 return NDIS_STATUS_INVALID_LENGTH;
411 /* Verify outer TCP Checksum */
412 csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
413 TcpIpChecksumNetBufferListInfo);
415 /* Check if NIC has indicated TCP checksum failure */
416 if (csumInfo.Receive.TcpChecksumFailed) {
417 return NDIS_STATUS_INVALID_PACKET;
420 /* Calculate the TCP Checksum */
421 status = OvsCalculateTCPChecksum(curNbl, curNb);
422 if (status != NDIS_STATUS_SUCCESS) {
426 /* Skip Eth header */
427 hdrLen = sizeof(EthHdr);
428 NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
431 ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf,
435 /* Skip IP & TCP headers */
436 hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
437 NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
438 advanceCnt += hdrLen;
441 sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf,
445 /* Initialize the tunnel key */
446 tunKey->dst = ipHdr->daddr;
447 tunKey->src = ipHdr->saddr;
448 tunKey->tunnelId = sttHdr->key;
449 tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY);
450 tunKey->tos = ipHdr->tos;
451 tunKey->ttl = ipHdr->ttl;
454 /* Skip stt header, DataOffset points to inner pkt now. */
455 hdrLen = STT_HDR_LEN;
456 NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
457 advanceCnt += hdrLen;
459 /* Verify checksum for inner packet if it's required */
460 if (!(sttHdr->flags & STT_CSUM_VERIFIED)) {
461 BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL;
462 EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
465 /* XXX Figure out a way to offload checksum receives */
466 if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV4)) {
467 IPHdr *ip = (IPHdr *)((PCHAR)eth + sizeof *eth);
468 UINT16 l4Payload = (UINT16)ntohs(ip->tot_len) - ip->ihl * 4;
469 UINT32 offset = sizeof(EthHdr) + ip->ihl * 4;
471 if (ip->protocol == IPPROTO_TCP) {
472 TCPHdr *tcp = (TCPHdr *)((PCHAR)ip + ip->ihl * 4);
473 if (!innerChecksumPartial){
474 tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
478 tcp->check = CalculateChecksumNB(curNb, l4Payload, offset);
479 } else if (ip->protocol == IPPROTO_UDP) {
480 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
481 if (!innerChecksumPartial){
482 udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
483 IPPROTO_UDP, l4Payload);
485 udp->check = CalculateChecksumNB(curNb, l4Payload, offset);
487 } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) {
488 IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth);
489 UINT32 offset = (UINT32)(sizeof *eth + sizeof *ip);
490 UINT16 totalLength = (UINT16)ntohs(ip->payload_len);
491 if (ip->nexthdr == IPPROTO_TCP) {
492 TCPHdr *tcp = (TCPHdr *)((PCHAR)ip + sizeof *ip);
493 if (!innerChecksumPartial){
494 tcp->check = IPv6PseudoChecksum((UINT32 *)&ip->saddr,
495 (UINT32 *)&ip->daddr,
496 IPPROTO_TCP, totalLength);
498 tcp->check = CalculateChecksumNB(curNb, totalLength, offset);
500 else if (ip->nexthdr == IPPROTO_UDP) {
501 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
502 if (!innerChecksumPartial) {
503 udp->check = IPv6PseudoChecksum((UINT32 *)&ip->saddr,
504 (UINT32 *)&ip->daddr,
505 IPPROTO_UDP, totalLength);
507 udp->check = CalculateChecksumNB(curNb, totalLength, offset);
511 NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
514 *newNbl = OvsPartialCopyNBL(switchContext, curNbl, OVS_DEFAULT_COPY_SIZE,
515 0, FALSE /*copy NBL info*/);
517 ASSERT(advanceCnt == OvsGetSttTunHdrSize());
518 status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
520 if (*newNbl == NULL) {
521 OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned NBL");
522 status = NDIS_STATUS_RESOURCES;