datapath-windows: Cleanup Stt.c
[cascardo/ovs.git] / datapath-windows / ovsext / Stt.c
1 /*
2  * Copyright (c) 2015 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18
19 #include "Atomic.h"
20 #include "Checksum.h"
21 #include "Flow.h"
22 #include "IpHelper.h"
23 #include "NetProto.h"
24 #include "PacketIO.h"
25 #include "PacketParser.h"
26 #include "Stt.h"
27 #include "Switch.h"
28 #include "User.h"
29 #include "Util.h"
30 #include "Vport.h"
31
32 #ifdef OVS_DBG_MOD
33 #undef OVS_DBG_MOD
34 #endif
35 #define OVS_DBG_MOD OVS_DBG_STT
36 #include "Debug.h"
37 #include "Jhash.h"
38
/*
 * Start routine of the defragmentation cleaner thread; it is handed to
 * PsCreateSystemThread() by OvsInitSttDefragmentation().
 */
39 KSTART_ROUTINE OvsSttDefragCleaner;
/*
 * Hash table of partially reassembled STT packets: an array of
 * STT_HASH_TABLE_SIZE list heads whose members are OVS_STT_PKT_ENTRY
 * records. Allocated by OvsInitSttDefragmentation().
 */
40 static PLIST_ENTRY OvsSttPktFragHash;
/*
 * Spin lock taken around accesses to OvsSttPktFragHash and around the
 * cleaner thread's exit flag.
 */
41 static NDIS_SPIN_LOCK OvsSttSpinLock;
/* Event, exit flag and thread object of the cleaner thread. */
42 static OVS_STT_THREAD_CTX sttDefragThreadCtx;
43
/* Forward declaration; the definition follows OvsEncapStt(). */
44 static NDIS_STATUS
45 OvsDoEncapStt(POVS_VPORT_ENTRY vport, PNET_BUFFER_LIST curNbl,
46               const OvsIPv4TunnelKey *tunKey,
47               const POVS_FWD_INFO fwdInfo,
48               POVS_PACKET_HDR_INFO layers,
49               POVS_SWITCH_CONTEXT switchContext,
50               PNET_BUFFER_LIST *newNbl);
51
52 /*
53  * --------------------------------------------------------------------------
54  * OvsInitSttTunnel --
55  *    Initialize STT tunnel module.
56  * --------------------------------------------------------------------------
57  */
58 NTSTATUS
59 OvsInitSttTunnel(POVS_VPORT_ENTRY vport,
60                  UINT16 tcpDestPort)
61 {
62     POVS_STT_VPORT sttPort;
63
64     sttPort = (POVS_STT_VPORT) OvsAllocateMemoryWithTag(sizeof(*sttPort),
65                                                         OVS_STT_POOL_TAG);
66     if (!sttPort) {
67         OVS_LOG_ERROR("Insufficient memory, can't allocate STT_VPORT");
68         return STATUS_INSUFFICIENT_RESOURCES;
69     }
70
71     RtlZeroMemory(sttPort, sizeof(*sttPort));
72     sttPort->dstPort = tcpDestPort;
73     vport->priv = (PVOID) sttPort;
74     return STATUS_SUCCESS;
75 }
76
77 /*
78  * --------------------------------------------------------------------------
79  * OvsCleanupSttTunnel --
80  *    Cleanup STT Tunnel module.
81  * --------------------------------------------------------------------------
82  */
83 void
84 OvsCleanupSttTunnel(POVS_VPORT_ENTRY vport)
85 {
86     if (vport->ovsType != OVS_VPORT_TYPE_STT ||
87         vport->priv == NULL) {
88         return;
89     }
90
91     OvsFreeMemoryWithTag(vport->priv, OVS_STT_POOL_TAG);
92     vport->priv = NULL;
93 }
94
95 /*
96  * --------------------------------------------------------------------------
97  * OvsEncapStt --
98  *     Encapsulates a packet with an STT header.
99  * --------------------------------------------------------------------------
100  */
101 NDIS_STATUS
102 OvsEncapStt(POVS_VPORT_ENTRY vport,
103             PNET_BUFFER_LIST curNbl,
104             OvsIPv4TunnelKey *tunKey,
105             POVS_SWITCH_CONTEXT switchContext,
106             POVS_PACKET_HDR_INFO layers,
107             PNET_BUFFER_LIST *newNbl)
108 {
109     OVS_FWD_INFO fwdInfo;
110     NDIS_STATUS status;
111
112     UNREFERENCED_PARAMETER(switchContext);
113     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
114     if (status != STATUS_SUCCESS) {
115         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
116         /*
117          * XXX This case where the ARP table is not populated is
118          * currently not handled
119          */
120         return NDIS_STATUS_FAILURE;
121     }
122
123     status = OvsDoEncapStt(vport, curNbl, tunKey, &fwdInfo, layers,
124                            switchContext, newNbl);
125     return status;
126 }
127
128 /*
129  * --------------------------------------------------------------------------
130  * OvsDoEncapStt --
131  *    Internal utility function which actually does the STT encap.
132  * --------------------------------------------------------------------------
133  */
134 NDIS_STATUS
135 OvsDoEncapStt(POVS_VPORT_ENTRY vport,
136               PNET_BUFFER_LIST curNbl,
137               const OvsIPv4TunnelKey *tunKey,
138               const POVS_FWD_INFO fwdInfo,
139               POVS_PACKET_HDR_INFO layers,
140               POVS_SWITCH_CONTEXT switchContext,
141               PNET_BUFFER_LIST *newNbl)
142 {
143     NDIS_STATUS status = NDIS_STATUS_SUCCESS;
144     PMDL curMdl = NULL;
145     PNET_BUFFER curNb;
146     PUINT8 buf = NULL;
147     EthHdr *outerEthHdr;
148     IPHdr *outerIpHdr;
149     TCPHdr *outerTcpHdr;
150     SttHdr *sttHdr;
151     UINT32 innerFrameLen, ipTotalLen;
152     POVS_STT_VPORT vportStt;
153     UINT32 headRoom = OvsGetSttTunHdrSize();
154     UINT32 tcpChksumLen;
155     PUINT8 bufferStart;
156     ULONG mss = 0;
157     NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
158
159     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
160
161     /* Verify if inner checksum is verified */
162     BOOLEAN innerChecksumVerified = FALSE;
163     BOOLEAN innerPartialChecksum = FALSE;
164
165     if (layers->isTcp) {
166         lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
167                 TcpLargeSendNetBufferListInfo);
168
169         switch (lsoInfo.Transmit.Type) {
170             case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE:
171                 mss = lsoInfo.LsoV1Transmit.MSS;
172                 break;
173             case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE:
174                 mss = lsoInfo.LsoV2Transmit.MSS;
175                 break;
176             default:
177                 OVS_LOG_ERROR("Unknown LSO transmit type:%d",
178                               lsoInfo.Transmit.Type);
179         }
180     }
181
182     vportStt = (POVS_STT_VPORT) GetOvsVportPriv(vport);
183     ASSERT(vportStt);
184
185     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
186     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
187                                           TcpIpChecksumNetBufferListInfo);
188     *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
189                                 FALSE /*copy NblInfo*/);
190     if (*newNbl == NULL) {
191         OVS_LOG_ERROR("Unable to copy NBL");
192         return NDIS_STATUS_FAILURE;
193     }
194
195     curNbl = *newNbl;
196     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
197     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
198     /* NB Chain should be split before */
199     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
200     innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
201
202     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
203                                                        LowPagePriority);
204     bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
205
206     if (layers->isIPv4) {
207         IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
208         if (!ip->tot_len) {
209             ip->tot_len = htons(innerFrameLen - sizeof(EthHdr));
210         }
211         if (!ip->check) {
212             ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
213         }
214     }
215
216     if (layers->isTcp) {
217         if (mss) {
218             innerPartialChecksum = TRUE;
219         } else {
220             if (!csumInfo.Transmit.TcpChecksum) {
221                 innerChecksumVerified = TRUE;
222             } else {
223                 innerPartialChecksum = TRUE;
224             }
225         }
226     } else if (layers->isUdp) {
227         if(!csumInfo.Transmit.UdpChecksum) {
228             innerChecksumVerified = TRUE;
229         } else {
230             innerPartialChecksum = TRUE;
231         }
232     }
233
234     status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
235     if (status != NDIS_STATUS_SUCCESS) {
236         ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
237         OVS_LOG_ERROR("Unable to NdisRetreatNetBufferDataStart(headroom)");
238         goto ret_error;
239     }
240
241     /*
242      * Make sure that the headroom for the tunnel header is continguous in
243      * memory.
244      */
245     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
246     ASSERT((int) (MmGetMdlByteCount(curMdl) - NET_BUFFER_CURRENT_MDL_OFFSET(curNb))
247                 >= (int) headRoom);
248
249     buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
250     if (!buf) {
251         ASSERT(!"MmGetSystemAddressForMdlSafe failed");
252         OVS_LOG_ERROR("MmGetSystemAddressForMdlSafe failed");
253         status = NDIS_STATUS_RESOURCES;
254         goto ret_error;
255     }
256
257     buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
258     outerEthHdr = (EthHdr *)buf;
259     outerIpHdr = (IPHdr *) (outerEthHdr + 1);
260     outerTcpHdr = (TCPHdr *) (outerIpHdr + 1);
261     sttHdr = (SttHdr *) (outerTcpHdr + 1);
262
263     /* L2 header */
264     ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
265             (PCHAR)&fwdInfo->srcMacAddr);
266     NdisMoveMemory(outerEthHdr->Destination, fwdInfo->dstMacAddr,
267                     sizeof outerEthHdr->Destination + sizeof outerEthHdr->Source);
268     outerEthHdr->Type = htons(ETH_TYPE_IPV4);
269
270     /* L3 header */
271     outerIpHdr->ihl = sizeof(IPHdr) >> 2;
272     outerIpHdr->version = IPPROTO_IPV4;
273     outerIpHdr->tos = tunKey->tos;
274
275     ipTotalLen = sizeof(IPHdr) + sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
276     outerIpHdr->tot_len = htons(ipTotalLen);
277     ASSERT(ipTotalLen < 65536);
278
279     outerIpHdr->id = (uint16) atomic_add64(&vportStt->ipId, innerFrameLen);
280     outerIpHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
281                            IP_DF_NBO : 0;
282     outerIpHdr->ttl = tunKey->ttl? tunKey->ttl : 64;
283     outerIpHdr->protocol = IPPROTO_TCP;
284     outerIpHdr->check = 0;
285     outerIpHdr->saddr = fwdInfo->srcIpAddr;
286     outerIpHdr->daddr = tunKey->dst;
287
288     /* L4 header */
289     RtlZeroMemory(outerTcpHdr, sizeof *outerTcpHdr);
290     outerTcpHdr->source = htons(tunKey->flow_hash | 32768);
291     outerTcpHdr->dest = htons(vportStt->dstPort);
292     outerTcpHdr->seq = htonl((STT_HDR_LEN + innerFrameLen) <<
293                              STT_SEQ_LEN_SHIFT);
294     outerTcpHdr->ack_seq = htonl(atomic_inc64(&vportStt->ackNo));
295     outerTcpHdr->doff = sizeof(TCPHdr) >> 2;
296     outerTcpHdr->psh = 1;
297     outerTcpHdr->ack = 1;
298     outerTcpHdr->window = (uint16) ~0;
299
300     /* Calculate pseudo header chksum */
301     tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
302     ASSERT(tcpChksumLen < 65535);
303     outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr,(uint32 *) &tunKey->dst,
304                                           IPPROTO_TCP, (uint16) tcpChksumLen);
305     sttHdr->version = 0;
306
307     /* Set STT Header */
308     sttHdr->flags = 0;
309     if (innerPartialChecksum) {
310         sttHdr->flags |= STT_CSUM_PARTIAL;
311         if (layers->isIPv4) {
312             sttHdr->flags |= STT_PROTO_IPV4;
313         }
314         if (layers->isTcp) {
315             sttHdr->flags |= STT_PROTO_TCP;
316         }
317         sttHdr->l4Offset = (UINT8) layers->l4Offset;
318         sttHdr->mss = (UINT16) htons(mss);
319     } else if (innerChecksumVerified) {
320         sttHdr->flags = STT_CSUM_VERIFIED;
321         sttHdr->l4Offset = 0;
322         sttHdr->mss = 0;
323     }
324
325     sttHdr->reserved = 0;
326     sttHdr->vlanTCI = 0;
327     sttHdr->key = tunKey->tunnelId;
328     /* Zero out stt padding */
329     *(uint16 *)(sttHdr + 1) = 0;
330
331     /* Offload IP and TCP checksum */
332     ULONG tcpHeaderOffset = sizeof *outerEthHdr +
333                         outerIpHdr->ihl * 4;
334     csumInfo.Value = 0;
335     csumInfo.Transmit.IpHeaderChecksum = 1;
336     csumInfo.Transmit.TcpChecksum = 1;
337     csumInfo.Transmit.IsIPv4 = 1;
338     csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset;
339     NET_BUFFER_LIST_INFO(curNbl,
340                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
341
342     UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr);
343     if (ipTotalLen > encapMss) {
344         lsoInfo.Value = 0;
345         lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset;
346         lsoInfo.LsoV2Transmit.MSS = encapMss;
347         lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
348         lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
349         NET_BUFFER_LIST_INFO(curNbl,
350                              TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
351     }
352
353     return STATUS_SUCCESS;
354
355 ret_error:
356     OvsCompleteNBL(switchContext, *newNbl, TRUE);
357     *newNbl = NULL;
358     return status;
359 }
360
361 /*
362  *----------------------------------------------------------------------------
363  * OvsValidateTCPChecksum
364  *     Validate TCP checksum
365  *----------------------------------------------------------------------------
366  */
367 static __inline NDIS_STATUS
368 OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
369 {
370     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
371     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
372                                           TcpIpChecksumNetBufferListInfo);
373
374     /* Check if NIC has indicated TCP checksum failure */
375     if (csumInfo.Receive.TcpChecksumFailed) {
376         return NDIS_STATUS_INVALID_PACKET;
377     }
378
379     UINT16 checkSum;
380
381     /* Check if TCP Checksum has been calculated by NIC */
382     if (csumInfo.Receive.TcpChecksumSucceeded) {
383         return NDIS_STATUS_SUCCESS;
384     }
385
386     EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
387                                               NULL, 1, 0);
388
389     if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV4)) {
390         IPHdr *ip = (IPHdr *)((PCHAR)eth + sizeof *eth);
391         UINT32 l4Payload = ntohs(ip->tot_len) - ip->ihl * 4;
392         TCPHdr *tcp = (TCPHdr *)((PCHAR)ip + ip->ihl * 4);
393         checkSum = tcp->check;
394
395         tcp->check = 0;
396         tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
397                                       IPPROTO_TCP, (UINT16)l4Payload);
398         tcp->check = CalculateChecksumNB(curNb, (UINT16)(l4Payload),
399                                          sizeof(EthHdr) + ip->ihl * 4);
400         if (checkSum != tcp->check) {
401             return NDIS_STATUS_INVALID_PACKET;
402         }
403     } else {
404         OVS_LOG_ERROR("IPv6 on STT is not supported");
405         return NDIS_STATUS_INVALID_PACKET;
406     }
407
408     csumInfo.Receive.TcpChecksumSucceeded = 1;
409     NET_BUFFER_LIST_INFO(curNbl,
410                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
411     return NDIS_STATUS_SUCCESS;
412 }
413
414 /*
415  *----------------------------------------------------------------------------
416  * OvsInitSttDefragmentation
417  *     Initialize the components used by the stt lso defragmentation
418  *----------------------------------------------------------------------------
419  */
420 NTSTATUS
421 OvsInitSttDefragmentation()
422 {
423     NTSTATUS status;
424     HANDLE threadHandle = NULL;
425
426     /* Init the sync-lock */
427     NdisAllocateSpinLock(&OvsSttSpinLock);
428
429     /* Init the Hash Buffer */
430     OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
431                                                  * STT_HASH_TABLE_SIZE,
432                                                  OVS_STT_POOL_TAG);
433     if (OvsSttPktFragHash == NULL) {
434         NdisFreeSpinLock(&OvsSttSpinLock);
435         return STATUS_INSUFFICIENT_RESOURCES;
436     }
437
438     for (int i = 0; i < STT_HASH_TABLE_SIZE; i++) {
439         InitializeListHead(&OvsSttPktFragHash[i]);
440     }
441
442     /* Init Defrag Cleanup Thread */
443     KeInitializeEvent(&sttDefragThreadCtx.event, NotificationEvent, FALSE);
444     status = PsCreateSystemThread(&threadHandle, SYNCHRONIZE, NULL, NULL,
445                                   NULL, OvsSttDefragCleaner,
446                                   &sttDefragThreadCtx);
447
448     if (status != STATUS_SUCCESS) {
449         OvsCleanupSttDefragmentation();
450         return status;
451     }
452
453     ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode,
454                               &sttDefragThreadCtx.threadObject, NULL);
455     ZwClose(threadHandle);
456     threadHandle = NULL;
457     return STATUS_SUCCESS;
458 }
459
460 /*
461  *----------------------------------------------------------------------------
462  * OvsCleanupSttDefragmentation
463  *     Cleanup memory and thread that were spawned for STT LSO defragmentation
464  *----------------------------------------------------------------------------
465  */
466 VOID
467 OvsCleanupSttDefragmentation(VOID)
468 {
469     NdisAcquireSpinLock(&OvsSttSpinLock);
470     sttDefragThreadCtx.exit = 1;
471     KeSetEvent(&sttDefragThreadCtx.event, 0, FALSE);
472     NdisReleaseSpinLock(&OvsSttSpinLock);
473
474     KeWaitForSingleObject(sttDefragThreadCtx.threadObject, Executive,
475                           KernelMode, FALSE, NULL);
476     ObDereferenceObject(sttDefragThreadCtx.threadObject);
477
478     if (OvsSttPktFragHash) {
479         OvsFreeMemoryWithTag(OvsSttPktFragHash, OVS_STT_POOL_TAG);
480         OvsSttPktFragHash = NULL;
481     }
482
483     NdisFreeSpinLock(&OvsSttSpinLock);
484 }
485
486 /*
487  *----------------------------------------------------------------------------
488  * OvsSttDefragCleaner
489  *     Runs periodically and cleans up the buffer to remove expired segments
490  *----------------------------------------------------------------------------
491  */
492 VOID
493 OvsSttDefragCleaner(PVOID data)
494 {
495     POVS_STT_THREAD_CTX context = (POVS_STT_THREAD_CTX)data;
496     PLIST_ENTRY link, next;
497     POVS_STT_PKT_ENTRY entry;
498     BOOLEAN success = TRUE;
499
500     while (success) {
501         NdisAcquireSpinLock(&OvsSttSpinLock);
502         if (context->exit) {
503             NdisReleaseSpinLock(&OvsSttSpinLock);
504             break;
505         }
506
507         /* Set the timeout for the thread and cleanup */
508         UINT64 currentTime, threadSleepTimeout;
509         NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
510         threadSleepTimeout = currentTime + STT_CLEANUP_INTERVAL;
511
512         for (int i = 0; i < STT_HASH_TABLE_SIZE; i++) {
513             LIST_FORALL_SAFE(&OvsSttPktFragHash[i], link, next) {
514                 entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
515                 if (entry->timeout < currentTime) {
516                     RemoveEntryList(&entry->link);
517                     OvsFreeMemoryWithTag(entry->packetBuf, OVS_STT_POOL_TAG);
518                     OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
519                 }
520             }
521         }
522
523         NdisReleaseSpinLock(&OvsSttSpinLock);
524         KeWaitForSingleObject(&context->event, Executive, KernelMode,
525                               FALSE, (LARGE_INTEGER *)&threadSleepTimeout);
526     }
527
528     PsTerminateSystemThread(STATUS_SUCCESS);
529 }
530
531 static OVS_STT_PKT_KEY
532 OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr)
533 {
534     OVS_STT_PKT_KEY key;
535     key.sAddr = ipHdr->saddr;
536     key.dAddr = ipHdr->daddr;
537     key.ackSeq = ntohl(tcpHdr->ack_seq);
538     return key;
539 }
540
541 static UINT32
542 OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey)
543 {
544     UINT32 arr[3];
545     arr[0] = pktKey->ackSeq;
546     arr[1] = pktKey->dAddr;
547     arr[2] = pktKey->sAddr;
548     return OvsJhashWords(arr, 3, OVS_HASH_BASIS);
549 }
550
551 static VOID *
552 OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash)
553 {
554     PLIST_ENTRY link;
555     POVS_STT_PKT_ENTRY entry;
556
557     LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) {
558         entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
559         if (entry->ovsPktKey.ackSeq == pktKey->ackSeq &&
560             entry->ovsPktKey.dAddr == pktKey->dAddr &&
561             entry->ovsPktKey.sAddr == pktKey->sAddr) {
562             return entry;
563         }
564     }
565     return NULL;
566 }
567
568 /*
569 *
570 --------------------------------------------------------------------------
571 * OvsSttReassemble --
572 *     Reassemble an LSO packet from multiple STT-Fragments.
573 *
574 --------------------------------------------------------------------------
575 */
576 PNET_BUFFER_LIST
577 OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext,
578                  PNET_BUFFER_LIST curNbl,
579                  IPHdr *ipHdr,
580                  TCPHdr *tcp,
581                  SttHdr *newSttHdr,
582                  UINT16 payloadLen)
583 {
584     UINT32 seq = ntohl(tcp->seq);
585     UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN;
586     UINT32 segOffset = STT_SEGMENT_OFF(seq);
587     UINT32 offset = segOffset == 0 ? 0 : segOffset - STT_HDR_LEN;
588     UINT32 startOffset = 0;
589     OVS_STT_PKT_ENTRY *pktFragEntry;
590     PNET_BUFFER_LIST targetPNbl = NULL;
591     BOOLEAN lastPacket = FALSE;
592     PNET_BUFFER sourceNb;
593     UINT32 fragmentLength = payloadLen;
594     SttHdr stt;
595     SttHdr *sttHdr = NULL;
596     sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
597
598     /* XXX optimize this lock */
599     NdisAcquireSpinLock(&OvsSttSpinLock);
600
601     /* If this is the first fragment, copy the STT header */
602     if (segOffset == 0) {
603         sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0);
604         if (sttHdr == NULL) {
605             OVS_LOG_ERROR("Unable to retrieve STT header");
606             return NULL;
607         }
608         fragmentLength = fragmentLength - STT_HDR_LEN;
609         startOffset = startOffset + STT_HDR_LEN;
610     }
611
612     /* Lookup fragment */
613     OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp);
614     UINT32 hash = OvsSttGetPktHash(&pktKey);
615     pktFragEntry = OvsLookupPktFrag(&pktKey, hash);
616
617     if (pktFragEntry == NULL) {
618         /* Create a new Packet Entry */
619         POVS_STT_PKT_ENTRY entry;
620         entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY),
621                                          OVS_STT_POOL_TAG);
622         RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY));
623
624         /* Update Key, timestamp and recvdLen */
625         NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof (OVS_STT_PKT_KEY));
626
627         entry->recvdLen = fragmentLength;
628
629         UINT64 currentTime;
630         NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
631         entry->timeout = currentTime + STT_ENTRY_TIMEOUT;
632
633         if (segOffset == 0) {
634             entry->sttHdr = *sttHdr;
635         }
636
637         /* Copy the data from Source to new buffer */
638         entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen,
639                                                     OVS_STT_POOL_TAG);
640         if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
641                               entry->packetBuf + offset) == NULL) {
642             OVS_LOG_ERROR("Error when obtaining bytes from Packet");
643             goto handle_error;
644         }
645
646         /* Insert the entry in the Static Buffer */
647         InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK],
648                        &entry->link);
649     } else {
650         /* Add to recieved length to identify if this is the last fragment */
651         pktFragEntry->recvdLen += fragmentLength;
652         lastPacket = (pktFragEntry->recvdLen == innerPacketLen);
653
654         if (segOffset == 0) {
655             pktFragEntry->sttHdr = *sttHdr;
656         }
657
658         /* Copy the fragment data from Source to existing buffer */
659         if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
660                               pktFragEntry->packetBuf + offset) == NULL) {
661             OVS_LOG_ERROR("Error when obtaining bytes from Packet");
662             goto handle_error;
663         }
664     }
665
666 handle_error:
667     if (lastPacket) {
668         /* Retrieve the original STT header */
669         NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof (SttHdr));
670         targetPNbl = OvsAllocateNBLFromBuffer(switchContext, pktFragEntry->packetBuf,
671                                               innerPacketLen);
672
673         /* Delete this entry and free up the memory/ */
674         RemoveEntryList(&pktFragEntry->link);
675         OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG);
676         OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG);
677     }
678
679     NdisReleaseSpinLock(&OvsSttSpinLock);
680     return lastPacket ? targetPNbl : NULL;
681 }
682
683 VOID
684 OvsDecapSetOffloads(PNET_BUFFER_LIST curNbl, SttHdr *sttHdr)
685 {
686     if ((sttHdr->flags & STT_CSUM_VERIFIED)
687         || !(sttHdr->flags & STT_CSUM_PARTIAL)) {
688         return;
689     }
690
691     UINT8 protoType;
692     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
693     csumInfo.Value = 0;
694     csumInfo.Transmit.IpHeaderChecksum = 0;
695     csumInfo.Transmit.TcpHeaderOffset = sttHdr->l4Offset;
696     protoType = sttHdr->flags & STT_PROTO_TYPES;
697     switch (protoType) {
698         case (STT_PROTO_IPV4 | STT_PROTO_TCP):
699             /* TCP/IPv4 */
700             csumInfo.Transmit.IsIPv4 = 1;
701             csumInfo.Transmit.TcpChecksum = 1;
702             break;
703         case STT_PROTO_TCP:
704             /* TCP/IPv6 */
705             csumInfo.Transmit.IsIPv6 = 1;
706             csumInfo.Transmit.TcpChecksum = 1;
707             break;
708         case STT_PROTO_IPV4:
709             /* UDP/IPv4 */
710             csumInfo.Transmit.IsIPv4 = 1;
711             csumInfo.Transmit.UdpChecksum = 1;
712             break;
713         default:
714             /* UDP/IPv6 */
715             csumInfo.Transmit.IsIPv6 = 1;
716             csumInfo.Transmit.UdpChecksum = 1;
717     }
718     NET_BUFFER_LIST_INFO(curNbl,
719                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
720
721     if (sttHdr->mss) {
722         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
723         lsoInfo.Value = 0;
724         lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset;
725         lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU
726                                     - sizeof(IPHdr)
727                                     - sizeof(TCPHdr);
728         lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
729         if (sttHdr->flags & STT_PROTO_IPV4) {
730             lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
731         } else {
732             lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6;
733         }
734         NET_BUFFER_LIST_INFO(curNbl,
735                              TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
736     }
737 }
738
739 /*
740  * --------------------------------------------------------------------------
741  * OvsDecapStt --
742  *     Decapsulates an STT packet.
743  * --------------------------------------------------------------------------
744  */
745 NDIS_STATUS
746 OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
747             PNET_BUFFER_LIST curNbl,
748             OvsIPv4TunnelKey *tunKey,
749             PNET_BUFFER_LIST *newNbl)
750 {
751     NDIS_STATUS status = NDIS_STATUS_FAILURE;
752     PNET_BUFFER curNb, newNb;
753     IPHdr *ipHdr;
754     char *ipBuf[sizeof(IPHdr)];
755     SttHdr stt;
756     SttHdr *sttHdr;
757     char *sttBuf[STT_HDR_LEN];
758     UINT32 advanceCnt, hdrLen;
759     BOOLEAN isLsoPacket = FALSE;
760
761     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
762     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
763
764     /* Validate the TCP Checksum */
765     status = OvsValidateTCPChecksum(curNbl, curNb);
766     if (status != NDIS_STATUS_SUCCESS) {
767         return status;
768     }
769
770     /* Skip Eth header */
771     hdrLen = sizeof(EthHdr);
772     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
773     advanceCnt = hdrLen;
774
775     ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf,
776                                                     1 /*no align*/, 0);
777     ASSERT(ipHdr);
778
779     TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
780
781     /* Skip IP & TCP headers */
782     hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
783     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
784     advanceCnt += hdrLen;
785
786     UINT32 seq = ntohl(tcp->seq);
787     UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT);
788     UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len)
789                         - (ipHdr->ihl * 4)
790                         - (sizeof * tcp);
791
792     /* Check if incoming packet requires reassembly */
793     if (totalLen != payloadLen) {
794         sttHdr = &stt;
795         PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl,
796                                                  ipHdr, tcp, sttHdr,
797                                                  payloadLen);
798         if (pNbl == NULL) {
799             return NDIS_STATUS_SUCCESS;
800         }
801
802         *newNbl = pNbl;
803         isLsoPacket = TRUE;
804     } else {
805         /* STT Header */
806         sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr,
807                                    (PVOID) &sttBuf, 1 /*no align*/, 0);
808         /* Skip stt header, DataOffset points to inner pkt now. */
809         hdrLen = STT_HDR_LEN;
810         NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
811         advanceCnt += hdrLen;
812
813         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0,
814                                     0, FALSE /*copy NBL info*/);
815     }
816
817     if (*newNbl == NULL) {
818         OVS_LOG_ERROR("Unable to allocate a new cloned NBL");
819         return NDIS_STATUS_RESOURCES;
820     }
821
822     status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
823     if (status != NDIS_STATUS_SUCCESS) {
824         OvsCompleteNBL(switchContext, *newNbl, TRUE);
825         return NDIS_STATUS_FAILURE;
826     }
827     newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
828
829     ASSERT(sttHdr);
830
831     /* Initialize the tunnel key */
832     tunKey->dst = ipHdr->daddr;
833     tunKey->src = ipHdr->saddr;
834     tunKey->tunnelId = sttHdr->key;
835     tunKey->flags = OVS_TNL_F_KEY;
836     tunKey->tos = ipHdr->tos;
837     tunKey->ttl = ipHdr->ttl;
838     tunKey->pad = 0;
839
840     /* Set Checksum and LSO offload flags */
841     OvsDecapSetOffloads(*newNbl, sttHdr);
842
843     return NDIS_STATUS_SUCCESS;
844 }