datapath-windows: Refactor software offloads and mss
[cascardo/ovs.git] / datapath-windows / ovsext / Stt.c
1 /*
2  * Copyright (c) 2015, 2016 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18
19 #include "Atomic.h"
20 #include "Debug.h"
21 #include "Flow.h"
22 #include "IpHelper.h"
23 #include "Jhash.h"
24 #include "NetProto.h"
25 #include "Offload.h"
26 #include "PacketIO.h"
27 #include "PacketParser.h"
28 #include "Stt.h"
29 #include "Switch.h"
30 #include "User.h"
31 #include "Util.h"
32 #include "Vport.h"
33
34 #ifdef OVS_DBG_MOD
35 #undef OVS_DBG_MOD
36 #endif
37 #define OVS_DBG_MOD OVS_DBG_STT
38
39
40 KSTART_ROUTINE OvsSttDefragCleaner;
41 static PLIST_ENTRY OvsSttPktFragHash;
42 static NDIS_SPIN_LOCK OvsSttSpinLock;
43 static OVS_STT_THREAD_CTX sttDefragThreadCtx;
44
45 static NDIS_STATUS
46 OvsDoEncapStt(POVS_VPORT_ENTRY vport, PNET_BUFFER_LIST curNbl,
47               const OvsIPv4TunnelKey *tunKey,
48               const POVS_FWD_INFO fwdInfo,
49               POVS_PACKET_HDR_INFO layers,
50               POVS_SWITCH_CONTEXT switchContext,
51               PNET_BUFFER_LIST *newNbl);
52
53 /*
54  * --------------------------------------------------------------------------
55  * OvsInitSttTunnel --
56  *    Initialize STT tunnel module.
57  * --------------------------------------------------------------------------
58  */
59 NTSTATUS
60 OvsInitSttTunnel(POVS_VPORT_ENTRY vport,
61                  UINT16 tcpDestPort)
62 {
63     POVS_STT_VPORT sttPort;
64
65     sttPort = (POVS_STT_VPORT) OvsAllocateMemoryWithTag(sizeof(*sttPort),
66                                                         OVS_STT_POOL_TAG);
67     if (!sttPort) {
68         OVS_LOG_ERROR("Insufficient memory, can't allocate STT_VPORT");
69         return STATUS_INSUFFICIENT_RESOURCES;
70     }
71
72     RtlZeroMemory(sttPort, sizeof(*sttPort));
73     sttPort->dstPort = tcpDestPort;
74     vport->priv = (PVOID) sttPort;
75     return STATUS_SUCCESS;
76 }
77
78 /*
79  * --------------------------------------------------------------------------
80  * OvsCleanupSttTunnel --
81  *    Cleanup STT Tunnel module.
82  * --------------------------------------------------------------------------
83  */
84 void
85 OvsCleanupSttTunnel(POVS_VPORT_ENTRY vport)
86 {
87     if (vport->ovsType != OVS_VPORT_TYPE_STT ||
88         vport->priv == NULL) {
89         return;
90     }
91
92     OvsFreeMemoryWithTag(vport->priv, OVS_STT_POOL_TAG);
93     vport->priv = NULL;
94 }
95
96 /*
97  * --------------------------------------------------------------------------
98  * OvsEncapStt --
99  *     Encapsulates a packet with an STT header.
100  * --------------------------------------------------------------------------
101  */
102 NDIS_STATUS
103 OvsEncapStt(POVS_VPORT_ENTRY vport,
104             PNET_BUFFER_LIST curNbl,
105             OvsIPv4TunnelKey *tunKey,
106             POVS_SWITCH_CONTEXT switchContext,
107             POVS_PACKET_HDR_INFO layers,
108             PNET_BUFFER_LIST *newNbl)
109 {
110     OVS_FWD_INFO fwdInfo;
111     NDIS_STATUS status;
112
113     UNREFERENCED_PARAMETER(switchContext);
114     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
115     if (status != STATUS_SUCCESS) {
116         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
117         /*
118          * XXX This case where the ARP table is not populated is
119          * currently not handled
120          */
121         return NDIS_STATUS_FAILURE;
122     }
123
124     status = OvsDoEncapStt(vport, curNbl, tunKey, &fwdInfo, layers,
125                            switchContext, newNbl);
126     return status;
127 }
128
129 /*
130  * --------------------------------------------------------------------------
131  * OvsDoEncapStt --
132  *    Internal utility function which actually does the STT encap.
133  * --------------------------------------------------------------------------
134  */
135 NDIS_STATUS
136 OvsDoEncapStt(POVS_VPORT_ENTRY vport,
137               PNET_BUFFER_LIST curNbl,
138               const OvsIPv4TunnelKey *tunKey,
139               const POVS_FWD_INFO fwdInfo,
140               POVS_PACKET_HDR_INFO layers,
141               POVS_SWITCH_CONTEXT switchContext,
142               PNET_BUFFER_LIST *newNbl)
143 {
144     NDIS_STATUS status = NDIS_STATUS_SUCCESS;
145     PMDL curMdl = NULL;
146     PNET_BUFFER curNb;
147     PUINT8 buf = NULL;
148     EthHdr *outerEthHdr;
149     IPHdr *outerIpHdr;
150     TCPHdr *outerTcpHdr;
151     SttHdr *sttHdr;
152     UINT32 innerFrameLen, ipTotalLen;
153     POVS_STT_VPORT vportStt;
154     UINT32 headRoom = OvsGetSttTunHdrSize();
155     UINT32 tcpChksumLen;
156     PUINT8 bufferStart;
157     ULONG mss = 0;
158     NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
159
160     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
161
162     /* Verify if inner checksum is verified */
163     BOOLEAN innerChecksumVerified = FALSE;
164     BOOLEAN innerPartialChecksum = FALSE;
165
166     if (layers->isTcp) {
167         mss = OVSGetTcpMSS(curNbl);
168     }
169
170     vportStt = (POVS_STT_VPORT) GetOvsVportPriv(vport);
171     ASSERT(vportStt);
172
173     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
174     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
175                                           TcpIpChecksumNetBufferListInfo);
176     *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
177                                 FALSE /*copy NblInfo*/);
178     if (*newNbl == NULL) {
179         OVS_LOG_ERROR("Unable to copy NBL");
180         return NDIS_STATUS_FAILURE;
181     }
182
183     curNbl = *newNbl;
184     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
185     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
186     /* NB Chain should be split before */
187     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
188     innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
189
190     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
191                                                        LowPagePriority);
192     bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
193
194     if (layers->isIPv4) {
195         IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
196         if (!ip->tot_len) {
197             ip->tot_len = htons(innerFrameLen - sizeof(EthHdr));
198         }
199         if (!ip->check) {
200             ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
201         }
202     }
203
204     if (layers->isTcp) {
205         if (mss) {
206             innerPartialChecksum = TRUE;
207         } else {
208             if (!csumInfo.Transmit.TcpChecksum) {
209                 innerChecksumVerified = TRUE;
210             } else {
211                 innerPartialChecksum = TRUE;
212             }
213         }
214     } else if (layers->isUdp) {
215         if(!csumInfo.Transmit.UdpChecksum) {
216             innerChecksumVerified = TRUE;
217         } else {
218             innerPartialChecksum = TRUE;
219         }
220     }
221
222     status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
223     if (status != NDIS_STATUS_SUCCESS) {
224         ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
225         OVS_LOG_ERROR("Unable to NdisRetreatNetBufferDataStart(headroom)");
226         goto ret_error;
227     }
228
229     /*
230      * Make sure that the headroom for the tunnel header is continguous in
231      * memory.
232      */
233     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
234     ASSERT((int) (MmGetMdlByteCount(curMdl) - NET_BUFFER_CURRENT_MDL_OFFSET(curNb))
235                 >= (int) headRoom);
236
237     buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
238     if (!buf) {
239         ASSERT(!"MmGetSystemAddressForMdlSafe failed");
240         OVS_LOG_ERROR("MmGetSystemAddressForMdlSafe failed");
241         status = NDIS_STATUS_RESOURCES;
242         goto ret_error;
243     }
244
245     buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
246     outerEthHdr = (EthHdr *)buf;
247     outerIpHdr = (IPHdr *) (outerEthHdr + 1);
248     outerTcpHdr = (TCPHdr *) (outerIpHdr + 1);
249     sttHdr = (SttHdr *) (outerTcpHdr + 1);
250
251     /* L2 header */
252     ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
253             (PCHAR)&fwdInfo->srcMacAddr);
254     NdisMoveMemory(outerEthHdr->Destination, fwdInfo->dstMacAddr,
255                     sizeof outerEthHdr->Destination + sizeof outerEthHdr->Source);
256     outerEthHdr->Type = htons(ETH_TYPE_IPV4);
257
258     /* L3 header */
259     outerIpHdr->ihl = sizeof(IPHdr) >> 2;
260     outerIpHdr->version = IPPROTO_IPV4;
261     outerIpHdr->tos = tunKey->tos;
262
263     ipTotalLen = sizeof(IPHdr) + sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
264     outerIpHdr->tot_len = htons(ipTotalLen);
265     ASSERT(ipTotalLen < 65536);
266
267     outerIpHdr->id = (uint16) atomic_add64(&vportStt->ipId, innerFrameLen);
268     outerIpHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
269                            IP_DF_NBO : 0;
270     outerIpHdr->ttl = tunKey->ttl? tunKey->ttl : 64;
271     outerIpHdr->protocol = IPPROTO_TCP;
272     outerIpHdr->check = 0;
273     outerIpHdr->saddr = fwdInfo->srcIpAddr;
274     outerIpHdr->daddr = tunKey->dst;
275
276     /* L4 header */
277     RtlZeroMemory(outerTcpHdr, sizeof *outerTcpHdr);
278     outerTcpHdr->source = htons(tunKey->flow_hash | 32768);
279     outerTcpHdr->dest = htons(vportStt->dstPort);
280     outerTcpHdr->seq = htonl((STT_HDR_LEN + innerFrameLen) <<
281                              STT_SEQ_LEN_SHIFT);
282     outerTcpHdr->ack_seq = htonl(atomic_inc64(&vportStt->ackNo));
283     outerTcpHdr->doff = sizeof(TCPHdr) >> 2;
284     outerTcpHdr->psh = 1;
285     outerTcpHdr->ack = 1;
286     outerTcpHdr->window = (uint16) ~0;
287
288     /* Calculate pseudo header chksum */
289     tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
290     ASSERT(tcpChksumLen < 65535);
291     outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr,(uint32 *) &tunKey->dst,
292                                           IPPROTO_TCP, (uint16) tcpChksumLen);
293     sttHdr->version = 0;
294
295     /* Set STT Header */
296     sttHdr->flags = 0;
297     if (innerPartialChecksum) {
298         sttHdr->flags |= STT_CSUM_PARTIAL;
299         if (layers->isIPv4) {
300             sttHdr->flags |= STT_PROTO_IPV4;
301         }
302         if (layers->isTcp) {
303             sttHdr->flags |= STT_PROTO_TCP;
304         }
305         sttHdr->l4Offset = (UINT8) layers->l4Offset;
306         sttHdr->mss = (UINT16) htons(mss);
307     } else if (innerChecksumVerified) {
308         sttHdr->flags = STT_CSUM_VERIFIED;
309         sttHdr->l4Offset = 0;
310         sttHdr->mss = 0;
311     }
312
313     sttHdr->reserved = 0;
314     sttHdr->vlanTCI = 0;
315     sttHdr->key = tunKey->tunnelId;
316     /* Zero out stt padding */
317     *(uint16 *)(sttHdr + 1) = 0;
318
319     /* Offload IP and TCP checksum */
320     ULONG tcpHeaderOffset = sizeof *outerEthHdr +
321                         outerIpHdr->ihl * 4;
322     csumInfo.Value = 0;
323     csumInfo.Transmit.IpHeaderChecksum = 1;
324     csumInfo.Transmit.TcpChecksum = 1;
325     csumInfo.Transmit.IsIPv4 = 1;
326     csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset;
327     NET_BUFFER_LIST_INFO(curNbl,
328                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
329
330     UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr);
331     if (ipTotalLen > encapMss) {
332         lsoInfo.Value = 0;
333         lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset;
334         lsoInfo.LsoV2Transmit.MSS = encapMss;
335         lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
336         lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
337         NET_BUFFER_LIST_INFO(curNbl,
338                              TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
339     }
340
341     return STATUS_SUCCESS;
342
343 ret_error:
344     OvsCompleteNBL(switchContext, *newNbl, TRUE);
345     *newNbl = NULL;
346     return status;
347 }
348
349 /*
350  *----------------------------------------------------------------------------
351  * OvsValidateTCPChecksum
352  *     Validate TCP checksum
353  *----------------------------------------------------------------------------
354  */
355 static __inline NDIS_STATUS
356 OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
357 {
358     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
359     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
360                                           TcpIpChecksumNetBufferListInfo);
361
362     /* Check if NIC has indicated TCP checksum failure */
363     if (csumInfo.Receive.TcpChecksumFailed) {
364         return NDIS_STATUS_INVALID_PACKET;
365     }
366
367     UINT16 checkSum;
368
369     /* Check if TCP Checksum has been calculated by NIC */
370     if (csumInfo.Receive.TcpChecksumSucceeded) {
371         return NDIS_STATUS_SUCCESS;
372     }
373
374     EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
375                                               NULL, 1, 0);
376
377     if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV4)) {
378         IPHdr *ip = (IPHdr *)((PCHAR)eth + sizeof *eth);
379         UINT32 l4Payload = ntohs(ip->tot_len) - ip->ihl * 4;
380         TCPHdr *tcp = (TCPHdr *)((PCHAR)ip + ip->ihl * 4);
381         checkSum = tcp->check;
382
383         tcp->check = 0;
384         tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
385                                       IPPROTO_TCP, (UINT16)l4Payload);
386         tcp->check = CalculateChecksumNB(curNb, (UINT16)(l4Payload),
387                                          sizeof(EthHdr) + ip->ihl * 4);
388         if (checkSum != tcp->check) {
389             return NDIS_STATUS_INVALID_PACKET;
390         }
391     } else {
392         OVS_LOG_ERROR("IPv6 on STT is not supported");
393         return NDIS_STATUS_INVALID_PACKET;
394     }
395
396     csumInfo.Receive.TcpChecksumSucceeded = 1;
397     NET_BUFFER_LIST_INFO(curNbl,
398                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
399     return NDIS_STATUS_SUCCESS;
400 }
401
402 /*
403  *----------------------------------------------------------------------------
404  * OvsInitSttDefragmentation
405  *     Initialize the components used by the stt lso defragmentation
406  *----------------------------------------------------------------------------
407  */
408 NTSTATUS
409 OvsInitSttDefragmentation()
410 {
411     NTSTATUS status;
412     HANDLE threadHandle = NULL;
413
414     /* Init the sync-lock */
415     NdisAllocateSpinLock(&OvsSttSpinLock);
416
417     /* Init the Hash Buffer */
418     OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
419                                                  * STT_HASH_TABLE_SIZE,
420                                                  OVS_STT_POOL_TAG);
421     if (OvsSttPktFragHash == NULL) {
422         NdisFreeSpinLock(&OvsSttSpinLock);
423         return STATUS_INSUFFICIENT_RESOURCES;
424     }
425
426     for (int i = 0; i < STT_HASH_TABLE_SIZE; i++) {
427         InitializeListHead(&OvsSttPktFragHash[i]);
428     }
429
430     /* Init Defrag Cleanup Thread */
431     KeInitializeEvent(&sttDefragThreadCtx.event, NotificationEvent, FALSE);
432     status = PsCreateSystemThread(&threadHandle, SYNCHRONIZE, NULL, NULL,
433                                   NULL, OvsSttDefragCleaner,
434                                   &sttDefragThreadCtx);
435
436     if (status != STATUS_SUCCESS) {
437         OvsCleanupSttDefragmentation();
438         return status;
439     }
440
441     ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode,
442                               &sttDefragThreadCtx.threadObject, NULL);
443     ZwClose(threadHandle);
444     threadHandle = NULL;
445     return STATUS_SUCCESS;
446 }
447
448 /*
449  *----------------------------------------------------------------------------
450  * OvsCleanupSttDefragmentation
451  *     Cleanup memory and thread that were spawned for STT LSO defragmentation
452  *----------------------------------------------------------------------------
453  */
454 VOID
455 OvsCleanupSttDefragmentation(VOID)
456 {
457     NdisAcquireSpinLock(&OvsSttSpinLock);
458     sttDefragThreadCtx.exit = 1;
459     KeSetEvent(&sttDefragThreadCtx.event, 0, FALSE);
460     NdisReleaseSpinLock(&OvsSttSpinLock);
461
462     KeWaitForSingleObject(sttDefragThreadCtx.threadObject, Executive,
463                           KernelMode, FALSE, NULL);
464     ObDereferenceObject(sttDefragThreadCtx.threadObject);
465
466     if (OvsSttPktFragHash) {
467         OvsFreeMemoryWithTag(OvsSttPktFragHash, OVS_STT_POOL_TAG);
468         OvsSttPktFragHash = NULL;
469     }
470
471     NdisFreeSpinLock(&OvsSttSpinLock);
472 }
473
474 /*
475  *----------------------------------------------------------------------------
476  * OvsSttDefragCleaner
477  *     Runs periodically and cleans up the buffer to remove expired segments
478  *----------------------------------------------------------------------------
479  */
480 VOID
481 OvsSttDefragCleaner(PVOID data)
482 {
483     POVS_STT_THREAD_CTX context = (POVS_STT_THREAD_CTX)data;
484     PLIST_ENTRY link, next;
485     POVS_STT_PKT_ENTRY entry;
486     BOOLEAN success = TRUE;
487
488     while (success) {
489         NdisAcquireSpinLock(&OvsSttSpinLock);
490         if (context->exit) {
491             NdisReleaseSpinLock(&OvsSttSpinLock);
492             break;
493         }
494
495         /* Set the timeout for the thread and cleanup */
496         UINT64 currentTime, threadSleepTimeout;
497         NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
498         threadSleepTimeout = currentTime + STT_CLEANUP_INTERVAL;
499
500         for (int i = 0; i < STT_HASH_TABLE_SIZE; i++) {
501             LIST_FORALL_SAFE(&OvsSttPktFragHash[i], link, next) {
502                 entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
503                 if (entry->timeout < currentTime) {
504                     RemoveEntryList(&entry->link);
505                     OvsFreeMemoryWithTag(entry->packetBuf, OVS_STT_POOL_TAG);
506                     OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
507                 }
508             }
509         }
510
511         NdisReleaseSpinLock(&OvsSttSpinLock);
512         KeWaitForSingleObject(&context->event, Executive, KernelMode,
513                               FALSE, (LARGE_INTEGER *)&threadSleepTimeout);
514     }
515
516     PsTerminateSystemThread(STATUS_SUCCESS);
517 }
518
519 static OVS_STT_PKT_KEY
520 OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr)
521 {
522     OVS_STT_PKT_KEY key;
523     key.sAddr = ipHdr->saddr;
524     key.dAddr = ipHdr->daddr;
525     key.ackSeq = ntohl(tcpHdr->ack_seq);
526     return key;
527 }
528
529 static UINT32
530 OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey)
531 {
532     UINT32 arr[3];
533     arr[0] = pktKey->ackSeq;
534     arr[1] = pktKey->dAddr;
535     arr[2] = pktKey->sAddr;
536     return OvsJhashWords(arr, 3, OVS_HASH_BASIS);
537 }
538
539 static VOID *
540 OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash)
541 {
542     PLIST_ENTRY link;
543     POVS_STT_PKT_ENTRY entry;
544
545     LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) {
546         entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
547         if (entry->ovsPktKey.ackSeq == pktKey->ackSeq &&
548             entry->ovsPktKey.dAddr == pktKey->dAddr &&
549             entry->ovsPktKey.sAddr == pktKey->sAddr) {
550             return entry;
551         }
552     }
553     return NULL;
554 }
555
556 /*
557 *
558 --------------------------------------------------------------------------
559 * OvsSttReassemble --
560 *     Reassemble an LSO packet from multiple STT-Fragments.
561 *
562 --------------------------------------------------------------------------
563 */
564 PNET_BUFFER_LIST
565 OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext,
566                  PNET_BUFFER_LIST curNbl,
567                  IPHdr *ipHdr,
568                  TCPHdr *tcp,
569                  SttHdr *newSttHdr,
570                  UINT16 payloadLen)
571 {
572     UINT32 seq = ntohl(tcp->seq);
573     UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN;
574     UINT32 segOffset = STT_SEGMENT_OFF(seq);
575     UINT32 offset = segOffset == 0 ? 0 : segOffset - STT_HDR_LEN;
576     UINT32 startOffset = 0;
577     OVS_STT_PKT_ENTRY *pktFragEntry;
578     PNET_BUFFER_LIST targetPNbl = NULL;
579     BOOLEAN lastPacket = FALSE;
580     PNET_BUFFER sourceNb;
581     UINT32 fragmentLength = payloadLen;
582     SttHdr stt;
583     SttHdr *sttHdr = NULL;
584     sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
585
586     /* XXX optimize this lock */
587     NdisAcquireSpinLock(&OvsSttSpinLock);
588
589     /* If this is the first fragment, copy the STT header */
590     if (segOffset == 0) {
591         sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0);
592         if (sttHdr == NULL) {
593             OVS_LOG_ERROR("Unable to retrieve STT header");
594             return NULL;
595         }
596         fragmentLength = fragmentLength - STT_HDR_LEN;
597         startOffset = startOffset + STT_HDR_LEN;
598     }
599
600     /* Lookup fragment */
601     OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp);
602     UINT32 hash = OvsSttGetPktHash(&pktKey);
603     pktFragEntry = OvsLookupPktFrag(&pktKey, hash);
604
605     if (pktFragEntry == NULL) {
606         /* Create a new Packet Entry */
607         POVS_STT_PKT_ENTRY entry;
608         entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY),
609                                          OVS_STT_POOL_TAG);
610         RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY));
611
612         /* Update Key, timestamp and recvdLen */
613         NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof (OVS_STT_PKT_KEY));
614
615         entry->recvdLen = fragmentLength;
616
617         UINT64 currentTime;
618         NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
619         entry->timeout = currentTime + STT_ENTRY_TIMEOUT;
620
621         if (segOffset == 0) {
622             entry->sttHdr = *sttHdr;
623         }
624
625         /* Copy the data from Source to new buffer */
626         entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen,
627                                                     OVS_STT_POOL_TAG);
628         if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
629                               entry->packetBuf + offset) == NULL) {
630             OVS_LOG_ERROR("Error when obtaining bytes from Packet");
631             goto handle_error;
632         }
633
634         /* Insert the entry in the Static Buffer */
635         InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK],
636                        &entry->link);
637     } else {
638         /* Add to recieved length to identify if this is the last fragment */
639         pktFragEntry->recvdLen += fragmentLength;
640         lastPacket = (pktFragEntry->recvdLen == innerPacketLen);
641
642         if (segOffset == 0) {
643             pktFragEntry->sttHdr = *sttHdr;
644         }
645
646         /* Copy the fragment data from Source to existing buffer */
647         if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
648                               pktFragEntry->packetBuf + offset) == NULL) {
649             OVS_LOG_ERROR("Error when obtaining bytes from Packet");
650             goto handle_error;
651         }
652     }
653
654 handle_error:
655     if (lastPacket) {
656         /* Retrieve the original STT header */
657         NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof (SttHdr));
658         targetPNbl = OvsAllocateNBLFromBuffer(switchContext, pktFragEntry->packetBuf,
659                                               innerPacketLen);
660
661         /* Delete this entry and free up the memory/ */
662         RemoveEntryList(&pktFragEntry->link);
663         OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG);
664         OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG);
665     }
666
667     NdisReleaseSpinLock(&OvsSttSpinLock);
668     return lastPacket ? targetPNbl : NULL;
669 }
670
671 VOID
672 OvsDecapSetOffloads(PNET_BUFFER_LIST curNbl, SttHdr *sttHdr)
673 {
674     if ((sttHdr->flags & STT_CSUM_VERIFIED)
675         || !(sttHdr->flags & STT_CSUM_PARTIAL)) {
676         return;
677     }
678
679     UINT8 protoType;
680     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
681     csumInfo.Value = 0;
682     csumInfo.Transmit.IpHeaderChecksum = 0;
683     csumInfo.Transmit.TcpHeaderOffset = sttHdr->l4Offset;
684     protoType = sttHdr->flags & STT_PROTO_TYPES;
685     switch (protoType) {
686         case (STT_PROTO_IPV4 | STT_PROTO_TCP):
687             /* TCP/IPv4 */
688             csumInfo.Transmit.IsIPv4 = 1;
689             csumInfo.Transmit.TcpChecksum = 1;
690             break;
691         case STT_PROTO_TCP:
692             /* TCP/IPv6 */
693             csumInfo.Transmit.IsIPv6 = 1;
694             csumInfo.Transmit.TcpChecksum = 1;
695             break;
696         case STT_PROTO_IPV4:
697             /* UDP/IPv4 */
698             csumInfo.Transmit.IsIPv4 = 1;
699             csumInfo.Transmit.UdpChecksum = 1;
700             break;
701         default:
702             /* UDP/IPv6 */
703             csumInfo.Transmit.IsIPv6 = 1;
704             csumInfo.Transmit.UdpChecksum = 1;
705     }
706     NET_BUFFER_LIST_INFO(curNbl,
707                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
708
709     if (sttHdr->mss) {
710         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
711         lsoInfo.Value = 0;
712         lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset;
713         lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU
714                                     - sizeof(IPHdr)
715                                     - sizeof(TCPHdr);
716         lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
717         if (sttHdr->flags & STT_PROTO_IPV4) {
718             lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
719         } else {
720             lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6;
721         }
722         NET_BUFFER_LIST_INFO(curNbl,
723                              TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
724     }
725 }
726
727 /*
728  * --------------------------------------------------------------------------
729  * OvsDecapStt --
730  *     Decapsulates an STT packet.
731  * --------------------------------------------------------------------------
732  */
733 NDIS_STATUS
734 OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
735             PNET_BUFFER_LIST curNbl,
736             OvsIPv4TunnelKey *tunKey,
737             PNET_BUFFER_LIST *newNbl)
738 {
739     NDIS_STATUS status = NDIS_STATUS_FAILURE;
740     PNET_BUFFER curNb, newNb;
741     IPHdr *ipHdr;
742     char *ipBuf[sizeof(IPHdr)];
743     SttHdr stt;
744     SttHdr *sttHdr;
745     char *sttBuf[STT_HDR_LEN];
746     UINT32 advanceCnt, hdrLen;
747     BOOLEAN isLsoPacket = FALSE;
748
749     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
750     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
751
752     /* Validate the TCP Checksum */
753     status = OvsValidateTCPChecksum(curNbl, curNb);
754     if (status != NDIS_STATUS_SUCCESS) {
755         return status;
756     }
757
758     /* Skip Eth header */
759     hdrLen = sizeof(EthHdr);
760     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
761     advanceCnt = hdrLen;
762
763     ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf,
764                                                     1 /*no align*/, 0);
765     ASSERT(ipHdr);
766
767     TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
768
769     /* Skip IP & TCP headers */
770     hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
771     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
772     advanceCnt += hdrLen;
773
774     UINT32 seq = ntohl(tcp->seq);
775     UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT);
776     UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len)
777                         - (ipHdr->ihl * 4)
778                         - (sizeof * tcp);
779
780     /* Check if incoming packet requires reassembly */
781     if (totalLen != payloadLen) {
782         sttHdr = &stt;
783         PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl,
784                                                  ipHdr, tcp, sttHdr,
785                                                  payloadLen);
786         if (pNbl == NULL) {
787             return NDIS_STATUS_SUCCESS;
788         }
789
790         *newNbl = pNbl;
791         isLsoPacket = TRUE;
792     } else {
793         /* STT Header */
794         sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr,
795                                    (PVOID) &sttBuf, 1 /*no align*/, 0);
796         /* Skip stt header, DataOffset points to inner pkt now. */
797         hdrLen = STT_HDR_LEN;
798         NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
799         advanceCnt += hdrLen;
800
801         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0,
802                                     0, FALSE /*copy NBL info*/);
803     }
804
805     if (*newNbl == NULL) {
806         OVS_LOG_ERROR("Unable to allocate a new cloned NBL");
807         return NDIS_STATUS_RESOURCES;
808     }
809
810     status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
811     if (status != NDIS_STATUS_SUCCESS) {
812         OvsCompleteNBL(switchContext, *newNbl, TRUE);
813         return NDIS_STATUS_FAILURE;
814     }
815     newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
816
817     ASSERT(sttHdr);
818
819     /* Initialize the tunnel key */
820     tunKey->dst = ipHdr->daddr;
821     tunKey->src = ipHdr->saddr;
822     tunKey->tunnelId = sttHdr->key;
823     tunKey->flags = OVS_TNL_F_KEY;
824     tunKey->tos = ipHdr->tos;
825     tunKey->ttl = ipHdr->ttl;
826     tunKey->pad = 0;
827
828     /* Set Checksum and LSO offload flags */
829     OvsDecapSetOffloads(*newNbl, sttHdr);
830
831     return NDIS_STATUS_SUCCESS;
832 }