dc2910f6ed4fc73059e3fdf969aba96870bfeddc
[cascardo/ovs.git] / datapath-windows / ovsext / Stt.c
1 /*
2  * Copyright (c) 2015 VMware, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at:
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "precomp.h"
18 #include "NetProto.h"
19 #include "Switch.h"
20 #include "Vport.h"
21 #include "Flow.h"
22 #include "Stt.h"
23 #include "IpHelper.h"
24 #include "Checksum.h"
25 #include "User.h"
26 #include "PacketIO.h"
27 #include "Flow.h"
28 #include "PacketParser.h"
29 #include "Atomic.h"
30 #include "Util.h"
31
32 #ifdef OVS_DBG_MOD
33 #undef OVS_DBG_MOD
34 #endif
35 #define OVS_DBG_MOD OVS_DBG_STT
36 #include "Debug.h"
37 #include "Jhash.h"
38
39 KSTART_ROUTINE OvsSttDefragCleaner;
40 static PLIST_ENTRY OvsSttPktFragHash;
41 static NDIS_SPIN_LOCK OvsSttSpinLock;
42 static OVS_STT_THREAD_CTX sttDefragThreadCtx;
43
44 static NDIS_STATUS
45 OvsDoEncapStt(POVS_VPORT_ENTRY vport, PNET_BUFFER_LIST curNbl,
46               const OvsIPv4TunnelKey *tunKey,
47               const POVS_FWD_INFO fwdInfo,
48               POVS_PACKET_HDR_INFO layers,
49               POVS_SWITCH_CONTEXT switchContext,
50               PNET_BUFFER_LIST *newNbl);
51
52 /*
53  * --------------------------------------------------------------------------
54  * OvsInitSttTunnel --
55  *    Initialize STT tunnel module.
56  * --------------------------------------------------------------------------
57  */
58 NTSTATUS
59 OvsInitSttTunnel(POVS_VPORT_ENTRY vport,
60                  UINT16 tcpDestPort)
61 {
62     POVS_STT_VPORT sttPort;
63
64     sttPort = (POVS_STT_VPORT) OvsAllocateMemoryWithTag(sizeof(*sttPort),
65                                                         OVS_STT_POOL_TAG);
66     if (!sttPort) {
67         OVS_LOG_ERROR("Insufficient memory, can't allocate STT_VPORT");
68         return STATUS_INSUFFICIENT_RESOURCES;
69     }
70
71     RtlZeroMemory(sttPort, sizeof(*sttPort));
72     sttPort->dstPort = tcpDestPort;
73     vport->priv = (PVOID) sttPort;
74     return STATUS_SUCCESS;
75 }
76
77 /*
78  * --------------------------------------------------------------------------
79  * OvsCleanupSttTunnel --
80  *    Cleanup STT Tunnel module.
81  * --------------------------------------------------------------------------
82  */
83 void
84 OvsCleanupSttTunnel(POVS_VPORT_ENTRY vport)
85 {
86     if (vport->ovsType != OVS_VPORT_TYPE_STT ||
87         vport->priv == NULL) {
88         return;
89     }
90
91     OvsFreeMemoryWithTag(vport->priv, OVS_STT_POOL_TAG);
92     vport->priv = NULL;
93 }
94
95 /*
96  * --------------------------------------------------------------------------
97  * OvsEncapStt --
98  *     Encapsulates a packet with an STT header.
99  * --------------------------------------------------------------------------
100  */
101 NDIS_STATUS
102 OvsEncapStt(POVS_VPORT_ENTRY vport,
103             PNET_BUFFER_LIST curNbl,
104             OvsIPv4TunnelKey *tunKey,
105             POVS_SWITCH_CONTEXT switchContext,
106             POVS_PACKET_HDR_INFO layers,
107             PNET_BUFFER_LIST *newNbl)
108 {
109     OVS_FWD_INFO fwdInfo;
110     NDIS_STATUS status;
111
112     UNREFERENCED_PARAMETER(switchContext);
113     status = OvsLookupIPFwdInfo(tunKey->dst, &fwdInfo);
114     if (status != STATUS_SUCCESS) {
115         OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
116         /*
117          * XXX This case where the ARP table is not populated is
118          * currently not handled
119          */
120         return NDIS_STATUS_FAILURE;
121     }
122
123     status = OvsDoEncapStt(vport, curNbl, tunKey, &fwdInfo, layers,
124                            switchContext, newNbl);
125     return status;
126 }
127
128 /*
129  * --------------------------------------------------------------------------
130  * OvsDoEncapStt --
131  *    Internal utility function which actually does the STT encap.
132  * --------------------------------------------------------------------------
133  */
134 NDIS_STATUS
135 OvsDoEncapStt(POVS_VPORT_ENTRY vport,
136               PNET_BUFFER_LIST curNbl,
137               const OvsIPv4TunnelKey *tunKey,
138               const POVS_FWD_INFO fwdInfo,
139               POVS_PACKET_HDR_INFO layers,
140               POVS_SWITCH_CONTEXT switchContext,
141               PNET_BUFFER_LIST *newNbl)
142 {
143     NDIS_STATUS status = NDIS_STATUS_SUCCESS;
144     PMDL curMdl = NULL;
145     PNET_BUFFER curNb;
146     PUINT8 buf = NULL;
147     EthHdr *outerEthHdr;
148     IPHdr *outerIpHdr;
149     TCPHdr *outerTcpHdr;
150     SttHdr *sttHdr;
151     UINT32 innerFrameLen, ipTotalLen;
152     POVS_STT_VPORT vportStt;
153     UINT32 headRoom = OvsGetSttTunHdrSize();
154     UINT32 tcpChksumLen;
155     PUINT8 bufferStart;
156     ULONG mss = 0;
157     NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
158
159     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
160
161     /* Verify if inner checksum is verified */
162     BOOLEAN innerChecksumVerified = FALSE;
163     BOOLEAN innerPartialChecksum = FALSE;
164
165     if (layers->isTcp) {
166         lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
167                 TcpLargeSendNetBufferListInfo);
168
169         switch (lsoInfo.Transmit.Type) {
170             case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE:
171                 mss = lsoInfo.LsoV1Transmit.MSS;
172                 break;
173             case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE:
174                 mss = lsoInfo.LsoV2Transmit.MSS;
175                 break;
176             default:
177                 OVS_LOG_ERROR("Unknown LSO transmit type:%d",
178                               lsoInfo.Transmit.Type);
179                 return NDIS_STATUS_FAILURE;
180         }
181     }
182
183     vportStt = (POVS_STT_VPORT) GetOvsVportPriv(vport);
184     ASSERT(vportStt);
185
186     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
187     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
188                                           TcpIpChecksumNetBufferListInfo);
189     *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
190                                 FALSE /*copy NblInfo*/);
191     if (*newNbl == NULL) {
192         OVS_LOG_ERROR("Unable to copy NBL");
193         return NDIS_STATUS_FAILURE;
194     }
195
196     curNbl = *newNbl;
197     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
198     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
199     /* NB Chain should be split before */
200     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
201     innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
202
203     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
204                                                        LowPagePriority);
205     bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
206
207     if (layers->isIPv4) {
208         IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
209         if (!ip->tot_len) {
210             ip->tot_len = htons(innerFrameLen - sizeof(EthHdr));
211         }
212         if (!ip->check) {
213             ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
214         }
215     }
216
217     if (layers->isTcp) {
218         if (mss) {
219             innerPartialChecksum = TRUE;
220         } else {
221             if (!csumInfo.Transmit.TcpChecksum) {
222                 innerChecksumVerified = TRUE;
223             } else {
224                 innerPartialChecksum = TRUE;
225             }
226         }
227     } else if (layers->isUdp) {
228         if(!csumInfo.Transmit.UdpChecksum) {
229             innerChecksumVerified = TRUE;
230         } else {
231             innerPartialChecksum = TRUE;
232         }
233     }
234
235     status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
236     if (status != NDIS_STATUS_SUCCESS) {
237         ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
238         OVS_LOG_ERROR("Unable to NdisRetreatNetBufferDataStart(headroom)");
239         goto ret_error;
240     }
241
242     /*
243      * Make sure that the headroom for the tunnel header is continguous in
244      * memory.
245      */
246     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
247     ASSERT((int) (MmGetMdlByteCount(curMdl) - NET_BUFFER_CURRENT_MDL_OFFSET(curNb))
248                 >= (int) headRoom);
249
250     buf = (PUINT8) MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
251     if (!buf) {
252         ASSERT(!"MmGetSystemAddressForMdlSafe failed");
253         OVS_LOG_ERROR("MmGetSystemAddressForMdlSafe failed");
254         status = NDIS_STATUS_RESOURCES;
255         goto ret_error;
256     }
257
258     buf += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
259     outerEthHdr = (EthHdr *)buf;
260     outerIpHdr = (IPHdr *) (outerEthHdr + 1);
261     outerTcpHdr = (TCPHdr *) (outerIpHdr + 1);
262     sttHdr = (SttHdr *) (outerTcpHdr + 1);
263
264     /* L2 header */
265     ASSERT(((PCHAR)&fwdInfo->dstMacAddr + sizeof fwdInfo->dstMacAddr) ==
266             (PCHAR)&fwdInfo->srcMacAddr);
267     NdisMoveMemory(outerEthHdr->Destination, fwdInfo->dstMacAddr,
268                     sizeof outerEthHdr->Destination + sizeof outerEthHdr->Source);
269     outerEthHdr->Type = htons(ETH_TYPE_IPV4);
270
271     /* L3 header */
272     outerIpHdr->ihl = sizeof(IPHdr) >> 2;
273     outerIpHdr->version = IPPROTO_IPV4;
274     outerIpHdr->tos = tunKey->tos;
275
276     ipTotalLen = sizeof(IPHdr) + sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
277     outerIpHdr->tot_len = htons(ipTotalLen);
278     ASSERT(ipTotalLen < 65536);
279
280     outerIpHdr->id = (uint16) atomic_add64(&vportStt->ipId, innerFrameLen);
281     outerIpHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
282                            IP_DF_NBO : 0;
283     outerIpHdr->ttl = tunKey->ttl? tunKey->ttl : 64;
284     outerIpHdr->protocol = IPPROTO_TCP;
285     outerIpHdr->check = 0;
286     outerIpHdr->saddr = fwdInfo->srcIpAddr;
287     outerIpHdr->daddr = tunKey->dst;
288
289     /* L4 header */
290     RtlZeroMemory(outerTcpHdr, sizeof *outerTcpHdr);
291     outerTcpHdr->source = htons(tunKey->flow_hash | 32768);
292     outerTcpHdr->dest = htons(vportStt->dstPort);
293     outerTcpHdr->seq = htonl((STT_HDR_LEN + innerFrameLen) <<
294                              STT_SEQ_LEN_SHIFT);
295     outerTcpHdr->ack_seq = htonl(atomic_inc64(&vportStt->ackNo));
296     outerTcpHdr->doff = sizeof(TCPHdr) >> 2;
297     outerTcpHdr->psh = 1;
298     outerTcpHdr->ack = 1;
299     outerTcpHdr->window = (uint16) ~0;
300
301     /* Calculate pseudo header chksum */
302     tcpChksumLen = sizeof(TCPHdr) + STT_HDR_LEN + innerFrameLen;
303     ASSERT(tcpChksumLen < 65535);
304     outerTcpHdr->check = IPPseudoChecksum(&fwdInfo->srcIpAddr,(uint32 *) &tunKey->dst,
305                                           IPPROTO_TCP, (uint16) tcpChksumLen);
306     sttHdr->version = 0;
307
308     /* Set STT Header */
309     sttHdr->flags = 0;
310     if (innerPartialChecksum) {
311         sttHdr->flags |= STT_CSUM_PARTIAL;
312         if (layers->isIPv4) {
313             sttHdr->flags |= STT_PROTO_IPV4;
314         }
315         if (layers->isTcp) {
316             sttHdr->flags |= STT_PROTO_TCP;
317         }
318         sttHdr->l4Offset = (UINT8) layers->l4Offset;
319         sttHdr->mss = (UINT16) htons(mss);
320     } else if (innerChecksumVerified) {
321         sttHdr->flags = STT_CSUM_VERIFIED;
322         sttHdr->l4Offset = 0;
323         sttHdr->mss = 0;
324     }
325
326     sttHdr->reserved = 0;
327     sttHdr->vlanTCI = 0;
328     sttHdr->key = tunKey->tunnelId;
329     /* Zero out stt padding */
330     *(uint16 *)(sttHdr + 1) = 0;
331
332     /* Offload IP and TCP checksum */
333     ULONG tcpHeaderOffset = sizeof *outerEthHdr +
334                         outerIpHdr->ihl * 4;
335     csumInfo.Value = 0;
336     csumInfo.Transmit.IpHeaderChecksum = 1;
337     csumInfo.Transmit.TcpChecksum = 1;
338     csumInfo.Transmit.IsIPv4 = 1;
339     csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset;
340     NET_BUFFER_LIST_INFO(curNbl,
341                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
342
343     UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr);
344     if (ipTotalLen > encapMss) {
345         lsoInfo.Value = 0;
346         lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset;
347         lsoInfo.LsoV2Transmit.MSS = encapMss;
348         lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
349         lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
350         NET_BUFFER_LIST_INFO(curNbl,
351                              TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
352     }
353
354     return STATUS_SUCCESS;
355
356 ret_error:
357     OvsCompleteNBL(switchContext, *newNbl, TRUE);
358     *newNbl = NULL;
359     return status;
360 }
361
362 /*
363  *----------------------------------------------------------------------------
364  * OvsValidateTCPChecksum
365  *     Validate TCP checksum
366  *----------------------------------------------------------------------------
367  */
368 static __inline NDIS_STATUS
369 OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
370 {
371     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
372     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
373                                           TcpIpChecksumNetBufferListInfo);
374
375     /* Check if NIC has indicated TCP checksum failure */
376     if (csumInfo.Receive.TcpChecksumFailed) {
377         return NDIS_STATUS_INVALID_PACKET;
378     }
379
380     UINT16 checkSum;
381
382     /* Check if TCP Checksum has been calculated by NIC */
383     if (csumInfo.Receive.TcpChecksumSucceeded) {
384         return NDIS_STATUS_SUCCESS;
385     }
386
387     EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
388                                               NULL, 1, 0);
389
390     if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV4)) {
391         IPHdr *ip = (IPHdr *)((PCHAR)eth + sizeof *eth);
392         UINT32 l4Payload = ntohs(ip->tot_len) - ip->ihl * 4;
393         TCPHdr *tcp = (TCPHdr *)((PCHAR)ip + ip->ihl * 4);
394         checkSum = tcp->check;
395
396         tcp->check = 0;
397         tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
398                                       IPPROTO_TCP, (UINT16)l4Payload);
399         tcp->check = CalculateChecksumNB(curNb, (UINT16)(l4Payload),
400                                          sizeof(EthHdr) + ip->ihl * 4);
401         if (checkSum != tcp->check) {
402             return NDIS_STATUS_INVALID_PACKET;
403         }
404     } else {
405         OVS_LOG_ERROR("IPv6 on STT is not supported");
406         return NDIS_STATUS_INVALID_PACKET;
407     }
408
409     csumInfo.Receive.TcpChecksumSucceeded = 1;
410     NET_BUFFER_LIST_INFO(curNbl,
411                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
412     return NDIS_STATUS_SUCCESS;
413 }
414
415 /*
416  *----------------------------------------------------------------------------
417  * OvsInitSttDefragmentation
418  *     Initialize the components used by the stt lso defragmentation
419  *----------------------------------------------------------------------------
420  */
421 NTSTATUS
422 OvsInitSttDefragmentation()
423 {
424     NTSTATUS status;
425     HANDLE threadHandle = NULL;
426
427     /* Init the sync-lock */
428     NdisAllocateSpinLock(&OvsSttSpinLock);
429
430     /* Init the Hash Buffer */
431     OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
432                                                  * STT_HASH_TABLE_SIZE,
433                                                  OVS_STT_POOL_TAG);
434     if (OvsSttPktFragHash == NULL) {
435         NdisFreeSpinLock(&OvsSttSpinLock);
436         return STATUS_INSUFFICIENT_RESOURCES;
437     }
438
439     for (int i = 0; i < STT_HASH_TABLE_SIZE; i++) {
440         InitializeListHead(&OvsSttPktFragHash[i]);
441     }
442
443     /* Init Defrag Cleanup Thread */
444     KeInitializeEvent(&sttDefragThreadCtx.event, NotificationEvent, FALSE);
445     status = PsCreateSystemThread(&threadHandle, SYNCHRONIZE, NULL, NULL,
446                                   NULL, OvsSttDefragCleaner,
447                                   &sttDefragThreadCtx);
448
449     if (status != STATUS_SUCCESS) {
450         OvsCleanupSttDefragmentation();
451         return status;
452     }
453
454     ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode,
455                               &sttDefragThreadCtx.threadObject, NULL);
456     ZwClose(threadHandle);
457     threadHandle = NULL;
458     return STATUS_SUCCESS;
459 }
460
461 /*
462  *----------------------------------------------------------------------------
463  * OvsCleanupSttDefragmentation
464  *     Cleanup memory and thread that were spawned for STT LSO defragmentation
465  *----------------------------------------------------------------------------
466  */
467 VOID
468 OvsCleanupSttDefragmentation(VOID)
469 {
470     NdisAcquireSpinLock(&OvsSttSpinLock);
471     sttDefragThreadCtx.exit = 1;
472     KeSetEvent(&sttDefragThreadCtx.event, 0, FALSE);
473     NdisReleaseSpinLock(&OvsSttSpinLock);
474
475     KeWaitForSingleObject(sttDefragThreadCtx.threadObject, Executive,
476                           KernelMode, FALSE, NULL);
477     ObDereferenceObject(sttDefragThreadCtx.threadObject);
478
479     if (OvsSttPktFragHash) {
480         OvsFreeMemoryWithTag(OvsSttPktFragHash, OVS_STT_POOL_TAG);
481         OvsSttPktFragHash = NULL;
482     }
483
484     NdisFreeSpinLock(&OvsSttSpinLock);
485 }
486
487 /*
488  *----------------------------------------------------------------------------
489  * OvsSttDefragCleaner
490  *     Runs periodically and cleans up the buffer to remove expired segments
491  *----------------------------------------------------------------------------
492  */
493 VOID
494 OvsSttDefragCleaner(PVOID data)
495 {
496     POVS_STT_THREAD_CTX context = (POVS_STT_THREAD_CTX)data;
497     PLIST_ENTRY link, next;
498     POVS_STT_PKT_ENTRY entry;
499     BOOLEAN success = TRUE;
500
501     while (success) {
502         NdisAcquireSpinLock(&OvsSttSpinLock);
503         if (context->exit) {
504             NdisReleaseSpinLock(&OvsSttSpinLock);
505             break;
506         }
507
508         /* Set the timeout for the thread and cleanup */
509         UINT64 currentTime, threadSleepTimeout;
510         NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
511         threadSleepTimeout = currentTime + STT_CLEANUP_INTERVAL;
512
513         for (int i = 0; i < STT_HASH_TABLE_SIZE; i++) {
514             LIST_FORALL_SAFE(&OvsSttPktFragHash[i], link, next) {
515                 entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
516                 if (entry->timeout < currentTime) {
517                     RemoveEntryList(&entry->link);
518                     OvsFreeMemoryWithTag(entry->packetBuf, OVS_STT_POOL_TAG);
519                     OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
520                 }
521             }
522         }
523
524         NdisReleaseSpinLock(&OvsSttSpinLock);
525         KeWaitForSingleObject(&context->event, Executive, KernelMode,
526                               FALSE, (LARGE_INTEGER *)&threadSleepTimeout);
527     }
528
529     PsTerminateSystemThread(STATUS_SUCCESS);
530 }
531
532 static OVS_STT_PKT_KEY
533 OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr)
534 {
535     OVS_STT_PKT_KEY key;
536     key.sAddr = ipHdr->saddr;
537     key.dAddr = ipHdr->daddr;
538     key.ackSeq = ntohl(tcpHdr->ack_seq);
539     return key;
540 }
541
542 static UINT32
543 OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey)
544 {
545     UINT32 arr[3];
546     arr[0] = pktKey->ackSeq;
547     arr[1] = pktKey->dAddr;
548     arr[2] = pktKey->sAddr;
549     return OvsJhashWords(arr, 3, OVS_HASH_BASIS);
550 }
551
552 static VOID *
553 OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash)
554 {
555     PLIST_ENTRY link;
556     POVS_STT_PKT_ENTRY entry;
557
558     LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) {
559         entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
560         if (entry->ovsPktKey.ackSeq == pktKey->ackSeq &&
561             entry->ovsPktKey.dAddr == pktKey->dAddr &&
562             entry->ovsPktKey.sAddr == pktKey->sAddr) {
563             return entry;
564         }
565     }
566     return NULL;
567 }
568
569 /*
570 *
571 --------------------------------------------------------------------------
572 * OvsSttReassemble --
573 *     Reassemble an LSO packet from multiple STT-Fragments.
574 *
575 --------------------------------------------------------------------------
576 */
577 PNET_BUFFER_LIST
578 OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext,
579                  PNET_BUFFER_LIST curNbl,
580                  IPHdr *ipHdr,
581                  TCPHdr *tcp,
582                  SttHdr *newSttHdr,
583                  UINT16 payloadLen)
584 {
585     UINT32 seq = ntohl(tcp->seq);
586     UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN;
587     UINT32 segOffset = STT_SEGMENT_OFF(seq);
588     UINT32 offset = segOffset == 0 ? 0 : segOffset - STT_HDR_LEN;
589     UINT32 startOffset = 0;
590     OVS_STT_PKT_ENTRY *pktFragEntry;
591     PNET_BUFFER_LIST targetPNbl = NULL;
592     BOOLEAN lastPacket = FALSE;
593     PNET_BUFFER sourceNb;
594     UINT32 fragmentLength = payloadLen;
595     SttHdr stt;
596     SttHdr *sttHdr = NULL;
597     sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
598
599     /* XXX optimize this lock */
600     NdisAcquireSpinLock(&OvsSttSpinLock);
601
602     /* If this is the first fragment, copy the STT header */
603     if (segOffset == 0) {
604         sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0);
605         if (sttHdr == NULL) {
606             OVS_LOG_ERROR("Unable to retrieve STT header");
607             return NULL;
608         }
609         fragmentLength = fragmentLength - STT_HDR_LEN;
610         startOffset = startOffset + STT_HDR_LEN;
611     }
612
613     /* Lookup fragment */
614     OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp);
615     UINT32 hash = OvsSttGetPktHash(&pktKey);
616     pktFragEntry = OvsLookupPktFrag(&pktKey, hash);
617
618     if (pktFragEntry == NULL) {
619         /* Create a new Packet Entry */
620         POVS_STT_PKT_ENTRY entry;
621         entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY),
622                                          OVS_STT_POOL_TAG);
623         RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY));
624
625         /* Update Key, timestamp and recvdLen */
626         NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof (OVS_STT_PKT_KEY));
627
628         entry->recvdLen = fragmentLength;
629
630         UINT64 currentTime;
631         NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
632         entry->timeout = currentTime + STT_ENTRY_TIMEOUT;
633
634         if (segOffset == 0) {
635             entry->sttHdr = *sttHdr;
636         }
637
638         /* Copy the data from Source to new buffer */
639         entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen,
640                                                     OVS_STT_POOL_TAG);
641         if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
642                               entry->packetBuf + offset) == NULL) {
643             OVS_LOG_ERROR("Error when obtaining bytes from Packet");
644             goto handle_error;
645         }
646
647         /* Insert the entry in the Static Buffer */
648         InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK],
649                        &entry->link);
650     } else {
651         /* Add to recieved length to identify if this is the last fragment */
652         pktFragEntry->recvdLen += fragmentLength;
653         lastPacket = (pktFragEntry->recvdLen == innerPacketLen);
654
655         if (segOffset == 0) {
656             pktFragEntry->sttHdr = *sttHdr;
657         }
658
659         /* Copy the fragment data from Source to existing buffer */
660         if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
661                               pktFragEntry->packetBuf + offset) == NULL) {
662             OVS_LOG_ERROR("Error when obtaining bytes from Packet");
663             goto handle_error;
664         }
665     }
666
667 handle_error:
668     if (lastPacket) {
669         /* Retrieve the original STT header */
670         NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof (SttHdr));
671         targetPNbl = OvsAllocateNBLFromBuffer(switchContext, pktFragEntry->packetBuf,
672                                               innerPacketLen);
673
674         /* Delete this entry and free up the memory/ */
675         RemoveEntryList(&pktFragEntry->link);
676         OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG);
677         OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG);
678     }
679
680     NdisReleaseSpinLock(&OvsSttSpinLock);
681     return lastPacket ? targetPNbl : NULL;
682 }
683
684 VOID
685 OvsDecapSetOffloads(PNET_BUFFER_LIST curNbl, SttHdr *sttHdr)
686 {
687     if ((sttHdr->flags & STT_CSUM_VERIFIED)
688         || !(sttHdr->flags & STT_CSUM_PARTIAL)) {
689         return;
690     }
691
692     UINT8 protoType;
693     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
694     csumInfo.Value = 0;
695     csumInfo.Transmit.IpHeaderChecksum = 0;
696     csumInfo.Transmit.TcpHeaderOffset = sttHdr->l4Offset;
697     protoType = sttHdr->flags & STT_PROTO_TYPES;
698     switch (protoType) {
699         case (STT_PROTO_IPV4 | STT_PROTO_TCP):
700             /* TCP/IPv4 */
701                 csumInfo.Transmit.IsIPv4 = 1;
702                 csumInfo.Transmit.TcpChecksum = 1;
703                 break;
704         case STT_PROTO_TCP:
705                 /* TCP/IPv6 */
706                 csumInfo.Transmit.IsIPv6 = 1;
707                 csumInfo.Transmit.TcpChecksum = 1;
708                 break;
709         case STT_PROTO_IPV4:
710                 /* UDP/IPv4 */
711                 csumInfo.Transmit.IsIPv4 = 1;
712                 csumInfo.Transmit.UdpChecksum = 1;
713                 break;
714         default:
715                 /* UDP/IPv6 */
716                 csumInfo.Transmit.IsIPv6 = 1;
717                 csumInfo.Transmit.UdpChecksum = 1;
718     }
719     NET_BUFFER_LIST_INFO(curNbl,
720                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
721
722     if (sttHdr->mss) {
723         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
724         lsoInfo.Value = 0;
725         lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset;
726         lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU
727                                     - sizeof(IPHdr)
728                                     - sizeof(TCPHdr);
729         lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
730         if (sttHdr->flags & STT_PROTO_IPV4) {
731             lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
732         } else {
733             lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6;
734         }
735         NET_BUFFER_LIST_INFO(curNbl,
736                              TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
737     }
738 }
739
740 /*
741  * --------------------------------------------------------------------------
742  * OvsDecapStt --
743  *     Decapsulates an STT packet.
744  * --------------------------------------------------------------------------
745  */
746 NDIS_STATUS
747 OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
748             PNET_BUFFER_LIST curNbl,
749             OvsIPv4TunnelKey *tunKey,
750             PNET_BUFFER_LIST *newNbl)
751 {
752     NDIS_STATUS status = NDIS_STATUS_FAILURE;
753     PNET_BUFFER curNb, newNb;
754     IPHdr *ipHdr;
755     char *ipBuf[sizeof(IPHdr)];
756     SttHdr stt;
757     SttHdr *sttHdr;
758     char *sttBuf[STT_HDR_LEN];
759     UINT32 advanceCnt, hdrLen;
760     BOOLEAN isLsoPacket = FALSE;
761
762     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
763     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
764
765     /* Validate the TCP Checksum */
766     status = OvsValidateTCPChecksum(curNbl, curNb);
767     if (status != NDIS_STATUS_SUCCESS) {
768         return status;
769     }
770
771     /* Skip Eth header */
772     hdrLen = sizeof(EthHdr);
773     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
774     advanceCnt = hdrLen;
775
776     ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf,
777                                                     1 /*no align*/, 0);
778     ASSERT(ipHdr);
779
780     TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
781
782     /* Skip IP & TCP headers */
783     hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
784     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
785     advanceCnt += hdrLen;
786
787     UINT32 seq = ntohl(tcp->seq);
788     UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT);
789     UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len)
790                         - (ipHdr->ihl * 4)
791                         - (sizeof * tcp);
792
793     /* Check if incoming packet requires reassembly */
794     if (totalLen != payloadLen) {
795         sttHdr = &stt;
796         PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl,
797                                                  ipHdr, tcp, sttHdr,
798                                                  payloadLen);
799         if (pNbl == NULL) {
800             return NDIS_STATUS_SUCCESS;
801         }
802
803         *newNbl = pNbl;
804         isLsoPacket = TRUE;
805     } else {
806         /* STT Header */
807         sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr,
808                                    (PVOID) &sttBuf, 1 /*no align*/, 0);
809         /* Skip stt header, DataOffset points to inner pkt now. */
810         hdrLen = STT_HDR_LEN;
811         NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
812         advanceCnt += hdrLen;
813
814         *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0,
815                                     0, FALSE /*copy NBL info*/);
816     }
817
818     if (*newNbl == NULL) {
819         OVS_LOG_ERROR("Unable to allocate a new cloned NBL");
820         return NDIS_STATUS_RESOURCES;
821     }
822
823     status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
824     if (status != NDIS_STATUS_SUCCESS) {
825         OvsCompleteNBL(switchContext, *newNbl, TRUE);
826         return NDIS_STATUS_FAILURE;
827     }
828     newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
829
830     ASSERT(sttHdr);
831
832     /* Initialize the tunnel key */
833     tunKey->dst = ipHdr->daddr;
834     tunKey->src = ipHdr->saddr;
835     tunKey->tunnelId = sttHdr->key;
836     tunKey->flags = OVS_TNL_F_KEY;
837     tunKey->tos = ipHdr->tos;
838     tunKey->ttl = ipHdr->ttl;
839     tunKey->pad = 0;
840
841     /* Set Checksum and LSO offload flags */
842     OvsDecapSetOffloads(*newNbl, sttHdr);
843
844     return NDIS_STATUS_SUCCESS;
845 }