datapath-windows: Refactor software offloads and mss
[cascardo/ovs.git] / datapath-windows / ovsext / User.c
index e27bd76..e97f2b2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 VMware, Inc.
+ * Copyright (c) 2014, 2016 VMware, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
 #include "precomp.h"
 
 #include "Datapath.h"
-#include "Switch.h"
-#include "Vport.h"
+#include "Debug.h"
 #include "Event.h"
-#include "User.h"
-#include "PacketIO.h"
-#include "Checksum.h"
-#include "NetProto.h"
 #include "Flow.h"
+#include "Jhash.h"
+#include "NetProto.h"
+#include "Offload.h"
+#include "PacketIO.h"
+#include "Switch.h"
 #include "TunnelIntf.h"
+#include "User.h"
+#include "Vport.h"
 
 #ifdef OVS_DBG_MOD
 #undef OVS_DBG_MOD
 #endif
 #define OVS_DBG_MOD OVS_DBG_USER
-#include "Debug.h"
-
-OVS_USER_PACKET_QUEUE ovsPacketQueues[OVS_MAX_NUM_PACKET_QUEUES];
 
 POVS_PACKET_QUEUE_ELEM OvsGetNextPacket(POVS_OPEN_INSTANCE instance);
 extern PNDIS_SPIN_LOCK gOvsCtrlLock;
 extern POVS_SWITCH_CONTEXT gOvsSwitchContext;
 OVS_USER_STATS ovsUserStats;
 
+static VOID _MapNlAttrToOvsPktExec(PNL_ATTR *nlAttrs, PNL_ATTR *keyAttrs,
+                                   OvsPacketExecute  *execute);
+extern NL_POLICY nlFlowKeyPolicy[];
+extern UINT32 nlFlowKeyPolicyLen;
 
-NTSTATUS
-OvsUserInit()
+static __inline VOID
+OvsAcquirePidHashLock()
 {
-    UINT32 i;
-    POVS_USER_PACKET_QUEUE queue;
-    for (i = 0; i < OVS_MAX_NUM_PACKET_QUEUES; i++) {
-        queue = &ovsPacketQueues[i];
-        RtlZeroMemory(queue, sizeof (*queue));
-        InitializeListHead(&queue->packetList);
-        NdisAllocateSpinLock(&queue->queueLock);
-    }
-    return STATUS_SUCCESS;
+    NdisAcquireSpinLock(&(gOvsSwitchContext->pidHashLock));
 }
 
-VOID
-OvsUserCleanup()
+static __inline VOID
+OvsReleasePidHashLock()
 {
-    UINT32 i;
-    POVS_USER_PACKET_QUEUE queue;
-    for (i = 0; i < OVS_MAX_NUM_PACKET_QUEUES; i++) {
-        queue = &ovsPacketQueues[i];
-        ASSERT(IsListEmpty(&queue->packetList));
-        ASSERT(queue->instance == NULL);
-        ASSERT(queue->pendingIrp == NULL);
-        NdisFreeSpinLock(&queue->queueLock);
-    }
+    NdisReleaseSpinLock(&(gOvsSwitchContext->pidHashLock));
 }
 
+
 static VOID
 OvsPurgePacketQueue(POVS_USER_PACKET_QUEUE queue,
                     POVS_OPEN_INSTANCE instance)
@@ -98,11 +86,10 @@ OvsPurgePacketQueue(POVS_USER_PACKET_QUEUE queue,
     LIST_FORALL_SAFE(&tmp, link, next) {
         RemoveEntryList(link);
         elem = CONTAINING_RECORD(link, OVS_PACKET_QUEUE_ELEM, link);
-        OvsFreeMemory(elem);
+        OvsFreeMemoryWithTag(elem, OVS_USER_POOL_TAG);
     }
 }
 
-
 VOID
 OvsCleanupPacketQueue(POVS_OPEN_INSTANCE instance)
 {
@@ -112,13 +99,17 @@ OvsCleanupPacketQueue(POVS_OPEN_INSTANCE instance)
     LIST_ENTRY tmp;
     PIRP irp = NULL;
 
+    ASSERT(instance);
     InitializeListHead(&tmp);
     queue = (POVS_USER_PACKET_QUEUE)instance->packetQueue;
     if (queue) {
         PDRIVER_CANCEL cancelRoutine;
         NdisAcquireSpinLock(&queue->queueLock);
+        ASSERT(queue->instance == instance);
+        /* XXX Should not happen */
         if (queue->instance != instance) {
             NdisReleaseSpinLock(&queue->queueLock);
+            NdisFreeSpinLock(&queue->queueLock);
             return;
         }
 
@@ -127,7 +118,6 @@ OvsCleanupPacketQueue(POVS_OPEN_INSTANCE instance)
             queue->numPackets = 0;
         }
         queue->instance = NULL;
-        queue->queueId = OVS_MAX_NUM_PACKET_QUEUES;
         instance->packetQueue = NULL;
         irp = queue->pendingIrp;
         queue->pendingIrp = NULL;
@@ -138,55 +128,67 @@ OvsCleanupPacketQueue(POVS_OPEN_INSTANCE instance)
             }
         }
         NdisReleaseSpinLock(&queue->queueLock);
+        NdisFreeSpinLock(&queue->queueLock);
     }
     LIST_FORALL_SAFE(&tmp, link, next) {
         RemoveEntryList(link);
         elem = CONTAINING_RECORD(link, OVS_PACKET_QUEUE_ELEM, link);
-        OvsFreeMemory(elem);
+        OvsFreeMemoryWithTag(elem, OVS_USER_POOL_TAG);
     }
     if (irp) {
         OvsCompleteIrpRequest(irp, 0, STATUS_SUCCESS);
     }
+    if (queue) {
+        OvsFreeMemoryWithTag(queue, OVS_USER_POOL_TAG);
+    }
+
+    /* Verify if gOvsSwitchContext exists. */
+    if (gOvsSwitchContext) {
+        /* Remove the instance from pidHashArray */
+        OvsAcquirePidHashLock();
+        OvsDelPidInstance(gOvsSwitchContext, instance->pid);
+        OvsReleasePidHashLock();
+    }
 }
 
 NTSTATUS
-OvsSubscribeDpIoctl(PFILE_OBJECT fileObject,
-                    PVOID inputBuffer,
-                    UINT32 inputLength)
+OvsSubscribeDpIoctl(PVOID instanceP,
+                    UINT32 pid,
+                    UINT8 join)
 {
-    POVS_OPEN_INSTANCE instance = (POVS_OPEN_INSTANCE)fileObject->FsContext;
-    UINT32 queueId;
     POVS_USER_PACKET_QUEUE queue;
-    if (inputLength < sizeof (UINT32)) {
-        return STATUS_INVALID_PARAMETER;
-    }
-    queueId = *(UINT32 *)inputBuffer;
-    if (instance->packetQueue && queueId >= OVS_MAX_NUM_PACKET_QUEUES) {
-        /*
-         * unsubscribe
-         */
+    POVS_OPEN_INSTANCE instance = (POVS_OPEN_INSTANCE)instanceP;
+
+    if (instance->packetQueue && !join) {
+        /* unsubscribe */
         OvsCleanupPacketQueue(instance);
-    } else if (instance->packetQueue == NULL &&
-               queueId < OVS_MAX_NUM_PACKET_QUEUES) {
-        queue = &ovsPacketQueues[queueId];
-        NdisAcquireSpinLock(&queue->queueLock);
-        if (ovsPacketQueues[queueId].instance) {
-             if (ovsPacketQueues[queueId].instance != instance) {
-                 NdisReleaseSpinLock(&queue->queueLock);
-                 return STATUS_INSUFFICIENT_RESOURCES;
-             } else {
-                 NdisReleaseSpinLock(&queue->queueLock);
-                 return STATUS_SUCCESS;
-             }
+    } else if (instance->packetQueue == NULL && join) {
+        queue = (POVS_USER_PACKET_QUEUE) OvsAllocateMemoryWithTag(
+            sizeof *queue, OVS_USER_POOL_TAG);
+        if (queue == NULL) {
+            return STATUS_NO_MEMORY;
         }
-        queue->queueId = queueId;
+        InitializeListHead(&(instance->pidLink));
+        instance->packetQueue = queue;
+        RtlZeroMemory(queue, sizeof (*queue));
+        NdisAllocateSpinLock(&queue->queueLock);
+        NdisAcquireSpinLock(&queue->queueLock);
+        InitializeListHead(&queue->packetList);
+        queue->pid = pid;
         queue->instance = instance;
         instance->packetQueue = queue;
-        ASSERT(IsListEmpty(&queue->packetList));
         NdisReleaseSpinLock(&queue->queueLock);
+
+        OvsAcquirePidHashLock();
+        /* Insert the instance to pidHashArray */
+        OvsAddPidInstance(gOvsSwitchContext, pid, instance);
+        OvsReleasePidHashLock();
+
     } else {
+        /* user mode should call only once for subscribe */
         return STATUS_INVALID_PARAMETER;
     }
+
     return STATUS_SUCCESS;
 }
 
@@ -223,13 +225,12 @@ OvsReadDpIoctl(PFILE_OBJECT fileObject,
         if ((elem->hdrInfo.tcpCsumNeeded || elem->hdrInfo.udpCsumNeeded) &&
             len == elem->packet.totalLen) {
             UINT16 sum, *ptr;
-            UINT16 size = (UINT16)(elem->packet.userDataLen +
-                                   elem->hdrInfo.l4Offset +
-                                   (UINT16)sizeof (OVS_PACKET_INFO));
-            RtlCopyMemory(outputBuffer, &elem->packet, size);
-            ASSERT(len - size >=  elem->hdrInfo.l4PayLoad);
+            UINT16 size = (UINT16)(elem->packet.payload - elem->packet.data +
+                                  elem->hdrInfo.l4Offset);
+            RtlCopyMemory(outputBuffer, &elem->packet.data, size);
+            ASSERT(len - size >= elem->hdrInfo.l4PayLoad);
             sum = CopyAndCalculateChecksum((UINT8 *)outputBuffer + size,
-                                           (UINT8 *)&elem->packet + size,
+                                           (UINT8 *)&elem->packet.data + size,
                                            elem->hdrInfo.l4PayLoad, 0);
             ptr =(UINT16 *)((UINT8 *)outputBuffer + size +
                             (elem->hdrInfo.tcpCsumNeeded ?
@@ -237,11 +238,11 @@ OvsReadDpIoctl(PFILE_OBJECT fileObject,
             *ptr = sum;
             ovsUserStats.l4Csum++;
         } else {
-            RtlCopyMemory(outputBuffer, &elem->packet, len);
+            RtlCopyMemory(outputBuffer, &elem->packet.data, len);
         }
 
         *replyLen = len;
-        OvsFreeMemory(elem);
+        OvsFreeMemoryWithTag(elem, OVS_USER_POOL_TAG);
     }
     return STATUS_SUCCESS;
 }
@@ -257,57 +258,137 @@ OvsAllocateForwardingContextForNBL(POVS_SWITCH_CONTEXT switchContext,
 }
 
 /*
- * --------------------------------------------------------------------------
- * This function allocates all the stuff necessary for creating an NBL from the
- * input buffer of specified length, namely, a nonpaged data buffer of size
- * length, an MDL from it, and a NB and NBL from it. It does not allocate an NBL
- * context yet. It also copies data from the specified buffer to the NBL.
- * --------------------------------------------------------------------------
+ *----------------------------------------------------------------------------
+ *  OvsNlExecuteCmdHandler --
+ *    Handler for OVS_PACKET_CMD_EXECUTE command.
+ *----------------------------------------------------------------------------
  */
-PNET_BUFFER_LIST
-OvsAllocateNBLForUserBuffer(POVS_SWITCH_CONTEXT switchContext,
-                            PVOID userBuffer,
-                            ULONG length)
+NTSTATUS
+OvsNlExecuteCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
+                       UINT32 *replyLen)
 {
-    UINT8 *data = NULL;
-    PNET_BUFFER_LIST nbl = NULL;
-    PNET_BUFFER nb;
-    PMDL mdl;
-
-    if (length > OVS_DEFAULT_DATA_SIZE) {
-        nbl = OvsAllocateVariableSizeNBL(switchContext, length,
-                                         OVS_DEFAULT_HEADROOM_SIZE);
+    NTSTATUS status = STATUS_SUCCESS;
+    POVS_MESSAGE msgIn = (POVS_MESSAGE)usrParamsCtx->inputBuffer;
+    POVS_MESSAGE msgOut = (POVS_MESSAGE)usrParamsCtx->outputBuffer;
+    PNL_MSG_HDR nlMsgHdr = &(msgIn->nlMsg);
+    PGENL_MSG_HDR genlMsgHdr = &(msgIn->genlMsg);
+    POVS_HDR ovsHdr = &(msgIn->ovsHdr);
+
+    PNL_ATTR nlAttrs[__OVS_PACKET_ATTR_MAX];
+    PNL_ATTR keyAttrs[__OVS_KEY_ATTR_MAX] = {NULL};
+
+    UINT32 attrOffset = NLMSG_HDRLEN + GENL_HDRLEN + OVS_HDRLEN;
+    UINT32 keyAttrOffset = 0;
+    OvsPacketExecute execute;
+    NL_ERROR nlError = NL_ERROR_SUCCESS;
+    NL_BUFFER nlBuf;
 
-    } else {
-        nbl = OvsAllocateFixSizeNBL(switchContext, length,
-                                    OVS_DEFAULT_HEADROOM_SIZE);
+    static const NL_POLICY nlPktExecPolicy[] = {
+        [OVS_PACKET_ATTR_PACKET] = {.type = NL_A_UNSPEC, .optional = FALSE},
+        [OVS_PACKET_ATTR_KEY] = {.type = NL_A_UNSPEC, .optional = FALSE},
+        [OVS_PACKET_ATTR_ACTIONS] = {.type = NL_A_UNSPEC, .optional = FALSE},
+        [OVS_PACKET_ATTR_USERDATA] = {.type = NL_A_UNSPEC, .optional = TRUE},
+        [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = {.type = NL_A_UNSPEC,
+                                            .optional = TRUE}
+    };
+
+    RtlZeroMemory(&execute, sizeof(OvsPacketExecute));
+
+    /* Get all the top level Flow attributes */
+    if ((NlAttrParse(nlMsgHdr, attrOffset, NlMsgAttrsLen(nlMsgHdr),
+                     nlPktExecPolicy, ARRAY_SIZE(nlPktExecPolicy),
+                     nlAttrs, ARRAY_SIZE(nlAttrs)))
+                     != TRUE) {
+        OVS_LOG_ERROR("Attr Parsing failed for msg: %p",
+                       nlMsgHdr);
+        status = STATUS_UNSUCCESSFUL;
+        goto done;
     }
-    if (nbl == NULL) {
-        return NULL;
+
+    keyAttrOffset = (UINT32)((PCHAR)nlAttrs[OVS_PACKET_ATTR_KEY] -
+                    (PCHAR)nlMsgHdr);
+
+    /* Get flow keys attributes */
+    if ((NlAttrParseNested(nlMsgHdr, keyAttrOffset,
+                           NlAttrLen(nlAttrs[OVS_PACKET_ATTR_KEY]),
+                           nlFlowKeyPolicy, nlFlowKeyPolicyLen,
+                           keyAttrs, ARRAY_SIZE(keyAttrs))) != TRUE) {
+        OVS_LOG_ERROR("Key Attr Parsing failed for msg: %p", nlMsgHdr);
+        status = STATUS_UNSUCCESSFUL;
+        goto done;
     }
 
-    nb = NET_BUFFER_LIST_FIRST_NB(nbl);
-    mdl = NET_BUFFER_CURRENT_MDL(nb);
-    data = (PUINT8)MmGetSystemAddressForMdlSafe(mdl, LowPagePriority) +
-                    NET_BUFFER_CURRENT_MDL_OFFSET(nb);
-    if (!data) {
-        OvsCompleteNBL(switchContext, nbl, TRUE);
-        return NULL;
+    execute.dpNo = ovsHdr->dp_ifindex;
+
+    _MapNlAttrToOvsPktExec(nlAttrs, keyAttrs, &execute);
+
+    status = OvsExecuteDpIoctl(&execute);
+
+    /* Default reply that we want to send */
+    if (status == STATUS_SUCCESS) {
+        BOOLEAN ok;
+
+        NlBufInit(&nlBuf, usrParamsCtx->outputBuffer,
+                  usrParamsCtx->outputLength);
+
+        /* Prepare nl Msg headers */
+        ok = NlFillOvsMsg(&nlBuf, nlMsgHdr->nlmsgType, 0,
+                 nlMsgHdr->nlmsgSeq, nlMsgHdr->nlmsgPid,
+                 genlMsgHdr->cmd, OVS_PACKET_VERSION,
+                 ovsHdr->dp_ifindex);
+
+        if (ok) {
+            *replyLen = msgOut->nlMsg.nlmsgLen;
+        } else {
+            status = STATUS_INVALID_BUFFER_SIZE;
+        }
+    } else {
+        /* Map NTSTATUS to NL_ERROR */
+        nlError = NlMapStatusToNlErr(status);
+
+        /* As of now there are no transactional errors in the implementation.
+         * Once we have them then we need to map status to correct
+         * nlError value, so that below mentioned code gets hit. */
+        if ((nlError != NL_ERROR_SUCCESS) &&
+            (usrParamsCtx->outputBuffer)) {
+
+            POVS_MESSAGE_ERROR msgError = (POVS_MESSAGE_ERROR)
+                                           usrParamsCtx->outputBuffer;
+            NlBuildErrorMsg(msgIn, msgError, nlError);
+            *replyLen = msgError->nlMsg.nlmsgLen;
+            status = STATUS_SUCCESS;
+            goto done;
+        }
     }
 
-    NdisMoveMemory(data, userBuffer, length);
+done:
+    return status;
+}
+
+/*
+ *----------------------------------------------------------------------------
+ *  _MapNlAttrToOvsPktExec --
+ *    Maps input Netlink attributes to OvsPacketExecute.
+ *----------------------------------------------------------------------------
+ */
+static VOID
+_MapNlAttrToOvsPktExec(PNL_ATTR *nlAttrs, PNL_ATTR *keyAttrs,
+                       OvsPacketExecute *execute)
+{
+    execute->packetBuf = NlAttrGet(nlAttrs[OVS_PACKET_ATTR_PACKET]);
+    execute->packetLen = NlAttrGetSize(nlAttrs[OVS_PACKET_ATTR_PACKET]);
+
+    execute->actions = NlAttrGet(nlAttrs[OVS_PACKET_ATTR_ACTIONS]);
+    execute->actionsLen = NlAttrGetSize(nlAttrs[OVS_PACKET_ATTR_ACTIONS]);
 
-    return nbl;
+    execute->inPort = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_IN_PORT]);
 }
 
 NTSTATUS
-OvsExecuteDpIoctl(PVOID inputBuffer,
-                  UINT32 inputLength,
-                  UINT32 outputLength)
+OvsExecuteDpIoctl(OvsPacketExecute *execute)
 {
     NTSTATUS                    status = STATUS_SUCCESS;
     NTSTATUS                    ndisStatus;
-    OvsPacketExecute            *execute;
     LOCK_STATE_EX               lockState;
     PNET_BUFFER_LIST pNbl;
     PNL_ATTR actions;
@@ -316,39 +397,24 @@ OvsExecuteDpIoctl(PVOID inputBuffer,
     OVS_PACKET_HDR_INFO layers;
     POVS_VPORT_ENTRY vport;
 
-    if (inputLength < sizeof(*execute) || outputLength != 0) {
-        return STATUS_INFO_LENGTH_MISMATCH;
-    }
-
-    NdisAcquireSpinLock(gOvsCtrlLock);
-    if (gOvsSwitchContext == NULL) {
-        status = STATUS_INVALID_PARAMETER;
-        goto unlock;
-    }
-
-    execute = (struct OvsPacketExecute *) inputBuffer;
-
     if (execute->packetLen == 0) {
         status = STATUS_INVALID_PARAMETER;
-        goto unlock;
+        goto exit;
     }
 
-    if (inputLength != sizeof (*execute) +
-                       execute->actionsLen + execute->packetLen) {
-        status = STATUS_INFO_LENGTH_MISMATCH;
-        goto unlock;
-    }
-    actions = (PNL_ATTR)((PCHAR)&execute->actions + execute->packetLen);
+    actions = execute->actions;
+
+    ASSERT(actions);
 
     /*
      * Allocate the NBL, copy the data from the userspace buffer. Allocate
      * also, the forwarding context for the packet.
      */
-    pNbl = OvsAllocateNBLForUserBuffer(gOvsSwitchContext, &execute->packetBuf,
-                                       execute->packetLen);
+    pNbl = OvsAllocateNBLFromBuffer(gOvsSwitchContext, execute->packetBuf,
+                                    execute->packetLen);
     if (pNbl == NULL) {
         status = STATUS_NO_MEMORY;
-        goto unlock;
+        goto exit;
     }
 
     fwdDetail = NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(pNbl);
@@ -363,14 +429,12 @@ OvsExecuteDpIoctl(PVOID inputBuffer,
     // XXX: Figure out if any of the other members of fwdDetail need to be set.
 
     ndisStatus = OvsExtractFlow(pNbl, fwdDetail->SourcePortId, &key, &layers,
-                              NULL);
+                                NULL);
     if (ndisStatus == NDIS_STATUS_SUCCESS) {
-        ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL);
-        NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState,
-                              NDIS_RWL_AT_DISPATCH_LEVEL);
+        NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, 0);
         ndisStatus = OvsActionsExecute(gOvsSwitchContext, NULL, pNbl,
                                        vport ? vport->portNo :
-                                               OVS_DEFAULT_PORT_NO,
+                                               OVS_DPPORT_NUMBER_INVALID,
                                        NDIS_SEND_FLAGS_SWITCH_DESTINATION_GROUP,
                                        &key, NULL, &layers, actions,
                                        execute->actionsLen);
@@ -378,14 +442,17 @@ OvsExecuteDpIoctl(PVOID inputBuffer,
         NdisReleaseRWLock(gOvsSwitchContext->dispatchLock, &lockState);
     }
     if (ndisStatus != NDIS_STATUS_SUCCESS) {
-        status = STATUS_UNSUCCESSFUL;
+        if (ndisStatus == NDIS_STATUS_NOT_SUPPORTED) {
+            status = STATUS_NOT_SUPPORTED;
+        } else {
+            status = STATUS_UNSUCCESSFUL;
+        }
     }
 
     if (pNbl) {
         OvsCompleteNBL(gOvsSwitchContext, pNbl, TRUE);
     }
-unlock:
-    NdisReleaseSpinLock(gOvsCtrlLock);
+exit:
     return status;
 }
 
@@ -505,68 +572,141 @@ OvsGetNextPacket(POVS_OPEN_INSTANCE instance)
     return CONTAINING_RECORD(link, OVS_PACKET_QUEUE_ELEM, link);
 }
 
-
+/*
+ * ---------------------------------------------------------------------------
+ * Given a pid, returns the corresponding USER_PACKET_QUEUE.
+ * ---------------------------------------------------------------------------
+ */
 POVS_USER_PACKET_QUEUE
-OvsGetQueue(UINT32 queueId)
+OvsGetQueue(UINT32 pid)
 {
-    POVS_USER_PACKET_QUEUE queue;
-    if (queueId >= OVS_MAX_NUM_PACKET_QUEUES) {
-        return NULL;
+    POVS_OPEN_INSTANCE instance;
+    POVS_USER_PACKET_QUEUE ret = NULL;
+
+    instance = OvsGetPidInstance(gOvsSwitchContext, pid);
+
+    if (instance) {
+        ret = instance->packetQueue;
     }
-    queue = &ovsPacketQueues[queueId];
-    return queue->instance != NULL ? queue : NULL;
+
+    return ret;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Given a pid, returns the corresponding instance.
+ * pidHashLock must be acquired before calling this API.
+ * ---------------------------------------------------------------------------
+ */
+POVS_OPEN_INSTANCE
+OvsGetPidInstance(POVS_SWITCH_CONTEXT switchContext, UINT32 pid)
+{
+    POVS_OPEN_INSTANCE instance;
+    PLIST_ENTRY head, link;
+    UINT32 hash = OvsJhashBytes((const VOID *)&pid, sizeof(pid),
+                                OVS_HASH_BASIS);
+    head = &(switchContext->pidHashArray[hash & OVS_PID_MASK]);
+    LIST_FORALL(head, link) {
+        instance = CONTAINING_RECORD(link, OVS_OPEN_INSTANCE, pidLink);
+        if (instance->pid == pid) {
+            return instance;
+        }
+    }
+    return NULL;
 }
 
+/*
+ * ---------------------------------------------------------------------------
+ * Given a pid and an instance, adds the instance to pidHashArray.
+ * pidHashLock must be acquired before calling this API.
+ * ---------------------------------------------------------------------------
+ */
 VOID
-OvsQueuePackets(UINT32 queueId,
-                PLIST_ENTRY packetList,
+OvsAddPidInstance(POVS_SWITCH_CONTEXT switchContext, UINT32 pid,
+                  POVS_OPEN_INSTANCE instance)
+{
+    PLIST_ENTRY head;
+    UINT32 hash = OvsJhashBytes((const VOID *)&pid, sizeof(pid),
+                                OVS_HASH_BASIS);
+    head = &(switchContext->pidHashArray[hash & OVS_PID_MASK]);
+    InsertHeadList(head, &(instance->pidLink));
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Given a pid, removes the corresponding instance from pidHashArray.
+ * pidHashLock must be acquired before calling this API.
+ * ---------------------------------------------------------------------------
+ */
+VOID
+OvsDelPidInstance(POVS_SWITCH_CONTEXT switchContext, UINT32 pid)
+{
+    POVS_OPEN_INSTANCE instance = OvsGetPidInstance(switchContext, pid);
+
+    if (instance) {
+        RemoveEntryList(&(instance->pidLink));
+    }
+}
+
+VOID
+OvsQueuePackets(PLIST_ENTRY packetList,
                 UINT32 numElems)
 {
-    POVS_USER_PACKET_QUEUE queue = OvsGetQueue(queueId);
+    POVS_USER_PACKET_QUEUE upcallQueue = NULL;
     POVS_PACKET_QUEUE_ELEM elem;
-    PIRP irp = NULL;
     PLIST_ENTRY  link;
     UINT32 num = 0;
+    LIST_ENTRY dropPackets;
 
-    OVS_LOG_LOUD("Enter: queueId %u, numELems: %u",
-                  queueId, numElems);
-    if (queue == NULL) {
-        goto cleanup;
-    }
+    OVS_LOG_LOUD("Enter: numELems: %u", numElems);
 
-    NdisAcquireSpinLock(&queue->queueLock);
-    if (queue->instance == NULL) {
-        NdisReleaseSpinLock(&queue->queueLock);
-        goto cleanup;
-    } else {
-        OvsAppendList(&queue->packetList, packetList);
-        queue->numPackets += numElems;
-    }
-    if (queue->pendingIrp) {
-        PDRIVER_CANCEL cancelRoutine;
-        irp = queue->pendingIrp;
-        queue->pendingIrp = NULL;
-        cancelRoutine = IoSetCancelRoutine(irp, NULL);
-        if (cancelRoutine == NULL) {
-            irp = NULL;
-        }
-    }
-    NdisReleaseSpinLock(&queue->queueLock);
-    if (irp) {
-        OvsCompleteIrpRequest(irp, 0, STATUS_SUCCESS);
-    }
+    InitializeListHead(&dropPackets);
 
-cleanup:
     while (!IsListEmpty(packetList)) {
         link = RemoveHeadList(packetList);
         elem = CONTAINING_RECORD(link, OVS_PACKET_QUEUE_ELEM, link);
-        OvsFreeMemory(elem);
+
+        ASSERT(elem);
+
+        OvsAcquirePidHashLock();
+
+        upcallQueue = OvsGetQueue(elem->upcallPid);
+        if (!upcallQueue) {
+            /* No upcall queue found, drop this packet. */
+            InsertTailList(&dropPackets, &elem->link);
+        } else {
+            NdisAcquireSpinLock(&upcallQueue->queueLock);
+
+            if (upcallQueue->instance == NULL) {
+                InsertTailList(&dropPackets, &elem->link);
+            } else {
+                InsertTailList(&upcallQueue->packetList, &elem->link);
+                upcallQueue->numPackets++;
+                if (upcallQueue->pendingIrp) {
+                    PIRP irp = upcallQueue->pendingIrp;
+                    PDRIVER_CANCEL cancelRoutine;
+                    upcallQueue->pendingIrp = NULL;
+                    cancelRoutine = IoSetCancelRoutine(irp, NULL);
+                    if (cancelRoutine != NULL) {
+                        OvsCompleteIrpRequest(irp, 0, STATUS_SUCCESS);
+                    }
+                }
+            }
+            NdisReleaseSpinLock(&upcallQueue->queueLock);
+        }
+        OvsReleasePidHashLock();
+    }
+
+    while (!IsListEmpty(&dropPackets)) {
+        link = RemoveHeadList(&dropPackets);
+        elem = CONTAINING_RECORD(link, OVS_PACKET_QUEUE_ELEM, link);
+        OvsFreeMemoryWithTag(elem, OVS_USER_POOL_TAG);
         num++;
     }
+
     OVS_LOG_LOUD("Exit: drop %u packets", num);
 }
 
-
 /*
  *----------------------------------------------------------------------------
  * OvsCreateAndAddPackets --
@@ -581,7 +721,7 @@ NTSTATUS
 OvsCreateAndAddPackets(PVOID userData,
                        UINT32 userDataLen,
                        UINT32 cmd,
-                       UINT32 inPort,
+                       POVS_VPORT_ENTRY vport,
                        OvsFlowKey *key,
                        PNET_BUFFER_LIST nbl,
                        BOOLEAN isRecv,
@@ -618,7 +758,7 @@ OvsCreateAndAddPackets(PVOID userData,
     nb = NET_BUFFER_LIST_FIRST_NB(nbl);
     while (nb) {
         elem = OvsCreateQueueNlPacket(userData, userDataLen,
-                                    cmd, inPort, key, nbl, nb,
+                                    cmd, vport, key, nbl, nb,
                                     isRecv, hdrInfo);
         if (elem) {
             InsertTailList(list, &elem->link);
@@ -647,7 +787,7 @@ OvsGetUpcallMsgSize(PVOID userData,
         size += NlAttrTotalSize(userDataLen);
     }
     /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
-    /* Is it included in the the flwo key attr XXX */
+    /* Is it included in the flow key attr XXX */
     if (tunnelKey) {
         size += NlAttrTotalSize(OvsTunKeyAttrSize());
     }
@@ -763,6 +903,8 @@ OvsGetPid(POVS_VPORT_ENTRY vport, PNET_BUFFER nb, UINT32 *pid)
 {
     UNREFERENCED_PARAMETER(nb);
 
+    ASSERT(vport);
+
     /* XXX select a pid from an array of pids using a flow based hash */
     *pid = vport->upcallPid;
     return STATUS_SUCCESS;
@@ -795,7 +937,7 @@ POVS_PACKET_QUEUE_ELEM
 OvsCreateQueueNlPacket(PVOID userData,
                        UINT32 userDataLen,
                        UINT32 cmd,
-                       UINT32 inPort,
+                       POVS_VPORT_ENTRY vport,
                        OvsFlowKey *key,
                        PNET_BUFFER_LIST nbl,
                        PNET_BUFFER nb,
@@ -812,18 +954,16 @@ OvsCreateQueueNlPacket(PVOID userData,
     UINT32 pid;
     UINT32 nlMsgSize;
     NL_BUFFER nlBuf;
-
-    /* XXX pass vport in the stack rather than portNo */
-    POVS_VPORT_ENTRY vport =
-        OvsFindVportByPortNo(gOvsSwitchContext, inPort);
+    PNL_MSG_HDR nlMsg;
 
     if (vport == NULL){
-        /* Should never happen as dispatch lock is held */
-        ASSERT(vport);
+        /* No vport is not fatal. */
         return NULL;
     }
 
-    if (!OvsGetPid(vport, nb, &pid)) {
+    OvsGetPid(vport, nb, &pid);
+
+    if (!pid) {
         /*
          * There is no userspace queue created yet, so there is no point for
          * creating a new packet to be queued.
@@ -854,18 +994,20 @@ OvsCreateQueueNlPacket(PVOID userData,
                                     dataLen + extraLen);
 
     allocLen = sizeof (OVS_PACKET_QUEUE_ELEM) + nlMsgSize;
-    elem = (POVS_PACKET_QUEUE_ELEM)OvsAllocateMemory(allocLen);
+    elem = (POVS_PACKET_QUEUE_ELEM)OvsAllocateMemoryWithTag(allocLen,
+                                                            OVS_USER_POOL_TAG);
     if (elem == NULL) {
         ovsUserStats.dropDuetoResource++;
         return NULL;
     }
     elem->hdrInfo.value = hdrInfo->value;
+    elem->upcallPid = pid;
     elem->packet.totalLen = nlMsgSize;
     /* XXX remove queueid */
     elem->packet.queue = 0;
     /* XXX  no need as the length is already in the NL attrib */
     elem->packet.userDataLen = userDataLen;
-    elem->packet.inPort = inPort;
+    elem->packet.inPort = vport->portNo;
     elem->packet.cmd = cmd;
     if (cmd == (UINT32)OVS_PACKET_CMD_MISS) {
         ovsUserStats.miss++;
@@ -885,9 +1027,9 @@ OvsCreateQueueNlPacket(PVOID userData,
      * Since we are pre allocating memory for the NL buffer
      * the attribute settings should not fail
      */
-    if (NlFillOvsMsg(&nlBuf, OVS_WIN_NL_PACKET_FAMILY_ID, 0,
+    if (!NlFillOvsMsg(&nlBuf, OVS_WIN_NL_PACKET_FAMILY_ID, 0,
                       0, pid, (UINT8)cmd, OVS_PACKET_VERSION,
-                      gOvsSwitchContext->dpNo) != STATUS_SUCCESS) {
+                      gOvsSwitchContext->dpNo)) {
         goto fail;
     }
 
@@ -948,8 +1090,122 @@ OvsCreateQueueNlPacket(PVOID userData,
         elem->hdrInfo.l4Offset += VLAN_TAG_SIZE;
         ovsUserStats.vlanInsert++;
     }
+
+    nlMsg = (PNL_MSG_HDR)NlBufAt(&nlBuf, 0, 0);
+    nlMsg->nlmsgLen = NlBufSize(&nlBuf);
+    /* 'totalLen' should be size of valid data. */
+    elem->packet.totalLen = nlMsg->nlmsgLen;
+
     return elem;
 fail:
-    OvsFreeMemory(elem);
+    OvsFreeMemoryWithTag(elem, OVS_USER_POOL_TAG);
     return NULL;
 }
+
+/*
+ * --------------------------------------------------------------------------
+ *  Handler for the subscription for a packet queue
+ * --------------------------------------------------------------------------
+ */
+NTSTATUS
+OvsSubscribePacketCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
+                             UINT32 *replyLen)
+{
+    NDIS_STATUS status;
+    BOOLEAN rc;
+    UINT8 join;
+    UINT32 pid;
+    const NL_POLICY policy[] =  {
+        [OVS_NL_ATTR_PACKET_PID] = {.type = NL_A_U32 },
+        [OVS_NL_ATTR_PACKET_SUBSCRIBE] = {.type = NL_A_U8 }
+        };
+    PNL_ATTR attrs[ARRAY_SIZE(policy)];
+
+    UNREFERENCED_PARAMETER(replyLen);
+
+    POVS_OPEN_INSTANCE instance =
+        (POVS_OPEN_INSTANCE)usrParamsCtx->ovsInstance;
+    POVS_MESSAGE msgIn = (POVS_MESSAGE)usrParamsCtx->inputBuffer;
+
+    rc = NlAttrParse(&msgIn->nlMsg, sizeof (*msgIn),
+         NlMsgAttrsLen((PNL_MSG_HDR)msgIn), policy, ARRAY_SIZE(policy),
+                       attrs, ARRAY_SIZE(attrs));
+    if (!rc) {
+        status = STATUS_INVALID_PARAMETER;
+        goto done;
+    }
+
+    join = NlAttrGetU8(attrs[OVS_NL_ATTR_PACKET_SUBSCRIBE]);
+    pid = NlAttrGetU32(attrs[OVS_NL_ATTR_PACKET_PID]);
+
+    /* The socket subscribed with must be the same socket we receive on. */
+    ASSERT(pid == instance->pid);
+
+    status = OvsSubscribeDpIoctl(instance, pid, join);
+
+    /*
+     * XXX Need to add this instance to a global data structure
+     * which hold all packet based instances. The data structure (hash)
+     * should be searched through the pid field of the instance for
+     * placing the missed packet into the correct queue
+     */
+done:
+    return status;
+}
+
+/*
+ * --------------------------------------------------------------------------
+ * Handler for queueing an IRP used for missed packet notification. The IRP is
+ * completed when a packet is received and mismatched. STATUS_PENDING is
+ * returned on success. User mode keeps a pending IRP at all times.
+ * --------------------------------------------------------------------------
+ */
+NTSTATUS
+OvsPendPacketCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
+                       UINT32 *replyLen)
+{
+    UNREFERENCED_PARAMETER(replyLen);
+
+    POVS_OPEN_INSTANCE instance =
+        (POVS_OPEN_INSTANCE)usrParamsCtx->ovsInstance;
+
+    /*
+     * XXX access to packet queue must be through acquiring a lock as user mode
+     * could unsubscribe and the instance will be freed.
+     */
+    return OvsWaitDpIoctl(usrParamsCtx->irp, instance->fileObject);
+}
+
+/*
+ * --------------------------------------------------------------------------
+ * Handler for reading missed packets from the driver event queue. This
+ * handler is executed when user mode issues a socket receive on a socket
+ * --------------------------------------------------------------------------
+ */
+NTSTATUS
+OvsReadPacketCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
+                       UINT32 *replyLen)
+{
+#ifdef DBG
+    POVS_MESSAGE msgOut = (POVS_MESSAGE)usrParamsCtx->outputBuffer;
+#endif
+    POVS_OPEN_INSTANCE instance =
+        (POVS_OPEN_INSTANCE)usrParamsCtx->ovsInstance;
+    NTSTATUS status;
+
+    ASSERT(usrParamsCtx->devOp == OVS_READ_DEV_OP);
+
+    /* Should never read events with a dump socket */
+    ASSERT(instance->dumpState.ovsMsg == NULL);
+
+    /* Must have a packet queue */
+    ASSERT(instance->packetQueue != NULL);
+
+    /* Output buffer has been validated while validating read dev op. */
+    ASSERT(msgOut != NULL && usrParamsCtx->outputLength >= sizeof *msgOut);
+
+    /* Read a packet from the instance queue */
+    status = OvsReadDpIoctl(instance->fileObject, usrParamsCtx->outputBuffer,
+                            usrParamsCtx->outputLength, replyLen);
+    return status;
+}