瀏覽代碼

xen_disk: add persistent grant support to xen_disk backend

This protocol extension reuses the same set of grant pages for all
transactions between the front/back drivers, avoiding expensive tlb
flushes, grant table lock contention and switches between userspace
and kernel space. The full description of the protocol can be found in
the public blkif.h header.

http://xenbits.xen.org/gitweb/?p=xen.git;a=blob_plain;f=xen/include/public/io/blkif.h

Speed improvement with 15 guests performing I/O is ~450%.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Roger Pau Monne 12 年之前
父節點
當前提交
9e496d7458
共有 1 個文件被更改,包括 154 次插入17 次删除
  1. 154 17
      hw/xen_disk.c

+ 154 - 17
hw/xen_disk.c

@@ -51,6 +51,13 @@ static int max_requests = 32;
 #define BLOCK_SIZE  512
 #define BLOCK_SIZE  512
 #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
 #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
 
 
+struct PersistentGrant {
+    void *page;
+    struct XenBlkDev *blkdev;
+};
+
+typedef struct PersistentGrant PersistentGrant;
+
 struct ioreq {
 struct ioreq {
     blkif_request_t     req;
     blkif_request_t     req;
     int16_t             status;
     int16_t             status;
@@ -68,6 +75,7 @@ struct ioreq {
     int                 prot;
     int                 prot;
     void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     void                *pages;
     void                *pages;
+    int                 num_unmap;
 
 
     /* aio status */
     /* aio status */
     int                 aio_inflight;
     int                 aio_inflight;
@@ -104,6 +112,12 @@ struct XenBlkDev {
     int                 requests_inflight;
     int                 requests_inflight;
     int                 requests_finished;
     int                 requests_finished;
 
 
+    /* Persistent grants extension */
+    gboolean            feature_persistent;
+    GTree               *persistent_gnts;
+    unsigned int        persistent_gnt_count;
+    unsigned int        max_grants;
+
     /* qemu block driver */
     /* qemu block driver */
     DriveInfo           *dinfo;
     DriveInfo           *dinfo;
     BlockDriverState    *bs;
     BlockDriverState    *bs;
@@ -137,6 +151,29 @@ static void ioreq_reset(struct ioreq *ioreq)
     qemu_iovec_reset(&ioreq->v);
     qemu_iovec_reset(&ioreq->v);
 }
 }
 
 
+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    uint ua = GPOINTER_TO_UINT(a);
+    uint ub = GPOINTER_TO_UINT(b);
+    return (ua > ub) - (ua < ub);
+}
+
+static void destroy_grant(gpointer pgnt)
+{
+    PersistentGrant *grant = pgnt;
+    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
+
+    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
+        xen_be_printf(&grant->blkdev->xendev, 0,
+                      "xc_gnttab_munmap failed: %s\n",
+                      strerror(errno));
+    }
+    grant->blkdev->persistent_gnt_count--;
+    xen_be_printf(&grant->blkdev->xendev, 3,
+                  "unmapped grant %p\n", grant->page);
+    g_free(grant);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
 {
     struct ioreq *ioreq = NULL;
     struct ioreq *ioreq = NULL;
@@ -265,21 +302,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
     int i;
     int i;
 
 
-    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
+    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
         return;
         return;
     }
     }
     if (batch_maps) {
     if (batch_maps) {
         if (!ioreq->pages) {
         if (!ioreq->pages) {
             return;
             return;
         }
         }
-        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
+        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
             xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
             xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                           strerror(errno));
                           strerror(errno));
         }
         }
-        ioreq->blkdev->cnt_map -= ioreq->v.niov;
+        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
         ioreq->pages = NULL;
         ioreq->pages = NULL;
     } else {
     } else {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        for (i = 0; i < ioreq->num_unmap; i++) {
             if (!ioreq->page[i]) {
             if (!ioreq->page[i]) {
                 continue;
                 continue;
             }
             }
@@ -297,41 +334,120 @@ static void ioreq_unmap(struct ioreq *ioreq)
 static int ioreq_map(struct ioreq *ioreq)
 static int ioreq_map(struct ioreq *ioreq)
 {
 {
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
-    int i;
+    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int i, j, new_maps = 0;
+    PersistentGrant *grant;
+    /* domids and refs variables will contain the information necessary
+     * to map the grants that are needed to fulfill this request.
+     *
+     * After mapping the needed grants, the page array will contain the
+     * memory address of each granted page in the order specified in ioreq
+     * (disregarding if it's a persistent grant or not).
+     */
 
 
     if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
     if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
         return 0;
         return 0;
     }
     }
-    if (batch_maps) {
+    if (ioreq->blkdev->feature_persistent) {
+        for (i = 0; i < ioreq->v.niov; i++) {
+            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
+                                    GUINT_TO_POINTER(ioreq->refs[i]));
+
+            if (grant != NULL) {
+                page[i] = grant->page;
+                xen_be_printf(&ioreq->blkdev->xendev, 3,
+                              "using persistent-grant %" PRIu32 "\n",
+                              ioreq->refs[i]);
+            } else {
+                    /* Add the grant to the list of grants that
+                     * should be mapped
+                     */
+                    domids[new_maps] = ioreq->domids[i];
+                    refs[new_maps] = ioreq->refs[i];
+                    page[i] = NULL;
+                    new_maps++;
+            }
+        }
+        /* Set the protection to RW, since grants may be reused later
+         * with a different protection than the one needed for this request
+         */
+        ioreq->prot = PROT_WRITE | PROT_READ;
+    } else {
+        /* All grants in the request should be mapped */
+        memcpy(refs, ioreq->refs, sizeof(refs));
+        memcpy(domids, ioreq->domids, sizeof(domids));
+        memset(page, 0, sizeof(page));
+        new_maps = ioreq->v.niov;
+    }
+
+    if (batch_maps && new_maps) {
         ioreq->pages = xc_gnttab_map_grant_refs
         ioreq->pages = xc_gnttab_map_grant_refs
-            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
+            (gnt, new_maps, domids, refs, ioreq->prot);
         if (ioreq->pages == NULL) {
         if (ioreq->pages == NULL) {
             xen_be_printf(&ioreq->blkdev->xendev, 0,
             xen_be_printf(&ioreq->blkdev->xendev, 0,
                           "can't map %d grant refs (%s, %d maps)\n",
                           "can't map %d grant refs (%s, %d maps)\n",
-                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
+                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
             return -1;
             return -1;
         }
         }
-        for (i = 0; i < ioreq->v.niov; i++) {
-            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
-                (uintptr_t)ioreq->v.iov[i].iov_base;
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL) {
+                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
+            }
         }
         }
-        ioreq->blkdev->cnt_map += ioreq->v.niov;
-    } else  {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->blkdev->cnt_map += new_maps;
+    } else if (new_maps)  {
+        for (i = 0; i < new_maps; i++) {
             ioreq->page[i] = xc_gnttab_map_grant_ref
             ioreq->page[i] = xc_gnttab_map_grant_ref
-                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
+                (gnt, domids[i], refs[i], ioreq->prot);
             if (ioreq->page[i] == NULL) {
             if (ioreq->page[i] == NULL) {
                 xen_be_printf(&ioreq->blkdev->xendev, 0,
                 xen_be_printf(&ioreq->blkdev->xendev, 0,
                               "can't map grant ref %d (%s, %d maps)\n",
                               "can't map grant ref %d (%s, %d maps)\n",
-                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
+                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                 ioreq_unmap(ioreq);
                 ioreq_unmap(ioreq);
                 return -1;
                 return -1;
             }
             }
-            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
             ioreq->blkdev->cnt_map++;
             ioreq->blkdev->cnt_map++;
         }
         }
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL) {
+                page[i] = ioreq->page[j++];
+            }
+        }
+    }
+    if (ioreq->blkdev->feature_persistent) {
+        while ((ioreq->blkdev->persistent_gnt_count < ioreq->blkdev->max_grants)
+              && new_maps) {
+            /* Go through the list of newly mapped grants and add as many
+             * as possible to the list of persistently mapped grants.
+             *
+             * Since we start at the end of ioreq->page(s), we only need
+             * to decrease new_maps to prevent this granted pages from
+             * being unmapped in ioreq_unmap.
+             */
+            grant = g_malloc0(sizeof(*grant));
+            new_maps--;
+            if (batch_maps) {
+                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
+            } else {
+                grant->page = ioreq->page[new_maps];
+            }
+            grant->blkdev = ioreq->blkdev;
+            xen_be_printf(&ioreq->blkdev->xendev, 3,
+                          "adding grant %" PRIu32 " page: %p\n",
+                          refs[new_maps], grant->page);
+            g_tree_insert(ioreq->blkdev->persistent_gnts,
+                          GUINT_TO_POINTER(refs[new_maps]),
+                          grant);
+            ioreq->blkdev->persistent_gnt_count++;
+        }
+    }
+    for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->v.iov[i].iov_base += (uintptr_t)page[i];
     }
     }
     ioreq->mapped = 1;
     ioreq->mapped = 1;
+    ioreq->num_unmap = new_maps;
     return 0;
     return 0;
 }
 }
 
 
@@ -679,6 +795,7 @@ static int blk_init(struct XenDevice *xendev)
 
 
     /* fill info */
     /* fill info */
     xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
     xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
+    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
     xenstore_write_be_int(&blkdev->xendev, "info",            info);
     xenstore_write_be_int(&blkdev->xendev, "info",            info);
     xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
     xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
     xenstore_write_be_int(&blkdev->xendev, "sectors",
     xenstore_write_be_int(&blkdev->xendev, "sectors",
@@ -702,6 +819,7 @@ out_error:
 static int blk_connect(struct XenDevice *xendev)
 static int blk_connect(struct XenDevice *xendev)
 {
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+    int pers;
 
 
     if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
     if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
         return -1;
         return -1;
@@ -710,6 +828,11 @@ static int blk_connect(struct XenDevice *xendev)
                              &blkdev->xendev.remote_port) == -1) {
                              &blkdev->xendev.remote_port) == -1) {
         return -1;
         return -1;
     }
     }
+    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
+        blkdev->feature_persistent = FALSE;
+    } else {
+        blkdev->feature_persistent = !!pers;
+    }
 
 
     blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
     blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
     if (blkdev->xendev.protocol) {
     if (blkdev->xendev.protocol) {
@@ -753,6 +876,15 @@ static int blk_connect(struct XenDevice *xendev)
     }
     }
     }
     }
 
 
+    if (blkdev->feature_persistent) {
+        /* Init persistent grants */
+        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
+                                             NULL, NULL,
+                                             (GDestroyNotify)destroy_grant);
+        blkdev->persistent_gnt_count = 0;
+    }
+
     xen_be_bind_evtchn(&blkdev->xendev);
     xen_be_bind_evtchn(&blkdev->xendev);
 
 
     xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
     xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
@@ -793,6 +925,11 @@ static int blk_free(struct XenDevice *xendev)
         blk_disconnect(xendev);
         blk_disconnect(xendev);
     }
     }
 
 
+    /* Free persistent grants */
+    if (blkdev->feature_persistent) {
+        g_tree_destroy(blkdev->persistent_gnts);
+    }
+
     while (!QLIST_EMPTY(&blkdev->freelist)) {
     while (!QLIST_EMPTY(&blkdev->freelist)) {
         ioreq = QLIST_FIRST(&blkdev->freelist);
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);
         QLIST_REMOVE(ioreq, list);