misc: mic: Enable VOP host side functionality
author: Sudeep Dutt <sudeep.dutt@intel.com>
Mon, 8 Feb 2016 23:48:15 +0000 (15:48 -0800)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 10 Feb 2016 01:32:37 +0000 (17:32 -0800)
This patch moves virtio functionality from the MIC host driver into a
separate hardware independent Virtio Over PCIe (VOP) driver. This
functionality was introduced in commit f69bcbf3b4c4 ("Intel MIC Host
Driver Changes for Virtio Devices.") in
drivers/misc/mic/host/mic_virtio.c. Apart from being moved into a
separate driver the functionality is essentially unchanged. See the
above mentioned commit for a description of this functionality.

Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/misc/mic/vop/vop_vringh.c [new file with mode: 0644]

diff --git a/drivers/misc/mic/vop/vop_vringh.c b/drivers/misc/mic/vop/vop_vringh.c
new file mode 100644 (file)
index 0000000..c1dd000
--- /dev/null
@@ -0,0 +1,1164 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Intel Virtio Over PCIe (VOP) driver.
+ *
+ */
+#include <linux/sched.h>
+#include <linux/poll.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mic_common.h>
+#include "../common/mic_dev.h"
+
+#include <linux/mic_ioctl.h>
+#include "vop_main.h"
+
+/*
+ * Return the parent device of the VOP device backing @vdev. Callers in
+ * this file pass the result to the dev_*() logging helpers.
+ */
+static inline struct device *vop_dev(struct vop_vdev *vdev)
+{
+       struct vop_device *vpdev = vdev->vpdev;
+
+       return vpdev->dev.parent;
+}
+
+/*
+ * Check whether a virtio device is usable.
+ *
+ * Returns 0 when the device descriptor exists and carries a valid type,
+ * -EINVAL when @vdev is NULL or its descriptor has not been created yet,
+ * and -ENODEV when the descriptor type is -1 (device removed/deleted).
+ */
+static inline int vop_vdev_inited(struct vop_vdev *vdev)
+{
+       int err = 0;
+
+       if (!vdev)
+               return -EINVAL;
+
+       if (!vdev->dd || !vdev->dd->type) {
+               /* Device has not been created yet */
+               err = -EINVAL;
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, err);
+       } else if (vdev->dd->type == -1) {
+               /* Device has been removed/deleted */
+               err = -ENODEV;
+               dev_dbg(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, err);
+       }
+       return err;
+}
+
+/*
+ * vringh notify callback: send an interrupt on the h2c_vdev_db doorbell
+ * stored in the device control block, unless no doorbell is set (-1).
+ */
+static void _vop_notify(struct vringh *vrh)
+{
+       struct vop_vringh *vvrh = container_of(vrh, struct vop_vringh, vrh);
+       struct vop_device *vpdev = vvrh->vdev->vpdev;
+       s8 db = vvrh->vdev->dc->h2c_vdev_db;
+
+       if (db == -1)
+               return;
+       vpdev->hw_ops->send_intr(vpdev, db);
+}
+
+/*
+ * Map the used ring of every virtqueue through hw_ops->ioremap() using the
+ * used_address found in the queue's vqconfig entry, then clear the
+ * used_address_updated flag in the device control block. Queues whose
+ * used_address is still zero are skipped with a warning.
+ */
+static void vop_virtio_init_post(struct vop_vdev *vdev)
+{
+       struct mic_vqconfig *vqconfig = mic_vq_config(vdev->dd);
+       struct vop_device *vpdev = vdev->vpdev;
+       int i, used_size;
+
+       for (i = 0; i < vdev->dd->num_vq; i++) {
+               /*
+                * Size the mapping from this queue's own entry count. Each
+                * vqconfig[i] carries its own num, so the value of the first
+                * queue must not be reused for the other queues.
+                */
+               used_size = PAGE_ALIGN(sizeof(u16) * 3 +
+                               sizeof(struct vring_used_elem) *
+                               le16_to_cpu(vqconfig[i].num));
+               if (!le64_to_cpu(vqconfig[i].used_address)) {
+                       dev_warn(vop_dev(vdev), "used_address zero??\n");
+                       continue;
+               }
+               vdev->vvr[i].vrh.vring.used =
+                       (void __force *)vpdev->hw_ops->ioremap(
+                       vpdev,
+                       le64_to_cpu(vqconfig[i].used_address),
+                       used_size);
+       }
+
+       vdev->dc->used_address_updated = 0;
+
+       dev_info(vop_dev(vdev), "%s: device type %d LINKUP\n",
+                __func__, vdev->virtio_id);
+}
+
+/*
+ * Reset a virtio device: with every vring mutex of the device held, clear
+ * the device status, clear vdev_reset and set host_ack in the control
+ * block, and rewind the ring indices tracked for each virtqueue.
+ */
+static inline void vop_virtio_device_reset(struct vop_vdev *vdev)
+{
+       int q;
+
+       dev_dbg(vop_dev(vdev), "%s: status %d device type %d RESET\n",
+               __func__, vdev->dd->status, vdev->virtio_id);
+
+       /*
+        * Avoid lockdep false positive. The + 1 is for the vop
+        * mutex which is held in the reset devices code path.
+        */
+       for (q = 0; q < vdev->dd->num_vq; q++)
+               mutex_lock_nested(&vdev->vvr[q].vr_mutex, q + 1);
+
+       /* 0 status means "reset" */
+       vdev->dd->status = 0;
+       vdev->dc->vdev_reset = 0;
+       vdev->dc->host_ack = 1;
+
+       for (q = 0; q < vdev->dd->num_vq; q++) {
+               struct vop_vringh *vvr = &vdev->vvr[q];
+
+               vvr->vring.info->avail_idx = 0;
+               vvr->vrh.completed = 0;
+               vvr->vrh.last_avail_idx = 0;
+               vvr->vrh.last_used_idx = 0;
+       }
+
+       for (q = 0; q < vdev->dd->num_vq; q++)
+               mutex_unlock(&vdev->vvr[q].vr_mutex);
+}
+
+/*
+ * Reset every virtio device linked on @vi->vdev_list, then flag and wake
+ * up the waiters sleeping on the device's wait queue.
+ */
+static void vop_virtio_reset_devices(struct vop_info *vi)
+{
+       struct vop_vdev *vdev, *next;
+
+       list_for_each_entry_safe(vdev, next, &vi->vdev_list, list) {
+               vop_virtio_device_reset(vdev);
+               vdev->poll_wake = 1;
+               wake_up(&vdev->waitq);
+       }
+}
+
+/*
+ * Work item scheduled by _vop_virtio_intr_handler(). Acts on the flags in
+ * the device control block (used ring addresses updated, reset requested)
+ * and then wakes up anyone waiting on the device's wait queue.
+ */
+static void vop_bh_handler(struct work_struct *work)
+{
+       struct vop_vdev *vdev = container_of(work, struct vop_vdev,
+                       virtio_bh_work);
+       struct mic_device_ctrl *dc = vdev->dc;
+
+       if (dc->used_address_updated)
+               vop_virtio_init_post(vdev);
+
+       if (dc->vdev_reset)
+               vop_virtio_device_reset(vdev);
+
+       vdev->poll_wake = 1;
+       wake_up(&vdev->waitq);
+}
+
+/*
+ * Interrupt handler for the virtio doorbell: acknowledge the doorbell
+ * through hw_ops and defer the real work to vop_bh_handler().
+ */
+static irqreturn_t _vop_virtio_intr_handler(int irq, void *data)
+{
+       struct vop_vdev *vdev = data;
+
+       vdev->vpdev->hw_ops->ack_interrupt(vdev->vpdev, vdev->virtio_db);
+       schedule_work(&vdev->virtio_bh_work);
+       return IRQ_HANDLED;
+}
+
+/*
+ * Copy a new config space from @argp into the device page, flag the change
+ * in the device control block and signal it on the h2c_config_db doorbell,
+ * then wait (up to 100 rounds of 100 ms) for guest_ack to be set.
+ *
+ * The vop mutex and all vring mutexes of the device are held meanwhile.
+ *
+ * Returns -EIO if no config doorbell is set or the device type is -1.
+ * Otherwise the result of the last wait_event_timeout() call is returned
+ * unchanged, i.e. 0 when guest_ack was never seen and non-zero when it was.
+ * NOTE(review): success is therefore reported as a positive value and a
+ * timeout as 0 - confirm this is what the caller expects.
+ */
+static int vop_virtio_config_change(struct vop_vdev *vdev, void *argp)
+{
+       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake);
+       struct vop_device *vpdev = vdev->vpdev;
+       struct vop_info *vi = dev_get_drvdata(&vpdev->dev);
+       struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev);
+       s8 db = bootparam->h2c_config_db;
+       int ret = 0, retry, q;
+
+       mutex_lock(&vi->vop_mutex);
+       for (q = 0; q < vdev->dd->num_vq; q++)
+               mutex_lock_nested(&vdev->vvr[q].vr_mutex, q + 1);
+
+       if (db == -1 || vdev->dd->type == -1) {
+               ret = -EIO;
+       } else {
+               memcpy(mic_vq_configspace(vdev->dd), argp,
+                      vdev->dd->config_len);
+               vdev->dc->config_change = MIC_VIRTIO_PARAM_CONFIG_CHANGED;
+               vpdev->hw_ops->send_intr(vpdev, db);
+
+               retry = 100;
+               while (retry--) {
+                       ret = wait_event_timeout(wake, vdev->dc->guest_ack,
+                                                msecs_to_jiffies(100));
+                       if (ret)
+                               break;
+               }
+
+               dev_dbg(vop_dev(vdev),
+                       "%s %d retry: %d\n", __func__, __LINE__, retry);
+               vdev->dc->config_change = 0;
+               vdev->dc->guest_ack = 0;
+       }
+
+       for (q = 0; q < vdev->dd->num_vq; q++)
+               mutex_unlock(&vdev->vvr[q].vr_mutex);
+       mutex_unlock(&vi->vop_mutex);
+       return ret;
+}
+
+/*
+ * Copy the device descriptor @argp into the first free slot of the device
+ * page (a slot whose type is 0 or -1).
+ *
+ * @vdev: virtio device the descriptor belongs to.
+ * @argp: descriptor to copy; its type field is set to 0 by this function.
+ * @type: receives the type originally stored in @argp.
+ * @devpage: receives the address of the slot the descriptor was copied to.
+ *
+ * Returns 0 on success, or -EINVAL if a vring asks for more than
+ * MIC_MAX_VRING_ENTRIES entries or no free slot was found.
+ */
+static int vop_copy_dp_entry(struct vop_vdev *vdev,
+                            struct mic_device_desc *argp, __u8 *type,
+                            struct mic_device_desc **devpage)
+{
+       struct vop_device *vpdev = vdev->vpdev;
+       struct mic_vqconfig *vqconfig = mic_vq_config(argp);
+       struct mic_device_desc *devp;
+       bool found = false;
+       int off, q;
+
+       for (q = 0; q < argp->num_vq; q++) {
+               if (le16_to_cpu(vqconfig[q].num) <= MIC_MAX_VRING_ENTRIES)
+                       continue;
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, -EINVAL);
+               return -EINVAL;
+       }
+
+       /* Walk the descriptors after the boot parameters for a free slot */
+       off = sizeof(struct mic_bootparam);
+       while (off < MIC_DP_SIZE - mic_total_desc_size(argp)) {
+               devp = vpdev->hw_ops->get_dp(vpdev) + off;
+               if (devp->type == 0 || devp->type == -1) {
+                       found = true;
+                       break;
+               }
+               off += mic_total_desc_size(devp);
+       }
+       if (!found) {
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, -EINVAL);
+               return -EINVAL;
+       }
+
+       /*
+        * Remember the type and copy the descriptor with type 0. The type
+        * is written last by the caller, once the device is fully set up.
+        */
+       *type = argp->type;
+       argp->type = 0;
+       memcpy(devp, argp, mic_desc_size(argp));
+
+       *devpage = devp;
+       return 0;
+}
+
+/*
+ * Locate the device control block that follows the (aligned) descriptor
+ * @devpage, put it into its initial state (all flags cleared, both
+ * doorbells unset) and remember it in @vdev->dc.
+ */
+static void vop_init_device_ctrl(struct vop_vdev *vdev,
+                                struct mic_device_desc *devpage)
+{
+       struct mic_device_ctrl *ctrl;
+
+       ctrl = (void *)devpage + mic_aligned_desc_size(devpage);
+
+       ctrl->config_change = 0;
+       ctrl->guest_ack = 0;
+       ctrl->vdev_reset = 0;
+       ctrl->host_ack = 0;
+       ctrl->used_address_updated = 0;
+       /* -1 means "no doorbell assigned" */
+       ctrl->c2h_vdev_db = -1;
+       ctrl->h2c_vdev_db = -1;
+       vdev->dc = ctrl;
+}
+
+/*
+ * vop_virtio_add_device - publish a new virtio device in the device page.
+ *
+ * @vdev: per-file virtio device state allocated by vop_open().
+ * @argp: device descriptor supplied by the caller. Its type field is
+ *        cleared while the device is set up and restored on success.
+ *
+ * Copies the descriptor into a free device page slot, allocates and DMA
+ * maps one vring plus one intermediate DMA buffer per virtqueue, requests
+ * the virtio doorbell interrupt and finally makes the device visible by
+ * writing its type into the device page.
+ *
+ * Returns 0 on success or a negative errno. On failure all pages and DMA
+ * mappings set up by this call are released again; @vdev itself is not
+ * freed.
+ */
+static int vop_virtio_add_device(struct vop_vdev *vdev,
+                                struct mic_device_desc *argp)
+{
+       struct vop_info *vi = vdev->vi;
+       struct vop_device *vpdev = vi->vpdev;
+       struct mic_device_desc *dd = NULL;
+       struct mic_vqconfig *vqconfig;
+       int vr_size, i, j, ret;
+       u8 type = 0;
+       s8 db = -1;
+       char irqname[16];
+       struct mic_bootparam *bootparam;
+       u16 num;
+       dma_addr_t vr_addr;
+
+       bootparam = vpdev->hw_ops->get_dp(vpdev);
+       init_waitqueue_head(&vdev->waitq);
+       INIT_LIST_HEAD(&vdev->list);
+       vdev->vpdev = vpdev;
+
+       ret = vop_copy_dp_entry(vdev, argp, &type, &dd);
+       if (ret) {
+               /*
+                * vdev must not be freed here: it was allocated by
+                * vop_open() and stays reachable through f->private_data,
+                * which vop_release() dereferences. Freeing it on this
+                * path also made the log call below a use-after-free.
+                */
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, ret);
+               return ret;
+       }
+
+       vop_init_device_ctrl(vdev, dd);
+
+       vdev->dd = dd;
+       vdev->virtio_id = type;
+       vqconfig = mic_vq_config(dd);
+       INIT_WORK(&vdev->virtio_bh_work, vop_bh_handler);
+
+       for (i = 0; i < dd->num_vq; i++) {
+               struct vop_vringh *vvr = &vdev->vvr[i];
+               struct mic_vring *vr = &vdev->vvr[i].vring;
+
+               num = le16_to_cpu(vqconfig[i].num);
+               mutex_init(&vvr->vr_mutex);
+               vr_size = PAGE_ALIGN(vring_size(num, MIC_VIRTIO_RING_ALIGN) +
+                       sizeof(struct _mic_vring_info));
+               vr->va = (void *)
+                       __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                        get_order(vr_size));
+               if (!vr->va) {
+                       ret = -ENOMEM;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       goto err;
+               }
+               vr->len = vr_size;
+               /* The vring info block lives right behind the ring itself */
+               vr->info = vr->va + vring_size(num, MIC_VIRTIO_RING_ALIGN);
+               vr->info->magic = cpu_to_le32(MIC_MAGIC + vdev->virtio_id + i);
+               vr_addr = dma_map_single(&vpdev->dev, vr->va, vr_size,
+                                        DMA_BIDIRECTIONAL);
+               if (dma_mapping_error(&vpdev->dev, vr_addr)) {
+                       free_pages((unsigned long)vr->va, get_order(vr_size));
+                       ret = -ENOMEM;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       goto err;
+               }
+               vqconfig[i].address = cpu_to_le64(vr_addr);
+
+               vring_init(&vr->vr, num, vr->va, MIC_VIRTIO_RING_ALIGN);
+               ret = vringh_init_kern(&vvr->vrh,
+                                      *(u32 *)mic_vq_features(vdev->dd),
+                                      num, false, vr->vr.desc, vr->vr.avail,
+                                      vr->vr.used);
+               if (ret) {
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       /* This queue's vring is already mapped: undo it too */
+                       goto err_free_vring;
+               }
+               vringh_kiov_init(&vvr->riov, NULL, 0);
+               vringh_kiov_init(&vvr->wiov, NULL, 0);
+               vvr->head = USHRT_MAX;
+               vvr->vdev = vdev;
+               vvr->vrh.notify = _vop_notify;
+               dev_dbg(&vpdev->dev,
+                       "%s %d index %d va %p info %p vr_size 0x%x\n",
+                       __func__, __LINE__, i, vr->va, vr->info, vr_size);
+               vvr->buf = (void *)__get_free_pages(GFP_KERNEL,
+                                       get_order(VOP_INT_DMA_BUF_SIZE));
+               if (!vvr->buf) {
+                       ret = -ENOMEM;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       goto err_free_vring;
+               }
+               vvr->buf_da = dma_map_single(&vpdev->dev,
+                                         vvr->buf, VOP_INT_DMA_BUF_SIZE,
+                                         DMA_BIDIRECTIONAL);
+               if (dma_mapping_error(&vpdev->dev, vvr->buf_da)) {
+                       ret = -ENOMEM;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       goto err_free_buf;
+               }
+       }
+
+       snprintf(irqname, sizeof(irqname), "vop%dvirtio%d", vpdev->index,
+                vdev->virtio_id);
+       vdev->virtio_db = vpdev->hw_ops->next_db(vpdev);
+       vdev->virtio_cookie = vpdev->hw_ops->request_irq(vpdev,
+                       _vop_virtio_intr_handler, irqname, vdev,
+                       vdev->virtio_db);
+       if (IS_ERR(vdev->virtio_cookie)) {
+               ret = PTR_ERR(vdev->virtio_cookie);
+               dev_dbg(&vpdev->dev, "request irq failed\n");
+               /* i == dd->num_vq here, so "err" unwinds every queue */
+               goto err;
+       }
+
+       vdev->dc->c2h_vdev_db = vdev->virtio_db;
+
+       /*
+        * Order the type update with previous stores. This write barrier
+        * is paired with the corresponding read barrier before the uncached
+        * system memory read of the type, on the card while scanning the
+        * device page.
+        */
+       smp_wmb();
+       dd->type = type;
+       argp->type = type;
+
+       if (bootparam) {
+               db = bootparam->h2c_config_db;
+               if (db != -1)
+                       vpdev->hw_ops->send_intr(vpdev, db);
+       }
+       dev_dbg(&vpdev->dev, "Added virtio id %d db %d\n", dd->type, db);
+       return 0;
+
+       /* Partial unwind of queue i, which failed half way through setup */
+err_free_buf:
+       free_pages((unsigned long)vdev->vvr[i].buf,
+                  get_order(VOP_INT_DMA_BUF_SIZE));
+err_free_vring:
+       dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[i].address),
+                        vdev->vvr[i].vring.len, DMA_BIDIRECTIONAL);
+       free_pages((unsigned long)vdev->vvr[i].vring.va,
+                  get_order(vdev->vvr[i].vring.len));
+err:
+       /* Full unwind of the queues 0..i-1, which were set up completely */
+       for (j = 0; j < i; j++) {
+               struct vop_vringh *vvr = &vdev->vvr[j];
+
+               dma_unmap_single(&vpdev->dev, vvr->buf_da,
+                                VOP_INT_DMA_BUF_SIZE, DMA_BIDIRECTIONAL);
+               free_pages((unsigned long)vvr->buf,
+                          get_order(VOP_INT_DMA_BUF_SIZE));
+               dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[j].address),
+                                vvr->vring.len, DMA_BIDIRECTIONAL);
+               free_pages((unsigned long)vvr->vring.va,
+                          get_order(vvr->vring.len));
+       }
+       return ret;
+}
+
+/*
+ * Announce removal of a device: set MIC_VIRTIO_PARAM_DEV_REMOVE in the
+ * control block @devp and, if a config doorbell is set, ring it and wait
+ * (up to 15 rounds of 1000 ms) for guest_ack. The config_change and
+ * guest_ack fields are cleared again before returning.
+ */
+static void vop_dev_remove(struct vop_info *pvi, struct mic_device_ctrl *devp,
+                          struct vop_device *vpdev)
+{
+       struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev);
+       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake);
+       int retry;
+       s8 db;
+
+       devp->config_change = MIC_VIRTIO_PARAM_DEV_REMOVE;
+       db = bootparam->h2c_config_db;
+       if (db != -1) {
+               vpdev->hw_ops->send_intr(vpdev, db);
+               retry = 15;
+               while (retry--) {
+                       if (wait_event_timeout(wake, devp->guest_ack,
+                                              msecs_to_jiffies(1000)))
+                               break;
+               }
+       }
+       devp->config_change = 0;
+       devp->guest_ack = 0;
+}
+
+/*
+ * Tear down a virtio device: announce the removal when boot parameters
+ * are available, release the doorbell interrupt, wait for pending bottom
+ * half work, free the per-queue DMA buffers and vrings and finally mark
+ * the device page slot as removed (type -1).
+ */
+static void vop_virtio_del_device(struct vop_vdev *vdev)
+{
+       struct vop_info *vi = vdev->vi;
+       struct vop_device *vpdev = vdev->vpdev;
+       struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev);
+       struct mic_vqconfig *vqconfig;
+       int q;
+
+       if (bootparam)
+               vop_dev_remove(vi, vdev->dc, vpdev);
+
+       vpdev->hw_ops->free_irq(vpdev, vdev->virtio_cookie, vdev);
+       flush_work(&vdev->virtio_bh_work);
+       vqconfig = mic_vq_config(vdev->dd);
+       for (q = 0; q < vdev->dd->num_vq; q++) {
+               struct vop_vringh *vvr = &vdev->vvr[q];
+               struct mic_vring *vr = &vvr->vring;
+
+               /* Intermediate DMA buffer */
+               dma_unmap_single(&vpdev->dev,
+                                vvr->buf_da, VOP_INT_DMA_BUF_SIZE,
+                                DMA_BIDIRECTIONAL);
+               free_pages((unsigned long)vvr->buf,
+                          get_order(VOP_INT_DMA_BUF_SIZE));
+               vringh_kiov_cleanup(&vvr->riov);
+               vringh_kiov_cleanup(&vvr->wiov);
+               /* The vring itself */
+               dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[q].address),
+                                vr->len, DMA_BIDIRECTIONAL);
+               free_pages((unsigned long)vr->va, get_order(vr->len));
+       }
+       /*
+        * Order the type update with previous stores. This write barrier
+        * is paired with the corresponding read barrier before the uncached
+        * system memory read of the type, on the card while scanning the
+        * device page.
+        */
+       smp_wmb();
+       vdev->dd->type = -1;
+}
+
+/*
+ * vop_sync_dma - Wrapper for synchronous DMAs.
+ *
+ * @vdev - virtio device whose VOP device provides the DMA channel.
+ * @dst - destination DMA address.
+ * @src - source DMA address.
+ * @len - size of the transfer.
+ *
+ * Prepares a memcpy descriptor on the channel, submits it and waits for
+ * it with dma_sync_wait().
+ *
+ * Returns -EBUSY if no DMA channel is available, -ENOMEM if the descriptor
+ * could not be prepared or submitted, and otherwise the value returned by
+ * dma_sync_wait(). Any non-zero result is logged.
+ */
+static int vop_sync_dma(struct vop_vdev *vdev, dma_addr_t dst, dma_addr_t src,
+                       size_t len)
+{
+       struct vop_info *vi = dev_get_drvdata(&vdev->vpdev->dev);
+       struct dma_chan *vop_ch = vi->dma_ch;
+       struct dma_async_tx_descriptor *tx;
+       dma_cookie_t cookie;
+       int err;
+
+       if (!vop_ch) {
+               err = -EBUSY;
+               goto error;
+       }
+
+       tx = vop_ch->device->device_prep_dma_memcpy(vop_ch, dst, src, len,
+                                                   DMA_PREP_FENCE);
+       if (!tx) {
+               err = -ENOMEM;
+               goto error;
+       }
+
+       cookie = tx->tx_submit(tx);
+       if (dma_submit_error(cookie)) {
+               err = -ENOMEM;
+               goto error;
+       }
+
+       dma_async_issue_pending(vop_ch);
+       err = dma_sync_wait(vop_ch, cookie);
+error:
+       if (err)
+               dev_err(&vi->vpdev->dev, "%s %d err %d\n",
+                       __func__, __LINE__, err);
+       return err;
+}
+
+/*
+ * Compile-time switch read by vop_virtio_copy_to_user() and
+ * vop_virtio_copy_from_user() to decide whether copies go through DMA.
+ */
+#define VOP_USE_DMA true
+
+/*
+ * Initiates the copies across the PCIe bus from card memory to a user
+ * space buffer. When transfers are done using DMA, source/destination
+ * addresses and transfer length must follow the alignment requirements of
+ * the MIC DMA engine.
+ *
+ * @vdev: virtio device the transfer belongs to.
+ * @ubuf: user space destination buffer.
+ * @len: number of bytes to copy.
+ * @daddr: source address in card memory.
+ * @dlen: length of the card side buffer (not used by this function).
+ * @vr_idx: index of the vring whose intermediate DMA buffer is used.
+ *
+ * With DMA the source address is rounded down to the DMA alignment and the
+ * data is staged, one chunk of at most VOP_INT_DMA_BUF_SIZE bytes at a
+ * time, in the vring's intermediate buffer before being copied to @ubuf.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer could not be written
+ * or the error returned by vop_sync_dma().
+ */
+static int vop_virtio_copy_to_user(struct vop_vdev *vdev, void __user *ubuf,
+                                  size_t len, u64 daddr, size_t dlen,
+                                  int vr_idx)
+{
+       struct vop_device *vpdev = vdev->vpdev;
+       void __iomem *dbuf = vpdev->hw_ops->ioremap(vpdev, daddr, len);
+       struct vop_vringh *vvr = &vdev->vvr[vr_idx];
+       struct vop_info *vi = dev_get_drvdata(&vpdev->dev);
+       size_t dma_alignment = 1 << vi->dma_ch->device->copy_align;
+       bool x200 = is_dma_copy_aligned(vi->dma_ch->device, 1, 1, 1);
+       size_t dma_offset, partlen, copylen;
+       int err;
+
+       if (!VOP_USE_DMA) {
+               if (copy_to_user(ubuf, (void __force *)dbuf, len)) {
+                       err = -EFAULT;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, err);
+                       goto err;
+               }
+               vdev->in_bytes += len;
+               err = 0;
+               goto err;
+       }
+
+       /* Start the DMA at the aligned address just below daddr */
+       dma_offset = daddr - round_down(daddr, dma_alignment);
+       daddr -= dma_offset;
+       len += dma_offset;
+       /*
+        * X100 uses DMA addresses as seen by the card so adding
+        * the aperture base is not required for DMA. However x200
+        * requires DMA addresses to be an offset into the bar so
+        * add the aperture base for x200.
+        */
+       if (x200)
+               daddr += vpdev->aper->pa;
+       while (len) {
+               partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE);
+               err = vop_sync_dma(vdev, vvr->buf_da, daddr,
+                                  ALIGN(partlen, dma_alignment));
+               if (err) {
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, err);
+                       goto err;
+               }
+               /*
+                * The first chunk starts with dma_offset bytes of padding
+                * that were only fetched to satisfy the DMA alignment and
+                * are not handed to user space.
+                */
+               copylen = partlen - dma_offset;
+               if (copy_to_user(ubuf, vvr->buf + dma_offset, copylen)) {
+                       err = -EFAULT;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, err);
+                       goto err;
+               }
+               daddr += partlen;
+               /*
+                * Advance the user pointer only by what was really copied;
+                * advancing by partlen would leave a dma_offset sized hole
+                * in the user buffer after the first chunk.
+                */
+               ubuf += copylen;
+               vdev->in_bytes_dma += partlen;
+               vdev->in_bytes += partlen;
+               len -= partlen;
+               dma_offset = 0;
+       }
+       err = 0;
+err:
+       /* dbuf is never advanced, so this unmaps exactly what was mapped */
+       vpdev->hw_ops->iounmap(vpdev, dbuf);
+       dev_dbg(vop_dev(vdev),
+               "%s: ubuf %p dbuf %p len 0x%lx vr_idx 0x%x\n",
+               __func__, ubuf, dbuf, len, vr_idx);
+       return err;
+}
+
+/*
+ * Initiates copies across the PCIe bus from a user space buffer to card
+ * memory. When transfers are done using DMA, source/destination addresses
+ * and transfer length must follow the alignment requirements of the MIC
+ * DMA engine.
+ *
+ * @vdev: virtio device the transfer belongs to.
+ * @ubuf: user space source buffer.
+ * @len: number of bytes to copy.
+ * @daddr: destination address in card memory.
+ * @dlen: length of the card side buffer.
+ * @vr_idx: index of the vring whose intermediate DMA buffer is used.
+ *
+ * DMA is used only if @daddr is DMA aligned and the length rounded up to
+ * the DMA alignment still fits into @dlen; otherwise the data is copied
+ * straight into the mapped card memory.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer could not be read or
+ * the error returned by vop_sync_dma().
+ */
+static int vop_virtio_copy_from_user(struct vop_vdev *vdev, void __user *ubuf,
+                                    size_t len, u64 daddr, size_t dlen,
+                                    int vr_idx)
+{
+       struct vop_device *vpdev = vdev->vpdev;
+       void __iomem *dbuf = vpdev->hw_ops->ioremap(vpdev, daddr, len);
+       struct vop_vringh *vvr = &vdev->vvr[vr_idx];
+       struct vop_info *vi = dev_get_drvdata(&vdev->vpdev->dev);
+       size_t dma_alignment = 1 << vi->dma_ch->device->copy_align;
+       bool x200 = is_dma_copy_aligned(vi->dma_ch->device, 1, 1, 1);
+       size_t partlen;
+       bool dma = VOP_USE_DMA;
+       int err = 0;
+
+       if (daddr & (dma_alignment - 1)) {
+               vdev->tx_dst_unaligned += len;
+               dma = false;
+       } else if (ALIGN(len, dma_alignment) > dlen) {
+               vdev->tx_len_unaligned += len;
+               dma = false;
+       }
+
+       if (!dma)
+               goto memcpy;
+
+       /*
+        * X100 uses DMA addresses as seen by the card so adding
+        * the aperture base is not required for DMA. However x200
+        * requires DMA addresses to be an offset into the bar so
+        * add the aperture base for x200.
+        */
+       if (x200)
+               daddr += vpdev->aper->pa;
+       while (len) {
+               partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE);
+
+               if (copy_from_user(vvr->buf, ubuf, partlen)) {
+                       err = -EFAULT;
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, err);
+                       goto err;
+               }
+               err = vop_sync_dma(vdev, daddr, vvr->buf_da,
+                                  ALIGN(partlen, dma_alignment));
+               if (err) {
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, err);
+                       goto err;
+               }
+               /*
+                * dbuf is deliberately not advanced: the DMA path does not
+                * write through it and it must still hold the address
+                * returned by ioremap() when it is unmapped below.
+                */
+               daddr += partlen;
+               ubuf += partlen;
+               vdev->out_bytes_dma += partlen;
+               vdev->out_bytes += partlen;
+               len -= partlen;
+       }
+memcpy:
+       /*
+        * We are copying to IO below and should ideally use something
+        * like copy_from_user_toio(..) if it existed.
+        *
+        * After the DMA loop above len is 0, so this copies nothing.
+        */
+       if (copy_from_user((void __force *)dbuf, ubuf, len)) {
+               err = -EFAULT;
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, err);
+               goto err;
+       }
+       vdev->out_bytes += len;
+       err = 0;
+err:
+       vpdev->hw_ops->iounmap(vpdev, dbuf);
+       dev_dbg(vop_dev(vdev),
+               "%s: ubuf %p dbuf %p len 0x%lx vr_idx 0x%x\n",
+               __func__, ubuf, dbuf, len, vr_idx);
+       return err;
+}
+
+/*
+ * Value of the "read" argument of vop_vringh_copy() that selects a copy
+ * towards user space; !MIC_VRINGH_READ selects a copy from user space.
+ */
+#define MIC_VRINGH_READ true
+
+/*
+ * Return the number of bytes consumed in a VRINGH KIOV: the lengths of
+ * all elements before the current one plus the bytes already consumed in
+ * the current element.
+ */
+static inline u32 vop_vringh_iov_consumed(struct vringh_kiov *iov)
+{
+       u32 total = iov->consumed;
+       int idx = 0;
+
+       while (idx < iov->i)
+               total += iov->iov[idx++].iov_len;
+       return total;
+}
+
+/*
+ * Traverse the VRINGH KIOV and issue the APIs to trigger the copies.
+ * This API is heavily based on the vringh_iov_xfer(..) implementation
+ * in vringh.c. The reason we cannot reuse vringh_iov_pull_kern(..)
+ * and vringh_iov_push_kern(..) directly is because there is no
+ * way to override the VRINGH xfer(..) routines as of v3.10.
+ *
+ * @read selects the direction: true copies to @ubuf, false copies from
+ * it. The number of bytes transferred is stored in @out_len, also when an
+ * error is returned.
+ */
+static int vop_vringh_copy(struct vop_vdev *vdev, struct vringh_kiov *iov,
+                          void __user *ubuf, size_t len, bool read, int vr_idx,
+                          size_t *out_len)
+{
+       size_t chunk, copied = 0;
+       int ret = 0;
+
+       while (len && iov->i < iov->used) {
+               struct kvec *kiov = &iov->iov[iov->i];
+               u64 daddr = (u64)kiov->iov_base;
+
+               chunk = min(kiov->iov_len, len);
+               ret = read ?
+                       vop_virtio_copy_to_user(vdev, ubuf, chunk, daddr,
+                                               kiov->iov_len, vr_idx) :
+                       vop_virtio_copy_from_user(vdev, ubuf, chunk, daddr,
+                                                 kiov->iov_len, vr_idx);
+               if (ret) {
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       break;
+               }
+               len -= chunk;
+               ubuf += chunk;
+               copied += chunk;
+               iov->consumed += chunk;
+               kiov->iov_len -= chunk;
+               kiov->iov_base += chunk;
+               if (kiov->iov_len)
+                       continue;
+
+               /* Element done: restore its base/len, move to the next. */
+               kiov->iov_len = iov->consumed;
+               kiov->iov_base -= iov->consumed;
+
+               iov->consumed = 0;
+               iov->i++;
+       }
+       *out_len = copied;
+       return ret;
+}
+
+/*
+ * Use the standard VRINGH infrastructure in the kernel to fetch new
+ * descriptors, initiate the copies and update the used ring.
+ *
+ * @vdev: virtio device to operate on.
+ * @copy: copy request; iov/iovcnt describe the user space buffers, vr_idx
+ *        selects the vring, update_used asks for a used ring update and
+ *        out_len receives the number of bytes transferred.
+ *
+ * Returns the vringh_getdesc_kern() result when it is <= 0 (no descriptor
+ * available or error), -EINVAL when a user iovec could not be read, the
+ * error of a failed copy, or 0.
+ */
+static int _vop_virtio_copy(struct vop_vdev *vdev, struct mic_copy_desc *copy)
+{
+       int ret = 0;
+       u32 iovcnt = copy->iovcnt;
+       struct iovec iov;
+       struct iovec __user *u_iov = copy->iov;
+       void __user *ubuf = NULL;
+       struct vop_vringh *vvr = &vdev->vvr[copy->vr_idx];
+       struct vringh_kiov *riov = &vvr->riov;
+       struct vringh_kiov *wiov = &vvr->wiov;
+       struct vringh *vrh = &vvr->vrh;
+       u16 *head = &vvr->head;
+       struct mic_vring *vr = &vvr->vring;
+       size_t len = 0, out_len;
+
+       copy->out_len = 0;
+       /* Fetch a new IOVEC if all previous elements have been processed */
+       if (riov->i == riov->used && wiov->i == wiov->used) {
+               ret = vringh_getdesc_kern(vrh, riov, wiov,
+                                         head, GFP_KERNEL);
+               /* Check if there are available descriptors */
+               if (ret <= 0)
+                       return ret;
+       }
+       while (iovcnt) {
+               /* len is what is left of the current user iovec */
+               if (!len) {
+                       /* Copy over a new iovec from user space. */
+                       ret = copy_from_user(&iov, u_iov, sizeof(*u_iov));
+                       if (ret) {
+                               /*
+                                * NOTE(review): a faulting user pointer is
+                                * reported as -EINVAL here, not -EFAULT.
+                                */
+                               ret = -EINVAL;
+                               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                                       __func__, __LINE__, ret);
+                               break;
+                       }
+                       len = iov.iov_len;
+                       ubuf = iov.iov_base;
+               }
+               /* Issue all the read descriptors first */
+               ret = vop_vringh_copy(vdev, riov, ubuf, len,
+                                     MIC_VRINGH_READ, copy->vr_idx, &out_len);
+               if (ret) {
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       break;
+               }
+               len -= out_len;
+               ubuf += out_len;
+               copy->out_len += out_len;
+               /* Issue the write descriptors next */
+               ret = vop_vringh_copy(vdev, wiov, ubuf, len,
+                                     !MIC_VRINGH_READ, copy->vr_idx, &out_len);
+               if (ret) {
+                       dev_err(vop_dev(vdev), "%s %d err %d\n",
+                               __func__, __LINE__, ret);
+                       break;
+               }
+               len -= out_len;
+               ubuf += out_len;
+               copy->out_len += out_len;
+               if (!len) {
+                       /* One user space iovec is now completed */
+                       iovcnt--;
+                       u_iov++;
+               }
+               /* Exit loop if all elements in KIOVs have been processed. */
+               if (riov->i == riov->used && wiov->i == wiov->used)
+                       break;
+       }
+       /*
+        * Update the used ring if a descriptor was available and some data was
+        * copied in/out and the user asked for a used ring update.
+        */
+       if (*head != USHRT_MAX && copy->out_len && copy->update_used) {
+               u32 total = 0;
+
+               /* Determine the total data consumed */
+               total += vop_vringh_iov_consumed(riov);
+               total += vop_vringh_iov_consumed(wiov);
+               vringh_complete_kern(vrh, *head, total);
+               /* USHRT_MAX marks "no descriptor head outstanding" */
+               *head = USHRT_MAX;
+               if (vringh_need_notify_kern(vrh) > 0)
+                       vringh_notify(vrh);
+               vringh_kiov_cleanup(riov);
+               vringh_kiov_cleanup(wiov);
+               /* Update avail idx for user space */
+               vr->info->avail_idx = vrh->last_avail_idx;
+       }
+       return ret;
+}
+
+/*
+ * Validate a copy request: @vdev must be non-NULL and the vring index in
+ * @copy must be below the device's virtqueue count.
+ *
+ * Returns 0 when the request is valid, -EINVAL otherwise.
+ */
+static inline int vop_verify_copy_args(struct vop_vdev *vdev,
+                                      struct mic_copy_desc *copy)
+{
+       if (vdev && copy->vr_idx < vdev->dd->num_vq)
+               return 0;
+       return -EINVAL;
+}
+
+/*
+ * Copy a specified number of virtio descriptors in a chain.
+ *
+ * The request is validated first; the copy itself runs with the selected
+ * vring's vr_mutex held.
+ *
+ * Returns 0 on success, -EINVAL if the request fails validation, -ENODEV
+ * if vop_vdevup() reports that the device is not up, or the error returned
+ * by _vop_virtio_copy().
+ */
+static int vop_virtio_copy_desc(struct vop_vdev *vdev,
+                               struct mic_copy_desc *copy)
+{
+       int err;
+       struct vop_vringh *vvr;
+
+       err = vop_verify_copy_args(vdev, copy);
+       if (err)
+               return err;
+
+       /*
+        * Only touch vdev and index vvr[] once vdev is known to be non-NULL
+        * and vr_idx is known to be in range.
+        */
+       vvr = &vdev->vvr[copy->vr_idx];
+
+       mutex_lock(&vvr->vr_mutex);
+       if (!vop_vdevup(vdev)) {
+               err = -ENODEV;
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, err);
+               goto err;
+       }
+       err = _vop_virtio_copy(vdev, copy);
+       if (err) {
+               dev_err(vop_dev(vdev), "%s %d err %d\n",
+                       __func__, __LINE__, err);
+       }
+err:
+       mutex_unlock(&vvr->vr_mutex);
+       return err;
+}
+
+/*
+ * Open handler: allocate a zeroed per-file vop_vdev, bind it to the
+ * vop_info that embeds the misc device found in f->private_data, and
+ * install it as the file's private data.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int vop_open(struct inode *inode, struct file *f)
+{
+       struct vop_info *vi;
+       struct vop_vdev *vdev;
+
+       /* Must be read before f->private_data is replaced below */
+       vi = container_of(f->private_data, struct vop_info, miscdev);
+
+       vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+       if (!vdev)
+               return -ENOMEM;
+
+       vdev->vi = vi;
+       mutex_init(&vdev->vdev_mutex);
+       /*
+        * Leave the completion in the completed state; vop_release() waits
+        * on it when it did not remove the device itself.
+        */
+       init_completion(&vdev->destroy);
+       complete(&vdev->destroy);
+       f->private_data = vdev;
+       return 0;
+}
+
+/*
+ * Release handler: if this file's virtio device has not been marked
+ * deleted and is still on the vop_info device list, delete it and unlink
+ * it; otherwise wait on the destroy completion. The per-file state is
+ * freed in both cases.
+ */
+static int vop_release(struct inode *inode, struct file *f)
+{
+       struct vop_vdev *vdev = f->private_data;
+       struct vop_vdev *cur, *next;
+       struct vop_info *vi = vdev->vi;
+       bool found = false;
+
+       mutex_lock(&vdev->vdev_mutex);
+       if (!vdev->deleted) {
+               mutex_lock(&vi->vop_mutex);
+               list_for_each_entry_safe(cur, next, &vi->vdev_list, list) {
+                       if (cur != vdev)
+                               continue;
+                       vop_virtio_del_device(vdev);
+                       list_del(&cur->list);
+                       found = true;
+                       break;
+               }
+               mutex_unlock(&vi->vop_mutex);
+       }
+       mutex_unlock(&vdev->vdev_mutex);
+
+       /* Not removed here: wait on the destroy completion before freeing */
+       if (!found)
+               wait_for_completion(&vdev->destroy);
+       f->private_data = NULL;
+       kfree(vdev);
+       return 0;
+}
+
+/*
+ * ioctl handler for the VOP misc device.
+ *
+ * MIC_VIRTIO_ADD_DEVICE:    copy a device descriptor from user space, add
+ *                           the virtio device and put it on the vop_info
+ *                           device list.
+ * MIC_VIRTIO_COPY_DESC:     copy descriptors for an initialized device and
+ *                           write the resulting out_len back to user space.
+ * MIC_VIRTIO_CONFIG_CHANGE: copy config_len bytes of new config space from
+ *                           user space and apply them.
+ *
+ * Returns the result of the selected operation, -EFAULT when a user copy
+ * fails, -EINVAL for an invalid descriptor, -ENOMEM on allocation failure
+ * and -ENOIOCTLCMD for an unknown command.
+ */
+static long vop_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+       struct vop_vdev *vdev = f->private_data;
+       struct vop_info *vi = vdev->vi;
+       void __user *argp = (void __user *)arg;
+       int ret;
+
+       switch (cmd) {
+       case MIC_VIRTIO_ADD_DEVICE:
+       {
+               struct mic_device_desc dd, *dd_config;
+
+               /* First fetch: header only, used to validate and size */
+               if (copy_from_user(&dd, argp, sizeof(dd)))
+                       return -EFAULT;
+
+               if (mic_aligned_desc_size(&dd) > MIC_MAX_DESC_BLK_SIZE ||
+                   dd.num_vq > MIC_MAX_VRINGS)
+                       return -EINVAL;
+
+               dd_config = kzalloc(mic_desc_size(&dd), GFP_KERNEL);
+               if (!dd_config)
+                       return -ENOMEM;
+               /* Second fetch: the complete descriptor */
+               if (copy_from_user(dd_config, argp, mic_desc_size(&dd))) {
+                       ret = -EFAULT;
+                       goto free_ret;
+               }
+               /*
+                * User space may have modified the descriptor between the
+                * two fetches. Reject it unless the header still matches
+                * the one that was validated and used to size the buffer.
+                */
+               if (memcmp(&dd, dd_config, sizeof(dd))) {
+                       ret = -EINVAL;
+                       goto free_ret;
+               }
+               mutex_lock(&vdev->vdev_mutex);
+               mutex_lock(&vi->vop_mutex);
+               ret = vop_virtio_add_device(vdev, dd_config);
+               if (ret)
+                       goto unlock_ret;
+               list_add_tail(&vdev->list, &vi->vdev_list);
+unlock_ret:
+               mutex_unlock(&vi->vop_mutex);
+               mutex_unlock(&vdev->vdev_mutex);
+free_ret:
+               kfree(dd_config);
+               return ret;
+       }
+       case MIC_VIRTIO_COPY_DESC:
+       {
+               struct mic_copy_desc copy;
+
+               mutex_lock(&vdev->vdev_mutex);
+               ret = vop_vdev_inited(vdev);
+               if (ret)
+                       goto _unlock_ret;
+
+               if (copy_from_user(&copy, argp, sizeof(copy))) {
+                       ret = -EFAULT;
+                       goto _unlock_ret;
+               }
+
+               ret = vop_virtio_copy_desc(vdev, &copy);
+               if (ret < 0)
+                       goto _unlock_ret;
+               /* Report the number of bytes copied back to the caller */
+               if (copy_to_user(
+                       &((struct mic_copy_desc __user *)argp)->out_len,
+                       &copy.out_len, sizeof(copy.out_len)))
+                       ret = -EFAULT;
+_unlock_ret:
+               mutex_unlock(&vdev->vdev_mutex);
+               return ret;
+       }
+       case MIC_VIRTIO_CONFIG_CHANGE:
+       {
+               void *buf;
+
+               mutex_lock(&vdev->vdev_mutex);
+               ret = vop_vdev_inited(vdev);
+               if (ret)
+                       goto __unlock_ret;
+               buf = kzalloc(vdev->dd->config_len, GFP_KERNEL);
+               if (!buf) {
+                       ret = -ENOMEM;
+                       goto __unlock_ret;
+               }
+               if (copy_from_user(buf, argp, vdev->dd->config_len)) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+               ret = vop_virtio_config_change(vdev, buf);
+done:
+               kfree(buf);
+__unlock_ret:
+               mutex_unlock(&vdev->vdev_mutex);
+               return ret;
+       }
+       default:
+               return -ENOIOCTLCMD;
+       }
+}
+
+/*
+ * poll handler.
+ *
+ * POLLIN | POLLOUT is reported when new buffers are enqueued, not when
+ * previously enqueued buffers may still be available. In the card->host
+ * (TX) path this means that user space, once unblocked by poll, must
+ * drain all available descriptors or it can stall.
+ *
+ * POLLERR is reported when vop_vdev_inited() fails, either before or
+ * after registering on the device wait queue.
+ */
+static unsigned int vop_poll(struct file *f, poll_table *wait)
+{
+       struct vop_vdev *vdev = f->private_data;
+       int mask = 0;
+
+       mutex_lock(&vdev->vdev_mutex);
+       if (!vop_vdev_inited(vdev)) {
+               poll_wait(f, &vdev->waitq, wait);
+               /* Check the device state again after poll_wait() */
+               if (vop_vdev_inited(vdev)) {
+                       mask = POLLERR;
+               } else if (vdev->poll_wake) {
+                       /* Consume the wake-up flag */
+                       vdev->poll_wake = 0;
+                       mask = POLLIN | POLLOUT;
+               }
+       } else {
+               mask = POLLERR;
+       }
+       mutex_unlock(&vdev->vdev_mutex);
+       return mask;
+}
+
+/*
+ * Translate an mmap @offset into the physical address (@pa) and length
+ * (@size) of the region that starts exactly at that offset.
+ *
+ * Returns 0 on a match, -1 if @offset is not the start of any region.
+ */
+static inline int
+vop_query_offset(struct vop_vdev *vdev, unsigned long offset,
+                unsigned long *size, unsigned long *pa)
+{
+       struct vop_device *vpdev = vdev->vpdev;
+       unsigned long region_start;
+       int vq;
+
+       /*
+        * MMAP interface is as follows:
+        * offset                               region
+        * 0x0                                  virtio device_page
+        * 0x1000                               first vring
+        * 0x1000 + size of 1st vring           second vring
+        * ....
+        */
+       if (offset == 0) {
+               *pa = virt_to_phys(vpdev->hw_ops->get_dp(vpdev));
+               *size = MIC_DP_SIZE;
+               return 0;
+       }
+
+       /* The vrings follow the device page back to back */
+       region_start = MIC_DP_SIZE;
+       for (vq = 0; vq < vdev->dd->num_vq; vq++) {
+               struct vop_vringh *vvr = &vdev->vvr[vq];
+
+               if (offset != region_start) {
+                       region_start += vvr->vring.len;
+                       continue;
+               }
+               *pa = virt_to_phys(vvr->vring.va);
+               *size = vvr->vring.len;
+               return 0;
+       }
+       return -1;
+}
+
+/*
+ * Maps the device page and virtio rings to user space for readonly access.
+ *
+ * The file offset selects the first region (see vop_query_offset()) and
+ * consecutive regions are mapped back to back until the whole VMA is
+ * covered.
+ *
+ * Returns 0 on success, the error from vop_vdev_inited() if the device is
+ * not initialized, -EACCES for a writable mapping, -EINVAL if an offset
+ * does not fall on a region start, or the error from remap_pfn_range().
+ */
+static int vop_mmap(struct file *f, struct vm_area_struct *vma)
+{
+       struct vop_vdev *vdev = f->private_data;
+       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned long addr = vma->vm_start;
+       unsigned long size_rem = vma->vm_end - vma->vm_start;
+       unsigned long pa, size;
+       int err;
+
+       err = vop_vdev_inited(vdev);
+       if (err)
+               goto ret;
+       if (vma->vm_flags & VM_WRITE) {
+               err = -EACCES;
+               goto ret;
+       }
+       while (size_rem) {
+               if (vop_query_offset(vdev, offset, &size, &pa) < 0) {
+                       err = -EINVAL;
+                       goto ret;
+               }
+               /*
+                * Never map past the end of the VMA: a region longer than
+                * what is left would otherwise wrap size_rem and spill
+                * beyond vm_end.
+                */
+               if (size > size_rem)
+                       size = size_rem;
+               /*
+                * The user address advances from vm_start independently of
+                * the file offset, so a non-zero mmap offset still lands
+                * inside the VMA.
+                */
+               err = remap_pfn_range(vma, addr, pa >> PAGE_SHIFT, size,
+                                     vma->vm_page_prot);
+               if (err)
+                       goto ret;
+               size_rem -= size;
+               offset += size;
+               addr += size;
+       }
+ret:
+       return err;
+}
+
+/* File operations backing the per-device VOP misc character device */
+static const struct file_operations vop_fops = {
+       .owner = THIS_MODULE,
+       .open = vop_open,
+       .release = vop_release,
+       .mmap = vop_mmap,
+       .poll = vop_poll,
+       .unlocked_ioctl = vop_ioctl,
+};
+
+/*
+ * Host side initialization: set up the empty virtio device list, take the
+ * DMA channel from the VOP device and register a dynamic-minor misc
+ * device named "vop_virtio<index>".
+ *
+ * Returns 0 on success or the error from misc_register().
+ */
+int vop_host_init(struct vop_info *vi)
+{
+       struct vop_device *vpdev = vi->vpdev;
+       struct miscdevice *mdev = &vi->miscdev;
+       int rc;
+
+       INIT_LIST_HEAD(&vi->vdev_list);
+       vi->dma_ch = vpdev->dma_ch;
+
+       snprintf(vi->name, sizeof(vi->name), "vop_virtio%d", vpdev->index);
+       mdev->minor = MISC_DYNAMIC_MINOR;
+       mdev->name = vi->name;
+       mdev->fops = &vop_fops;
+       mdev->parent = &vpdev->dev;
+
+       rc = misc_register(mdev);
+       if (!rc)
+               return 0;
+
+       dev_err(&vpdev->dev, "%s failed rc %d\n", __func__, rc);
+       return rc;
+}
+
+/*
+ * Host side teardown: reset all virtio devices, delete every device still
+ * on the vop_info list and deregister the misc device.
+ *
+ * Each device is unlinked under vop_mutex, then deleted and marked as
+ * deleted under its own vdev_mutex with vop_mutex dropped. The destroy
+ * completion is re-armed before the delete and signalled afterwards;
+ * vop_release() waits on it when it did not remove the device itself.
+ */
+void vop_host_uninit(struct vop_info *vi)
+{
+       struct list_head *pos, *tmp;
+       struct vop_vdev *vdev;
+
+       mutex_lock(&vi->vop_mutex);
+       vop_virtio_reset_devices(vi);
+       list_for_each_safe(pos, tmp, &vi->vdev_list) {
+               vdev = list_entry(pos, struct vop_vdev, list);
+               list_del(pos);
+               /* Re-arm the completion while still holding vop_mutex */
+               reinit_completion(&vdev->destroy);
+               /*
+                * vop_mutex is released for the duration of the delete.
+                * NOTE(review): tmp was sampled before the unlock; confirm
+                * nothing else can unlink that entry in this window.
+                */
+               mutex_unlock(&vi->vop_mutex);
+               mutex_lock(&vdev->vdev_mutex);
+               vop_virtio_del_device(vdev);
+               /* vop_release() checks this flag and skips its own delete */
+               vdev->deleted = true;
+               mutex_unlock(&vdev->vdev_mutex);
+               complete(&vdev->destroy);
+               mutex_lock(&vi->vop_mutex);
+       }
+       mutex_unlock(&vi->vop_mutex);
+       misc_deregister(&vi->miscdev);
+}