KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
author Paul Mackerras <paulus@ozlabs.org>
Mon, 8 Oct 2018 05:31:03 +0000 (16:31 +1100)
committer Michael Ellerman <mpe@ellerman.id.au>
Tue, 9 Oct 2018 05:04:27 +0000 (16:04 +1100)
This starts the process of adding the code to support nested HV-style
virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
a nested hypervisor can use to set the base address and size of a
partition table in its memory (analogous to the PTCR register).
On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
hypercall from the guest is handled by code that saves the virtual
PTCR value for the guest.

This also adds code for creating and destroying nested guests and for
reading the partition table entry for a nested guest from L1 memory.
Each nested guest has its own shadow LPID value, different in general
from the LPID value used by the nested hypervisor to refer to it.  The
shadow LPID value is allocated at nested guest creation time.

Nested hypervisor functionality is only available for a radix guest,
which therefore means a radix host on a POWER9 (or later) processor.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_nested.c [new file with mode: 0644]

index a0b17f9f1ea4e5c40e1b2d7a2c6e1aa913759ea5..c95c6518bf27079a729cb1026a54a0b78b8b6bbb 100644 (file)
 #define H_GET_24X7_DATA                0xF07C
 #define H_GET_PERF_COUNTER_INFO        0xF080
 
+/* Platform-specific hcalls used for nested HV KVM */
+#define H_SET_PARTITION_TABLE  0xF800
+#define H_ENTER_NESTED         0xF804
+#define H_TLB_INVALIDATE       0xF808
+
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR          1
 #define H_SET_MODE_RESOURCE_SET_DAWR           2
index 91c977948828155c09e598d61e5df35b28ac5a27..43f212e38b89e86f8b2fed6274d9bf83acb2d4a6 100644 (file)
@@ -274,6 +274,13 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+long kvmhv_nested_init(void);
+void kvmhv_nested_exit(void);
+void kvmhv_vm_nested_init(struct kvm *kvm);
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_release_all_nested(struct kvm *kvm);
+
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 extern int kvm_irq_bypass;
@@ -387,9 +394,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 /* TO = 31 for unconditional trap */
 #define INS_TW                         0x7fe00008
 
-/* LPIDs we support with this build -- runtime limit may be lower */
-#define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
-
 #define SPLIT_HACK_MASK                        0xff000000
 #define SPLIT_HACK_OFFS                        0xfb000000
 
index 5c0e2d9a7e15149dd1354fd75036a7d7e73daabf..6d67b6a9e78468b35f0d29f507ec6966b58dfdcb 100644 (file)
 #include <linux/string.h>
 #include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/cpu_has_feature.h>
+
+/*
+ * Returns true when CONFIG_PPC_PSERIES is enabled and the CPU does not
+ * have CPU_FTR_HVMODE; always false when CONFIG_PPC_PSERIES is off, so
+ * callers can use it to let the compiler drop pseries-only code.
+ */
+static inline bool kvmhv_on_pseries(void)
+{
+#ifdef CONFIG_PPC_PSERIES
+       return !cpu_has_feature(CPU_FTR_HVMODE);
+#else
+       return false;
+#endif
+}
+
+/*
+ * Structure for a nested guest, that is, for a guest that is managed by
+ * one of our guests.
+ *
+ * Instances are found through kvm->arch.nested_guests[], indexed by
+ * l1_lpid.  refcnt is modified with the owning VM's kvm->mmu_lock held;
+ * the pointer in nested_guests[] counts as one reference and the
+ * structure is released once refcnt drops to zero.  next is used to
+ * chain structures onto a temporary list while all nested guests of a
+ * VM are being released.
+ */
+struct kvm_nested_guest {
+       struct kvm *l1_host;            /* L1 VM that owns this nested guest */
+       int l1_lpid;                    /* lpid L1 guest thinks this guest is */
+       int shadow_lpid;                /* real lpid of this nested guest */
+       pgd_t *shadow_pgtable;          /* our page table for this guest */
+       u64 l1_gr_to_hr;                /* L1's addr of part'n-scoped table */
+       u64 process_table;              /* process table entry for this guest */
+       long refcnt;                    /* number of pointers to this struct */
+       struct mutex tlb_lock;          /* serialize page faults and tlbies */
+       struct kvm_nested_guest *next;
+};
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+                                         bool create);
+void kvmhv_put_nested(struct kvm_nested_guest *gp);
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
 #define PPC_MIN_HPT_ORDER      18
index d978fdf698af2ad5e89a4243e7bf2efe3e4e15bb..eb3ba6390108215c2a0c628c43a27266b1444ff5 100644 (file)
@@ -25,6 +25,9 @@
 #define XICS_MFRR              0xc
 #define XICS_IPI               2       /* interrupt source # for IPIs */
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
+
 /* Maximum number of threads per physical core */
 #define MAX_SMT_THREADS                8
 
index c9cc42f73b3c57b92a74a8214582266d4917c270..c35d4f2c4d908adf1053addce1f4e5cdd9bc51ba 100644 (file)
@@ -46,6 +46,7 @@
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #include <asm/kvm_book3s_asm.h>                /* for MAX_SMT_THREADS */
 #define KVM_MAX_VCPU_ID                (MAX_SMT_THREADS * KVM_MAX_VCORES)
+#define KVM_MAX_NESTED_GUESTS  KVMPPC_NR_LPIDS
 
 #else
 #define KVM_MAX_VCPU_ID                KVM_MAX_VCPUS
@@ -287,6 +288,7 @@ struct kvm_arch {
        u8 radix;
        u8 fwnmi_enabled;
        bool threads_indep;
+       bool nested_enable;
        pgd_t *pgtable;
        u64 process_table;
        struct dentry *debugfs_dir;
@@ -312,6 +314,9 @@ struct kvm_arch {
 #endif
        struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       u64 l1_ptcr;
+       int max_nested_lpid;
+       struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
        /* This array can grow quite large, keep it at the end */
        struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif
index f872c04bb5b1bb1185a7fcc9df1597540d3d3120..e814f40ab836e3bfc66f9e529299fb8202af2f6a 100644 (file)
@@ -75,7 +75,8 @@ kvm-hv-y += \
        book3s_hv.o \
        book3s_hv_interrupts.o \
        book3s_64_mmu_hv.o \
-       book3s_64_mmu_radix.o
+       book3s_64_mmu_radix.o \
+       book3s_hv_nested.o
 
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
        book3s_hv_tm.o
index 8c20a90a68519737677ccf97865850fc57f64064..d8fc49effab0b9ce0a6336eaa4b01b34254b5560 100644 (file)
@@ -934,6 +934,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                if (ret == H_TOO_HARD)
                        return RESUME_HOST;
                break;
+
+       case H_SET_PARTITION_TABLE:
+               ret = H_FUNCTION;
+               if (vcpu->kvm->arch.nested_enable)
+                       ret = kvmhv_set_partition_table(vcpu);
+               break;
+       case H_ENTER_NESTED:
+               ret = H_FUNCTION;
+               break;
+       case H_TLB_INVALIDATE:
+               ret = H_FUNCTION;
+               break;
+
        default:
                return RESUME_HOST;
        }
@@ -4157,8 +4170,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
                        __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
                dw1 = PATB_GR | kvm->arch.process_table;
        }
-
-       mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+       kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
 }
 
 /*
@@ -4254,6 +4266,10 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+       if (kvm->arch.nested_enable) {
+               kvm->arch.nested_enable = false;
+               kvmhv_release_all_nested(kvm);
+       }
        kvmppc_free_radix(kvm);
        kvmppc_update_lpcr(kvm, LPCR_VPM1,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
@@ -4374,6 +4390,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
        kvmppc_alloc_host_rm_ops();
 
+       kvmhv_vm_nested_init(kvm);
+
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@ -4517,8 +4535,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
        /* Perform global invalidation and return lpid to the pool */
        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               if (kvm->arch.nested_enable)
+                       kvmhv_release_all_nested(kvm);
                kvm->arch.process_table = 0;
-               kvmppc_setup_partition_table(kvm);
+               kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
        }
        kvmppc_free_lpid(kvm->arch.lpid);
 
@@ -4989,6 +5009,10 @@ static int kvmppc_book3s_init_hv(void)
        if (r < 0)
                return -ENODEV;
 
+       r = kvmhv_nested_init();
+       if (r)
+               return r;
+
        r = kvm_init_subcore_bitmap();
        if (r)
                return r;
@@ -5047,6 +5071,7 @@ static void kvmppc_book3s_exit_hv(void)
        if (kvmppc_radix_possible())
                kvmppc_radix_exit();
        kvmppc_hv_ops = NULL;
+       kvmhv_nested_exit();
 }
 
 module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644 (file)
index 0000000..3278262
--- /dev/null
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *        Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+
+/*
+ * Module-init hook for nested virtualization support.
+ *
+ * Does nothing and returns 0 unless kvmhv_on_pseries() is true.  In
+ * that case a partition table with room for KVMPPC_NR_LPIDS entries
+ * (rounded up to a power of two, minimum 256 entries) is allocated and
+ * handed to the parent hypervisor with H_SET_PARTITION_TABLE.
+ *
+ * Returns 0 on success, -ENODEV if radix is not enabled or the hcall
+ * fails, and -ENOMEM if the table cannot be allocated.
+ */
+long kvmhv_nested_init(void)
+{
+       long int ptb_order;
+       unsigned long ptcr;
+       long rc;
+
+       if (!kvmhv_on_pseries())
+               return 0;
+       if (!radix_enabled())
+               return -ENODEV;
+
+       /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+       ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+       if (ptb_order < 8)
+               ptb_order = 8;
+       /*
+        * Allocate the table zeroed: it is about to be shared with the
+        * parent hypervisor, and an all-zero entry is what
+        * kvmhv_release_nested() writes for an unused lpid, so no entry
+        * ever contains stale heap data.
+        */
+       pseries_partition_tb = kcalloc(1UL << ptb_order,
+                                      sizeof(struct patb_entry), GFP_KERNEL);
+       if (!pseries_partition_tb) {
+               pr_err("kvm-hv: failed to allocate nested partition table\n");
+               return -ENOMEM;
+       }
+
+       /* PTCR format: table base address | (log2(number of entries) - 8) */
+       ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+       rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+       if (rc != H_SUCCESS) {
+               pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+                      rc);
+               kfree(pseries_partition_tb);
+               pseries_partition_tb = NULL;
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * Module-exit counterpart of kvmhv_nested_init(): if a pseries
+ * partition table was set up, pass a PTCR value of 0 to the parent
+ * hypervisor and free the table.
+ */
+void kvmhv_nested_exit(void)
+{
+       /*
+        * N.B. kvmhv_on_pseries() is tested first because it is
+        * constant false when CONFIG_PPC_PSERIES=n, which enables the
+        * compiler to remove the call to plpar_hcall_norets() below.
+        */
+       if (!kvmhv_on_pseries() || !pseries_partition_tb)
+               return;
+
+       plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+       kfree(pseries_partition_tb);
+       pseries_partition_tb = NULL;
+}
+
+/*
+ * Write the two doublewords of the partition table entry for @lpid.
+ * With CPU_FTR_HVMODE this goes through
+ * mmu_partition_table_set_entry(); otherwise the entry in
+ * pseries_partition_tb is stored directly, in big-endian format.
+ */
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+       struct patb_entry *entry;
+
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               mmu_partition_table_set_entry(lpid, dw0, dw1);
+               return;
+       }
+
+       entry = &pseries_partition_tb[lpid];
+       entry->patb0 = cpu_to_be64(dw0);
+       entry->patb1 = cpu_to_be64(dw1);
+}
+
+/*
+ * Install the partition table entry for a nested guest's shadow lpid:
+ * dw0 describes the shadow radix page table, dw1 is the cached
+ * process table value.
+ */
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+       unsigned long patb0 = PATB_HR;
+
+       patb0 |= radix__get_tree_size();
+       patb0 |= __pa(gp->shadow_pgtable);
+       patb0 |= RADIX_PGD_INDEX_SIZE;
+
+       kvmhv_set_ptbl_entry(gp->shadow_lpid, patb0, gp->process_table);
+}
+
+/*
+ * Per-VM initialisation of the nested-guest state: max_nested_lpid is
+ * set to -1, the value used when nested_guests[] has no entries.
+ */
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+       kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * H_SET_PARTITION_TABLE hcall handler.
+ * r4 carries a value formatted as for the PTCR: the guest real address
+ * of the partition table combined with log_2(size) - 12.
+ * Returns H_PARAMETER if the value is rejected; otherwise records it in
+ * kvm->arch.l1_ptcr and returns H_SUCCESS.
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+       bool valid;
+       int idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       /*
+        * The size field may describe at most 4096 entries (because
+        * that's what hardware supports), and the base address must be
+        * a visible guest frame.
+        */
+       valid = (ptcr & PRTS_MASK) <= 12 - 8 &&
+               kvm_is_visible_gfn(kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       if (!valid)
+               return H_PARAMETER;
+
+       kvm->arch.l1_ptcr = ptcr;
+       return H_SUCCESS;
+}
+
+/*
+ * Refresh the cached copy of L1's partition table entry for a nested
+ * guest and rewrite the shadow partition table entry from it.  If the
+ * lpid lies outside L1's table or the guest read fails, the cached
+ * values are cleared to zero.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       struct patb_entry entry;
+       unsigned long entry_gpa;
+       int err = -EFAULT;
+
+       /* each partition table entry is 16 bytes */
+       entry_gpa = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+       if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
+               err = kvm_read_guest(kvm, entry_gpa, &entry, sizeof(entry));
+
+       if (!err) {
+               gp->l1_gr_to_hr = be64_to_cpu(entry.patb0);
+               gp->process_table = be64_to_cpu(entry.patb1);
+       } else {
+               gp->l1_gr_to_hr = 0;
+               gp->process_table = 0;
+       }
+
+       kvmhv_set_nested_ptbl(gp);
+}
+
+/*
+ * Allocate and initialise the bookkeeping for a new nested guest of
+ * @kvm that L1 refers to as @lpid: the structure itself, a shadow page
+ * table and a shadow lpid.  The returned structure has a refcnt of
+ * zero.  Returns NULL if any of the allocations fails.
+ */
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+       struct kvm_nested_guest *gp;
+       long new_lpid;
+
+       gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+       if (!gp)
+               return NULL;
+
+       gp->l1_host = kvm;
+       gp->l1_lpid = lpid;
+       mutex_init(&gp->tlb_lock);
+
+       gp->shadow_pgtable = pgd_alloc(kvm->mm);
+       if (!gp->shadow_pgtable)
+               goto err_free_gp;
+
+       new_lpid = kvmppc_alloc_lpid();
+       if (new_lpid < 0)
+               goto err_free_pgd;
+
+       gp->shadow_lpid = new_lpid;
+       return gp;
+
+ err_free_pgd:
+       pgd_free(kvm->mm, gp->shadow_pgtable);
+ err_free_gp:
+       kfree(gp);
+       return NULL;
+}
+
+/*
+ * Tear down a nested guest: clear its shadow partition table entry,
+ * give back the shadow lpid, free the shadow page table (if there is
+ * one) and finally free the structure itself.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+       int lpid = gp->shadow_lpid;
+       pgd_t *pgt = gp->shadow_pgtable;
+
+       kvmhv_set_ptbl_entry(lpid, 0, 0);
+       kvmppc_free_lpid(lpid);
+       if (pgt)
+               pgd_free(gp->l1_host->mm, pgt);
+       kfree(gp);
+}
+
+/*
+ * Unhook a nested guest from its L1 host's nested_guests[] table (if it
+ * is still the entry there), dropping the table's reference and
+ * lowering max_nested_lpid when the highest slot was vacated.  The
+ * guest is released if no references remain afterwards.
+ */
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       int slot = gp->l1_lpid;
+       long remaining;
+
+       spin_lock(&kvm->mmu_lock);
+       if (kvm->arch.nested_guests[slot] == gp) {
+               kvm->arch.nested_guests[slot] = NULL;
+               if (slot == kvm->arch.max_nested_lpid) {
+                       /* find the next lower occupied slot, or -1 */
+                       for (slot--; slot >= 0; slot--) {
+                               if (kvm->arch.nested_guests[slot])
+                                       break;
+                       }
+                       kvm->arch.max_nested_lpid = slot;
+               }
+               gp->refcnt--;
+       }
+       remaining = gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (!remaining)
+               kvmhv_release_nested(gp);
+}
+
+/*
+ * Drop every nested guest belonging to this VM.
+ * Called with no vcpus of the guest running, either when the guest is
+ * switched to HPT mode or when it is destroyed.
+ *
+ * Table entries are cleared under kvm->mmu_lock; guests whose last
+ * reference was the table's are collected on a private list and
+ * released once the lock has been dropped.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+       struct kvm_nested_guest *doomed = NULL;
+       struct kvm_nested_guest *gp;
+       int lpid;
+
+       spin_lock(&kvm->mmu_lock);
+       for (lpid = 0; lpid <= kvm->arch.max_nested_lpid; lpid++) {
+               gp = kvm->arch.nested_guests[lpid];
+               if (gp) {
+                       kvm->arch.nested_guests[lpid] = NULL;
+                       gp->refcnt--;
+                       if (!gp->refcnt) {
+                               gp->next = doomed;
+                               doomed = gp;
+                       }
+               }
+       }
+       kvm->arch.max_nested_lpid = -1;
+       spin_unlock(&kvm->mmu_lock);
+
+       while (doomed) {
+               gp = doomed;
+               doomed = gp->next;
+               kvmhv_release_nested(gp);
+       }
+}
+
+/*
+ * Reload the cached partition table entry for a nested guest and, if
+ * the reloaded l1_gr_to_hr value is zero, remove the guest from its L1
+ * host's table of nested guests.
+ * caller must hold gp->tlb_lock
+ */
+void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+       kvmhv_update_ptbl_cache(gp);
+       if (gp->l1_gr_to_hr == 0)
+               kvmhv_remove_nested(gp);
+}
+
+/*
+ * Look up the nested guest that L1 calls @l1_lpid and take a reference
+ * on it; the caller drops that reference with kvmhv_put_nested().
+ * If there is no such guest and @create is true, a new one is
+ * allocated and installed in kvm->arch.nested_guests[].
+ *
+ * Returns NULL if @l1_lpid is outside KVM_MAX_NESTED_GUESTS or the
+ * partition table size given by kvm->arch.l1_ptcr, if the guest does
+ * not exist and @create is false, or if allocation fails.
+ */
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+                                         bool create)
+{
+       struct kvm_nested_guest *found, *fresh;
+
+       if (l1_lpid >= KVM_MAX_NESTED_GUESTS)
+               return NULL;
+       if (l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+               return NULL;
+
+       spin_lock(&kvm->mmu_lock);
+       found = kvm->arch.nested_guests[l1_lpid];
+       if (found)
+               found->refcnt++;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (found)
+               return found;
+       if (!create)
+               return NULL;
+
+       /* allocate outside the spinlock, then recheck the slot */
+       fresh = kvmhv_alloc_nested(kvm, l1_lpid);
+       if (!fresh)
+               return NULL;
+
+       spin_lock(&kvm->mmu_lock);
+       found = kvm->arch.nested_guests[l1_lpid];
+       if (!found) {
+               /* slot still empty: install ours; the table holds a ref */
+               kvm->arch.nested_guests[l1_lpid] = fresh;
+               fresh->refcnt++;
+               found = fresh;
+               fresh = NULL;
+               if (kvm->arch.max_nested_lpid < l1_lpid)
+                       kvm->arch.max_nested_lpid = l1_lpid;
+       }
+       /* reference handed to the caller */
+       found->refcnt++;
+       spin_unlock(&kvm->mmu_lock);
+
+       /* someone else beat us to it: discard the unused allocation */
+       if (fresh)
+               kvmhv_release_nested(fresh);
+
+       return found;
+}
+
+/*
+ * Drop one reference to a nested guest, under the L1 host's mmu_lock,
+ * and release the guest if that was the last reference.
+ */
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       bool last;
+
+       spin_lock(&kvm->mmu_lock);
+       gp->refcnt--;
+       last = (gp->refcnt == 0);
+       spin_unlock(&kvm->mmu_lock);
+
+       if (last)
+               kvmhv_release_nested(gp);
+}