KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix hosts
author Paul Mackerras <paulus@ozlabs.org>
Thu, 19 Oct 2017 03:11:23 +0000 (14:11 +1100)
committer Paul Mackerras <paulus@ozlabs.org>
Wed, 1 Nov 2017 04:36:41 +0000 (15:36 +1100)
This patch removes the restriction that a radix host can only run
radix guests, allowing us to run HPT (hashed page table) guests as
well.  This is useful because it provides a way to run old guest
kernels that know about POWER8 but not POWER9.

Unfortunately, POWER9 currently has a restriction that all threads
in a given core must either all be in HPT mode, or all in radix mode.
This means that when entering a HPT guest, we have to obtain control
of all 4 threads in the core and get them to switch their LPIDR and
LPCR registers, even if they are not going to run a guest.  On guest
exit we also have to get all threads to switch LPIDR and LPCR back
to host values.

To make this feasible, we require that KVM not be in the "independent
threads" mode, and that the CPU cores be in single-threaded mode from
the host kernel's perspective (only thread 0 online; threads 1, 2 and
3 offline).  That allows us to use the same code as on POWER8 for
obtaining control of the secondary threads.

To manage the LPCR/LPIDR changes required, we extend the kvm_split_info
struct to contain the information needed by the secondary threads.
All threads perform a barrier synchronization (where all threads wait
for every other thread to reach the synchronization point) on guest
entry, both before and after loading LPCR and LPIDR.  On guest exit,
they all once again perform a barrier synchronization both before
and after loading host values into LPCR and LPIDR.

Finally, it is also currently necessary to flush the entire TLB every
time we enter a HPT guest on a radix host.  We do this on thread 0
with a loop of tlbiel instructions.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S

index 7cea76f11c26c66fc52e08aae5b37e6534ac79a8..ab386af2904fdb4bc08713c641027866a320a055 100644 (file)
@@ -82,6 +82,16 @@ struct kvm_split_mode {
        u8              do_nap;
        u8              napped[MAX_SMT_THREADS];
        struct kvmppc_vcore *vc[MAX_SUBCORES];
+       /* Bits for changing lpcr on P9 */
+       unsigned long   lpcr_req;
+       unsigned long   lpidr_req;
+       unsigned long   host_lpcr;
+       u32             do_set;
+       u32             do_restore;
+       union {
+               u32     allphases;
+               u8      phase[4];
+       } lpcr_sync;
 };
 
 /*
@@ -107,7 +117,8 @@ struct kvmppc_host_state {
        u8 hwthread_req;
        u8 hwthread_state;
        u8 host_ipi;
-       u8 ptid;
+       u8 ptid;                /* thread number within subcore when split */
+       u8 tid;                 /* thread number within whole core */
        struct kvm_vcpu *kvm_vcpu;
        struct kvmppc_vcore *kvm_vcore;
        void __iomem *xics_phys;
index 8cfb20e38cfe9740c168c6389223493a034c6c5a..519fad556113196273fd38d13f4d18c7918d5ebd 100644 (file)
@@ -642,6 +642,7 @@ int main(void)
        HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
        HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
        HSTATE_FIELD(HSTATE_PTID, ptid);
+       HSTATE_FIELD(HSTATE_TID, tid);
        HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
        HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
        HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -667,6 +668,8 @@ int main(void)
        OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
        OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
        OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+       OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+       OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
index b5fbf76179522c4905192c60bb86e0fce7ae4c19..fff62fdf1464f72ac443d128ac51f0f7d76693de 100644 (file)
@@ -2414,6 +2414,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        if (!cpu_has_feature(CPU_FTR_ARCH_207S))
                return false;
 
+       /* POWER9 currently requires all threads to be in the same MMU mode */
+       if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+           kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+               return false;
+
        if (n_threads < cip->max_subcore_threads)
                n_threads = cip->max_subcore_threads;
        if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2452,9 +2457,6 @@ static void prepare_threads(struct kvmppc_vcore *vc)
        for_each_runnable_thread(i, vcpu, vc) {
                if (signal_pending(vcpu->arch.run_task))
                        vcpu->arch.ret = -EINTR;
-               else if (kvm_is_radix(vc->kvm) != radix_enabled())
-                       /* can't actually run HPT guest on radix host yet... */
-                       vcpu->arch.ret = -EINVAL;
                else if (vcpu->arch.vpa.update_pending ||
                         vcpu->arch.slb_shadow.update_pending ||
                         vcpu->arch.dtl.update_pending)
@@ -2643,6 +2645,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        int controlled_threads;
        int trap;
        bool is_power8;
+       bool hpt_on_radix;
 
        /*
         * Remove from the list any threads that have a signal pending
@@ -2671,9 +2674,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         * Make sure we are running on primary threads, and that secondary
         * threads are offline.  Also check if the number of threads in this
         * guest are greater than the current system threads per guest.
+        * On POWER9, we need to be not in independent-threads mode if
+        * this is a HPT guest on a radix host.
         */
-       if ((controlled_threads > 1) &&
-           ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+       hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+       if (((controlled_threads > 1) &&
+            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+           (hpt_on_radix && vc->kvm->arch.threads_indep)) {
                for_each_runnable_thread(i, vcpu, vc) {
                        vcpu->arch.ret = -EBUSY;
                        kvmppc_remove_runnable(vc, vcpu);
@@ -2739,7 +2746,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
                && !cpu_has_feature(CPU_FTR_ARCH_300);
 
-       if (split > 1) {
+       if (split > 1 || hpt_on_radix) {
                sip = &split_info;
                memset(&split_info, 0, sizeof(split_info));
                for (sub = 0; sub < core_info.n_subcores; ++sub)
@@ -2761,13 +2768,24 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                        split_info.subcore_size = subcore_size;
                } else {
                        split_info.subcore_size = 1;
+                       if (hpt_on_radix) {
+                               /* Use the split_info for LPCR/LPIDR changes */
+                               split_info.lpcr_req = vc->lpcr;
+                               split_info.lpidr_req = vc->kvm->arch.lpid;
+                               split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+                               split_info.do_set = 1;
+                       }
                }
 
                /* order writes to split_info before kvm_split_mode pointer */
                smp_wmb();
        }
-       for (thr = 0; thr < controlled_threads; ++thr)
+
+       for (thr = 0; thr < controlled_threads; ++thr) {
+               paca[pcpu + thr].kvm_hstate.tid = thr;
+               paca[pcpu + thr].kvm_hstate.napping = 0;
                paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+       }
 
        /* Initiate micro-threading (split-core) on POWER8 if required */
        if (cmd_bit) {
@@ -2820,8 +2838,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         * When doing micro-threading, poke the inactive threads as well.
         * This gets them to the nap instruction after kvm_do_nap,
         * which reduces the time taken to unsplit later.
+        * For POWER9 HPT guest on radix host, we need all the secondary
+        * threads woken up so they can do the LPCR/LPIDR change.
         */
-       if (cmd_bit) {
+       if (cmd_bit || hpt_on_radix) {
                split_info.do_nap = 1;  /* ask secondaries to nap when done */
                for (thr = 1; thr < threads_per_subcore; ++thr)
                        if (!(active & (1 << thr)))
@@ -2879,8 +2899,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                        cpu_relax();
                        ++loops;
                }
-               split_info.do_nap = 0;
+       } else if (hpt_on_radix) {
+               /* Wait for all threads to have seen final sync */
+               for (thr = 1; thr < controlled_threads; ++thr) {
+                       while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+                               HMT_low();
+                               barrier();
+                       }
+                       HMT_medium();
+               }
        }
+       split_info.do_nap = 0;
 
        kvmppc_set_host_core(pcpu);
 
index e38cc2df6d2a8a48a479b00b25c05b90c9394a7c..49a2c7825e045662bf1080d0888a00a0f076da6a 100644 (file)
@@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap)
        struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
        int ptid = local_paca->kvm_hstate.ptid;
        struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
-       int me, ee, i;
+       int me, ee, i, t;
+       int cpu0;
 
        /* Set our bit in the threads-exiting-guest map in the 0xff00
           bits of vcore->entry_exit_map */
@@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap)
                if ((ee >> 8) == 0)
                        kvmhv_interrupt_vcore(vc, ee);
        }
+
+       /*
+        * On POWER9 when running a HPT guest on a radix host (sip != NULL),
+        * we have to interrupt inactive CPU threads to get them to
+        * restore the host LPCR value.
+        */
+       if (sip->lpcr_req) {
+               if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
+                       vc = local_paca->kvm_hstate.kvm_vcore;
+                       cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
+                       for (t = 1; t < threads_per_core; ++t) {
+                               if (sip->napped[t])
+                                       kvmhv_rm_send_ipi(cpu0 + t);
+                       }
+               }
+       }
 }
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -619,3 +636,83 @@ void kvmppc_bad_interrupt(struct pt_regs *regs)
        die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
        panic("Bad KVM trap");
 }
+
+/*
+ * Functions used to switch LPCR HR and UPRT bits on all threads
+ * when entering and exiting HPT guests on a radix host.
+ */
+
+#define PHASE_REALMODE         1       /* in real mode */
+#define PHASE_SET_LPCR         2       /* have set LPCR */
+#define PHASE_OUT_OF_GUEST     4       /* have finished executing in guest */
+#define PHASE_RESET_LPCR       8       /* have reset LPCR to host value */
+
+#define ALL(p)         (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
+
+static void wait_for_sync(struct kvm_split_mode *sip, int phase)
+{
+       int thr = local_paca->kvm_hstate.tid;
+
+       sip->lpcr_sync.phase[thr] |= phase;
+       phase = ALL(phase);
+       while ((sip->lpcr_sync.allphases & phase) != phase) {
+               HMT_low();
+               barrier();
+       }
+       HMT_medium();
+}
+
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
+{
+       unsigned long rb, set;
+
+       /* wait for every other thread to get to real mode */
+       wait_for_sync(sip, PHASE_REALMODE);
+
+       /* Set LPCR and LPIDR */
+       mtspr(SPRN_LPCR, sip->lpcr_req);
+       mtspr(SPRN_LPID, sip->lpidr_req);
+       isync();
+
+       /* Invalidate the TLB on thread 0 */
+       if (local_paca->kvm_hstate.tid == 0) {
+               sip->do_set = 0;
+               asm volatile("ptesync" : : : "memory");
+               for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
+                       rb = TLBIEL_INVAL_SET_LPID +
+                               (set << TLBIEL_INVAL_SET_SHIFT);
+                       asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
+                                    "r" (rb), "r" (0));
+               }
+               asm volatile("ptesync" : : : "memory");
+       }
+
+       /* indicate that we have done so and wait for others */
+       wait_for_sync(sip, PHASE_SET_LPCR);
+       /* order read of sip->lpcr_sync.allphases vs. sip->do_set */
+       smp_rmb();
+}
+
+/*
+ * Called when a thread that has been in the guest needs
+ * to reload the host LPCR value - but only on POWER9 when
+ * running a HPT guest on a radix host.
+ */
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
+{
+       /* we're out of the guest... */
+       wait_for_sync(sip, PHASE_OUT_OF_GUEST);
+
+       mtspr(SPRN_LPID, 0);
+       mtspr(SPRN_LPCR, sip->host_lpcr);
+       isync();
+
+       if (local_paca->kvm_hstate.tid == 0) {
+               sip->do_restore = 0;
+               smp_wmb();      /* order store of do_restore vs. phase */
+       }
+
+       wait_for_sync(sip, PHASE_RESET_LPCR);
+       smp_mb();
+       local_paca->kvm_hstate.kvm_split_mode = NULL;
+}
index ae6c61641ade59371d51e62fc99b8ad451a19a82..7add18930e6d092805c41df8831cafc59ca7fb29 100644 (file)
@@ -82,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
        RFI
 
 kvmppc_call_hv_entry:
+BEGIN_FTR_SECTION
+       /* On P9, do LPCR setting, if necessary */
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r3, 0
+       beq     46f
+       lwz     r4, KVM_SPLIT_DO_SET(r3)
+       cmpwi   r4, 0
+       beq     46f
+       bl      kvmhv_p9_set_lpcr
+       nop
+46:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
        ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_hv_entry
 
@@ -385,10 +398,10 @@ kvm_secondary_got_guest:
        ld      r6, 0(r6)
        mtspr   SPRN_HDEC, r6
        /* and set per-LPAR registers, if doing dynamic micro-threading */
-BEGIN_FTR_SECTION
        ld      r6, HSTATE_SPLIT_MODE(r13)
        cmpdi   r6, 0
        beq     63f
+BEGIN_FTR_SECTION
        ld      r0, KVM_SPLIT_RPR(r6)
        mtspr   SPRN_RPR, r0
        ld      r0, KVM_SPLIT_PMMAR(r6)
@@ -396,7 +409,15 @@ BEGIN_FTR_SECTION
        ld      r0, KVM_SPLIT_LDBAR(r6)
        mtspr   SPRN_LDBAR, r0
        isync
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+FTR_SECTION_ELSE
+       /* On P9 we use the split_info for coordinating LPCR changes */
+       lwz     r4, KVM_SPLIT_DO_SET(r6)
+       cmpwi   r4, 0
+       beq     63f
+       mr      r3, r6
+       bl      kvmhv_p9_set_lpcr
+       nop
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 63:
        /* Order load of vcpu after load of vcore */
        lwsync
@@ -467,6 +488,12 @@ kvm_no_guest:
        ld      r3, HSTATE_SPLIT_MODE(r13)
        cmpdi   r3, 0
        beq     kvm_no_guest
+       lwz     r0, KVM_SPLIT_DO_SET(r3)
+       cmpwi   r0, 0
+       bne     kvmhv_do_set
+       lwz     r0, KVM_SPLIT_DO_RESTORE(r3)
+       cmpwi   r0, 0
+       bne     kvmhv_do_restore
        lbz     r0, KVM_SPLIT_DO_NAP(r3)
        cmpwi   r0, 0
        beq     kvm_no_guest
@@ -479,6 +506,19 @@ kvm_no_guest:
        stb     r0, HSTATE_HWTHREAD_STATE(r13)
        b       kvm_no_guest
 
+kvmhv_do_set:
+       /* Set LPCR, LPIDR etc. on P9 */
+       HMT_MEDIUM
+       bl      kvmhv_p9_set_lpcr
+       nop
+       b       kvm_no_guest
+
+kvmhv_do_restore:
+       HMT_MEDIUM
+       bl      kvmhv_p9_restore_lpcr
+       nop
+       b       kvm_no_guest
+
 /*
  * Here the primary thread is trying to return the core to
  * whole-core mode, so we need to nap.
@@ -516,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        /* Set kvm_split_mode.napped[tid] = 1 */
        ld      r3, HSTATE_SPLIT_MODE(r13)
        li      r0, 1
-       lhz     r4, PACAPACAINDEX(r13)
-       clrldi  r4, r4, 61      /* micro-threading => P8 => 8 threads/core */
+       lbz     r4, HSTATE_TID(r13)
        addi    r4, r4, KVM_SPLIT_NAPPED
        stbx    r0, r3, r4
        /* Check the do_nap flag again after setting napped[] */
@@ -1911,10 +1950,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 19:    lis     r8,0x7fff               /* MAX_INT@h */
        mtspr   SPRN_HDEC,r8
 
-16:    ld      r8,KVM_HOST_LPCR(r4)
+16:
+BEGIN_FTR_SECTION
+       /* On POWER9 with HPT-on-radix we need to wait for all other threads */
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r3, 0
+       beq     47f
+       lwz     r8, KVM_SPLIT_DO_RESTORE(r3)
+       cmpwi   r8, 0
+       beq     47f
+       stw     r12, STACK_SLOT_TRAP(r1)
+       bl      kvmhv_p9_restore_lpcr
+       nop
+       lwz     r12, STACK_SLOT_TRAP(r1)
+       b       48f
+47:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+       ld      r8,KVM_HOST_LPCR(r4)
        mtspr   SPRN_LPCR,r8
        isync
-
+48:
        /* load host SLB entries */
 BEGIN_MMU_FTR_SECTION
        b       0f