ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
-
obj-y := fault.o mem.o pgtable.o mmap.o \
init_$(BITS).o pgtable_$(BITS).o \
init-common.o mmu_context.o drmem.o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o
-hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o
-obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o \
- $(hash64-y) mmu_context_book3s64.o \
- pgtable-book3s64.o pgtable-frag.o
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/
+obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o
obj-$(CONFIG_PPC32) += pgtable-frag.o
-obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o
obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
-obj-$(CONFIG_PPC_BOOK3S) += tlb_hash$(BITS).o
-ifdef CONFIG_PPC_BOOK3S_64
-obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o
-obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o
-endif
+obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o
obj-$(CONFIG_40x) += 40x_mmu.o
obj-$(CONFIG_44x) += 44x_mmu.o
obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
-obj-$(CONFIG_PPC_SPLPAR) += vphn.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
obj-y += hugetlbpage.o
ifdef CONFIG_HUGETLB_PAGE
-obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o
-obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o
obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
-obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o
obj-$(CONFIG_PPC_PTDUMP) += ptdump/
-obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
# Disable kcov instrumentation on sensitive code
# This is necessary for booting with kcov enabled on book3e machines
KCOV_INSTRUMENT_tlb_nohash.o := n
KCOV_INSTRUMENT_fsl_booke_mmu.o := n
-
-# Instrumenting the SLB fault path can lead to duplicate SLB entries
-KCOV_INSTRUMENT_slb.o := n
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y := $(NO_MINIMAL_TOC)
+
+CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
+
+obj-y += hash_pgtable.o hash_utils.o slb.o \
+ mmu_context.o pgtable.o hash_tlb.o
+obj-$(CONFIG_PPC_NATIVE) += hash_native.o
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
+obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
+obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o
+obj-$(CONFIG_PPC_SPLPAR) += vphn.o
+obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o
+ifdef CONFIG_HUGETLB_PAGE
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o
+endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o
+obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
+
+# Instrumenting the SLB fault path can lead to duplicate SLB entries
+KCOV_INSTRUMENT_slb.o := n
--- /dev/null
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, unsigned long flags,
+ int ssize, int subpg_prot)
+{
+ real_pte_t rpte;
+ unsigned long hpte_group;
+ unsigned long rflags, pa;
+ unsigned long old_pte, new_pte;
+ unsigned long vpn, hash, slot;
+ unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+ /*
+ * atomically mark the linux large page PTE busy and dirty
+ */
+ do {
+ pte_t pte = READ_ONCE(*ptep);
+
+ old_pte = pte_val(pte);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access. Since this is 4K insert of 64K page size
+ * also add H_PAGE_COMBO
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /*
+ * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ rflags = htab_convert_pte_flags(new_pte);
+ rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+
+ if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+ /*
+ * There MIGHT be an HPTE for this pte
+ */
+ unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize,
+ rpte, 0);
+
+ if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K,
+ MMU_PAGE_4K, ssize, flags) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+ hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags,
+ HPTE_V_SECONDARY,
+ MMU_PAGE_4K,
+ MMU_PAGE_4K, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+ mmu_hash_ops.hpte_remove(hpte_group);
+ /*
+ * FIXME!! Should be try the group from which we removed ?
+ */
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+ return -1;
+ }
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+ new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+ }
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright IBM Corporation, 2015
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+
+/*
+ * Return true, if the entry has a slot value which
+ * the software considers as invalid.
+ */
+static inline bool hpte_soft_invalid(unsigned long hidx)
+{
+ return ((hidx & 0xfUL) == 0xfUL);
+}
+
+/*
+ * index from 0 - 15
+ */
+bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
+{
+ return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index)));
+}
+
+int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, unsigned long flags,
+ int ssize, int subpg_prot)
+{
+ real_pte_t rpte;
+ unsigned long hpte_group;
+ unsigned int subpg_index;
+ unsigned long rflags, pa;
+ unsigned long old_pte, new_pte, subpg_pte;
+ unsigned long vpn, hash, slot, gslot;
+ unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
+
+ /*
+ * atomically mark the linux large page PTE busy and dirty
+ */
+ do {
+ pte_t pte = READ_ONCE(*ptep);
+
+ old_pte = pte_val(pte);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access. Since this is 4K insert of 64K page size
+ * also add H_PAGE_COMBO
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /*
+ * Handle the subpage protection bits
+ */
+ subpg_pte = new_pte & ~subpg_prot;
+ rflags = htab_convert_pte_flags(subpg_pte);
+
+ if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+
+ subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
+ vpn = hpt_vpn(ea, vsid, ssize);
+ rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+ /*
+ *None of the sub 4k page is hashed
+ */
+ if (!(old_pte & H_PAGE_HASHPTE))
+ goto htab_insert_hpte;
+ /*
+ * Check if the pte was already inserted into the hash table
+ * as a 64k HW page, and invalidate the 64k HPTE if so.
+ */
+ if (!(old_pte & H_PAGE_COMBO)) {
+ flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
+ /*
+ * clear the old slot details from the old and new pte.
+ * On hash insert failure we use old pte value and we don't
+ * want slot information there if we have a insert failure.
+ */
+ old_pte &= ~H_PAGE_HASHPTE;
+ new_pte &= ~H_PAGE_HASHPTE;
+ goto htab_insert_hpte;
+ }
+ /*
+ * Check for sub page valid and update
+ */
+ if (__rpte_sub_valid(rpte, subpg_index)) {
+ int ret;
+
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte,
+ subpg_index);
+ ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
+ MMU_PAGE_4K, MMU_PAGE_4K,
+ ssize, flags);
+
+ /*
+ * If we failed because typically the HPTE wasn't really here
+ * we try an insertion.
+ */
+ if (ret == -1)
+ goto htab_insert_hpte;
+
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+ }
+
+htab_insert_hpte:
+
+ /*
+ * Initialize all hidx entries to invalid value, the first time
+ * the PTE is about to allocate a 4K HPTE.
+ */
+ if (!(old_pte & H_PAGE_COMBO))
+ rpte.hidx = INVALID_RPTE_HIDX;
+
+ /*
+ * handle H_PAGE_4K_PFN case
+ */
+ if (old_pte & H_PAGE_4K_PFN) {
+ /*
+ * All the sub 4k page have the same
+ * physical address.
+ */
+ pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT;
+ } else {
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+ pa += (subpg_index << shift);
+ }
+ hash = hpt_hash(vpn, shift, ssize);
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ MMU_PAGE_4K, MMU_PAGE_4K, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ bool soft_invalid;
+
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags, HPTE_V_SECONDARY,
+ MMU_PAGE_4K, MMU_PAGE_4K,
+ ssize);
+
+ soft_invalid = hpte_soft_invalid(slot);
+ if (unlikely(soft_invalid)) {
+ /*
+ * We got a valid slot from a hardware point of view.
+ * but we cannot use it, because we use this special
+ * value; as defined by hpte_soft_invalid(), to track
+ * invalid slots. We cannot use it. So invalidate it.
+ */
+ gslot = slot & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn,
+ MMU_PAGE_4K, MMU_PAGE_4K,
+ ssize, 0);
+ }
+
+ if (unlikely(slot == -1 || soft_invalid)) {
+ /*
+ * For soft invalid slot, let's ensure that we release a
+ * slot from the primary, with the hope that we will
+ * acquire that slot next time we try. This will ensure
+ * that we do not get the same soft-invalid slot.
+ */
+ if (soft_invalid || (mftb() & 0x1))
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ mmu_hash_ops.hpte_remove(hpte_group);
+ /*
+ * FIXME!! Should be try the group from which we removed ?
+ */
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
+ return -1;
+ }
+
+ new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
+ new_pte |= H_PAGE_HASHPTE;
+
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
+
+int __hash_page_64K(unsigned long ea, unsigned long access,
+ unsigned long vsid, pte_t *ptep, unsigned long trap,
+ unsigned long flags, int ssize)
+{
+ real_pte_t rpte;
+ unsigned long hpte_group;
+ unsigned long rflags, pa;
+ unsigned long old_pte, new_pte;
+ unsigned long vpn, hash, slot;
+ unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift;
+
+ /*
+ * atomically mark the linux large page PTE busy and dirty
+ */
+ do {
+ pte_t pte = READ_ONCE(*ptep);
+
+ old_pte = pte_val(pte);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+ /*
+ * Check if PTE has the cache-inhibit bit set
+ * If so, bail out and refault as a 4k page
+ */
+ if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
+ unlikely(pte_ci(pte)))
+ return 0;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access.
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ rflags = htab_convert_pte_flags(new_pte);
+ rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
+
+ if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+ !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+ unsigned long gslot;
+
+ /*
+ * There MIGHT be an HPTE for this pte
+ */
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+ if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
+ MMU_PAGE_64K, ssize,
+ flags) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+ hash = hpt_hash(vpn, shift, ssize);
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ MMU_PAGE_64K, MMU_PAGE_64K,
+ ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags,
+ HPTE_V_SECONDARY,
+ MMU_PAGE_64K,
+ MMU_PAGE_64K, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+ mmu_hash_ops.hpte_remove(hpte_group);
+ /*
+ * FIXME!! Should be try the group from which we removed ?
+ */
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
+ return -1;
+ }
+
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+ new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+ }
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+ pmd_t *pmdp, unsigned long trap, unsigned long flags,
+ int ssize, unsigned int psize)
+{
+ unsigned int index, valid;
+ unsigned char *hpte_slot_array;
+ unsigned long rflags, pa, hidx;
+ unsigned long old_pmd, new_pmd;
+ int ret, lpsize = MMU_PAGE_16M;
+ unsigned long vpn, hash, shift, slot;
+
+ /*
+ * atomically mark the linux large page PMD busy and dirty
+ */
+ do {
+ pmd_t pmd = READ_ONCE(*pmdp);
+
+ old_pmd = pmd_val(pmd);
+ /* If PMD busy, retry the access */
+ if (unlikely(old_pmd & H_PAGE_BUSY))
+ return 0;
+ /* If PMD permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pmd)))
+ return 1;
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pmd |= _PAGE_DIRTY;
+ } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+
+ /*
+ * Make sure this is thp or devmap entry
+ */
+ if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+ return 0;
+
+ rflags = htab_convert_pte_flags(new_pmd);
+
+#if 0
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+ }
+#endif
+ /*
+ * Find the slot index details for this ea, using base page size.
+ */
+ shift = mmu_psize_defs[psize].shift;
+ index = (ea & ~HPAGE_PMD_MASK) >> shift;
+ BUG_ON(index >= PTE_FRAG_SIZE);
+
+ vpn = hpt_vpn(ea, vsid, ssize);
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+ if (psize == MMU_PAGE_4K) {
+ /*
+ * invalidate the old hpte entry if we have that mapped via 64K
+ * base page size. This is because demote_segment won't flush
+ * hash page table entries.
+ */
+ if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
+ flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
+ ssize, flags);
+ /*
+ * With THP, we also clear the slot information with
+ * respect to all the 64K hash pte mapping the 16MB
+ * page. They are all invalid now. This make sure we
+ * don't find the slot valid when we fault with 4k
+ * base page size.
+ *
+ */
+ memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
+ }
+ }
+
+ valid = hpte_valid(hpte_slot_array, index);
+ if (valid) {
+ /* update the hpte bits */
+ hash = hpt_hash(vpn, shift, ssize);
+ hidx = hpte_hash_index(hpte_slot_array, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
+ psize, lpsize, ssize, flags);
+ /*
+ * We failed to update, try to insert a new entry.
+ */
+ if (ret == -1) {
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ valid = 0;
+ hpte_slot_array[index] = 0;
+ }
+ }
+
+ if (!valid) {
+ unsigned long hpte_group;
+
+ hash = hpt_hash(vpn, shift, ssize);
+ /* insert new entry */
+ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+ new_pmd |= H_PAGE_HASHPTE;
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+ psize, lpsize, ssize);
+ /*
+ * Primary is full, try the secondary
+ */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
+ rflags,
+ HPTE_V_SECONDARY,
+ psize, lpsize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+
+ mmu_hash_ops.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+ /*
+ * Hypervisor failure. Restore old pmd and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *pmdp = __pmd(old_pmd);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ psize, lpsize, old_pmd);
+ return -1;
+ }
+ /*
+ * large pte is marked busy, so we can be sure
+ * nobody is looking at hpte_slot_array. hence we can
+ * safely update this here.
+ */
+ mark_hpte_slot_valid(hpte_slot_array, index, slot);
+ }
+ /*
+ * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
+ * base page size 4k.
+ */
+ if (psize == MMU_PAGE_4K)
+ new_pmd |= H_PAGE_COMBO;
+ /*
+ * The hpte valid is stored in the pgtable whose address is in the
+ * second half of the PMD. Order this against clearing of the busy bit in
+ * huge pmd.
+ */
+ smp_wmb();
+ *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
+ return 0;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rlags,
+ unsigned long vflags, int psize, int ssize);
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+ pte_t *ptep, unsigned long trap, unsigned long flags,
+ int ssize, unsigned int shift, unsigned int mmu_psize)
+{
+ real_pte_t rpte;
+ unsigned long vpn;
+ unsigned long old_pte, new_pte;
+ unsigned long rflags, pa;
+ long slot, offset;
+
+ BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+ /* Search the Linux page table for a match with va */
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ /*
+ * At this point, we have a pte (old_pte) which can be used to build
+ * or update an HPTE. There are 2 cases:
+ *
+ * 1. There is a valid (present) pte with no associated HPTE (this is
+ * the most common case)
+ * 2. There is a valid (present) pte with an associated HPTE. The
+ * current values of the pp bits in the HPTE prevent access
+ * because we are doing software DIRTY bit management and the
+ * page is currently not DIRTY.
+ */
+
+
+ do {
+ old_pte = pte_val(*ptep);
+ /* If PTE busy, retry the access */
+ if (unlikely(old_pte & H_PAGE_BUSY))
+ return 0;
+ /* If PTE permissions don't match, take page fault */
+ if (unlikely(!check_pte_access(access, old_pte)))
+ return 1;
+
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
+ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+ if (access & _PAGE_WRITE)
+ new_pte |= _PAGE_DIRTY;
+ } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
+ /* Make sure this is a hugetlb entry */
+ if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+ return 0;
+
+ rflags = htab_convert_pte_flags(new_pte);
+ if (unlikely(mmu_psize == MMU_PAGE_16G))
+ offset = PTRS_PER_PUD;
+ else
+ offset = PTRS_PER_PMD;
+ rpte = __real_pte(__pte(old_pte), ptep, offset);
+
+ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
+ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+ /* Check if pte already has an hpte (case 2) */
+ if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+ /* There MIGHT be an HPTE for this pte */
+ unsigned long gslot;
+
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
+ if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
+ mmu_psize, ssize, flags) == -1)
+ old_pte &= ~_PAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(old_pte & H_PAGE_HASHPTE))) {
+ unsigned long hash = hpt_hash(vpn, shift, ssize);
+
+ pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+ /* clear HPTE slot informations in new PTE */
+ new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+
+ slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
+ mmu_psize, ssize);
+
+ /*
+ * Hypervisor failure. Restore old pte and return -1
+ * similar to __hash_page_*
+ */
+ if (unlikely(slot == -2)) {
+ *ptep = __pte(old_pte);
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ mmu_psize, mmu_psize, old_pte);
+ return -1;
+ }
+
+ new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
+ }
+
+ /*
+ * No need to use ldarx/stdcx here
+ */
+ *ptep = __pte(new_pte & ~H_PAGE_BUSY);
+ return 0;
+}
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long pte_val;
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep,
+ _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+ return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+ if (radix_enabled())
+ return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+ old_pte, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
--- /dev/null
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG_LOW
+
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
+#include <linux/of.h>
+#include <linux/processor.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/trace.h>
+#include <asm/tlb.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/kexec.h>
+#include <asm/ppc-opcode.h>
+#include <asm/feature-fixups.h>
+
+#include <misc/cxl-base.h>
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#ifdef __BIG_ENDIAN__
+#define HPTE_LOCK_BIT 3
+#else
+#define HPTE_LOCK_BIT (56+3)
+#endif
+
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+
+static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
+{
+ unsigned long rb;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+
+ asm volatile("tlbiel %0" : : "r" (rb));
+}
+
+/*
+ * tlbiel instruction for hash, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+ unsigned int r = 0; /* hash format */
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
+ : "memory");
+}
+
+
+static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa206(set, is);
+
+ asm volatile("ptesync": : :"memory");
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and any caching of partition table
+ * entries. Then flush the remaining sets of the TLB. Hash mode uses
+ * partition scoped TLB translations.
+ */
+ tlbiel_hash_set_isa300(0, is, 0, 2, 0);
+ for (set = 1; set < num_sets; set++)
+ tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+ /*
+ * Now invalidate the process table cache.
+ *
+ * From ISA v3.0B p. 1078:
+ * The following forms are invalid.
+ * * PRS=1, R=0, and RIC!=2 (The only process-scoped
+ * HPT caching is of the Process Table.)
+ */
+ tlbiel_hash_set_isa300(0, is, 0, 2, 1);
+
+ asm volatile("ptesync": : :"memory");
+
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+void hash__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+ tlbiel_all_isa206(POWER8_TLB_SETS, is);
+ else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
+ tlbiel_all_isa206(POWER7_TLB_SETS, is);
+ else
+ WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
+}
+
+static inline unsigned long ___tlbie(unsigned long vpn, int psize,
+ int apsize, int ssize)
+{
+ unsigned long va;
+ unsigned int penc;
+ unsigned long sllp;
+
+ /*
+ * We need 14 to 65 bits of va for a tlibe of 4K page
+ * With vpn we ignore the lower VPN_SHIFT bits already.
+ * And top two bits are already ignored because we can
+ * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT
+ * of 12.
+ */
+ va = vpn << VPN_SHIFT;
+ /*
+ * clear top 16 bits of 64bit va, non SLS segment
+ * Older versions of the architecture (2.02 and earler) require the
+ * masking of the top 16 bits.
+ */
+ if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+ va &= ~(0xffffULL << 48);
+
+ switch (psize) {
+ case MMU_PAGE_4K:
+ /* clear out bits after (52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
+ va |= ssize << 8;
+ sllp = get_sllp_encoding(apsize);
+ va |= sllp << 5;
+ asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
+ : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+ : "memory");
+ break;
+ default:
+ /* We need 14 to 14 + i bits of va */
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
+ va |= penc << 12;
+ va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe); /* AVAL */
+ va |= 1; /* L */
+ asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
+ : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
+ : "memory");
+ break;
+ }
+ return va;
+}
+
+static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ /* Need the extra ptesync to ensure we don't reorder tlbie*/
+ asm volatile("ptesync": : :"memory");
+ ___tlbie(vpn, psize, apsize, ssize);
+ }
+}
+
+static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+{
+ unsigned long rb;
+
+ rb = ___tlbie(vpn, psize, apsize, ssize);
+ trace_tlbie(0, 0, rb, 0, 0, 0, 0);
+}
+
+static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
+{
+ unsigned long va;
+ unsigned int penc;
+ unsigned long sllp;
+
+ /* VPN_SHIFT can be atmost 12 */
+ va = vpn << VPN_SHIFT;
+ /*
+ * clear top 16 bits of 64 bit va, non SLS segment
+ * Older versions of the architecture (2.02 and earler) require the
+ * masking of the top 16 bits.
+ */
+ if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
+ va &= ~(0xffffULL << 48);
+
+ switch (psize) {
+ case MMU_PAGE_4K:
+ /* clear out bits after(52) [0....52.....63] */
+ va &= ~((1ul << (64 - 52)) - 1);
+ va |= ssize << 8;
+ sllp = get_sllp_encoding(apsize);
+ va |= sllp << 5;
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
+ : : "r" (va), "i" (CPU_FTR_ARCH_206)
+ : "memory");
+ break;
+ default:
+ /* We need 14 to 14 + i bits of va */
+ penc = mmu_psize_defs[psize].penc[apsize];
+ va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
+ va |= penc << 12;
+ va |= ssize << 8;
+ /*
+ * AVAL bits:
+ * We don't need all the bits, but rest of the bits
+ * must be ignored by the processor.
+ * vpn cover upto 65 bits of va. (0...65) and we need
+ * 58..64 bits of va.
+ */
+ va |= (vpn & 0xfe);
+ va |= 1; /* L */
+ asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1)
+ : : "r" (va), "i" (CPU_FTR_ARCH_206)
+ : "memory");
+ break;
+ }
+ trace_tlbie(0, 1, va, 0, 0, 0, 0);
+
+}
+
+static inline void tlbie(unsigned long vpn, int psize, int apsize,
+ int ssize, int local)
+{
+ unsigned int use_local;
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
+
+ if (use_local)
+ use_local = mmu_psize_defs[psize].tlbiel;
+ if (lock_tlbie && !use_local)
+ raw_spin_lock(&native_tlbie_lock);
+ asm volatile("ptesync": : :"memory");
+ if (use_local) {
+ __tlbiel(vpn, psize, apsize, ssize);
+ asm volatile("ptesync": : :"memory");
+ } else {
+ __tlbie(vpn, psize, apsize, ssize);
+ fixup_tlbie(vpn, psize, apsize, ssize);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ }
+ if (lock_tlbie && !use_local)
+ raw_spin_unlock(&native_tlbie_lock);
+}
+
+static inline void native_lock_hpte(struct hash_pte *hptep)
+{
+ unsigned long *word = (unsigned long *)&hptep->v;
+
+ while (1) {
+ if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
+ break;
+ spin_begin();
+ while(test_bit(HPTE_LOCK_BIT, word))
+ spin_cpu_relax();
+ spin_end();
+ }
+}
+
+static inline void native_unlock_hpte(struct hash_pte *hptep)
+{
+ unsigned long *word = (unsigned long *)&hptep->v;
+
+ clear_bit_unlock(HPTE_LOCK_BIT, word);
+}
+
+static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
+ unsigned long pa, unsigned long rflags,
+ unsigned long vflags, int psize, int apsize, int ssize)
+{
+ struct hash_pte *hptep = htab_address + hpte_group;
+ unsigned long hpte_v, hpte_r;
+ int i;
+
+ if (!(vflags & HPTE_V_BOLTED)) {
+ DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
+ " rflags=%lx, vflags=%lx, psize=%d)\n",
+ hpte_group, vpn, pa, rflags, vflags, psize);
+ }
+
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
+ /* retry with lock held */
+ native_lock_hpte(hptep);
+ if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
+ break;
+ native_unlock_hpte(hptep);
+ }
+
+ hptep++;
+ }
+
+ if (i == HPTES_PER_GROUP)
+ return -1;
+
+ hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+
+ if (!(vflags & HPTE_V_BOLTED)) {
+ DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
+ i, hpte_v, hpte_r);
+ }
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
+ hpte_v = hpte_old_to_new_v(hpte_v);
+ }
+
+ hptep->r = cpu_to_be64(hpte_r);
+ /* Guarantee the second dword is visible before the valid bit */
+ eieio();
+ /*
+ * Now set the first dword including the valid bit
+ * NOTE: this also unlocks the hpte
+ */
+ hptep->v = cpu_to_be64(hpte_v);
+
+ __asm__ __volatile__ ("ptesync" : : : "memory");
+
+ return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static long native_hpte_remove(unsigned long hpte_group)
+{
+ struct hash_pte *hptep;
+ int i;
+ int slot_offset;
+ unsigned long hpte_v;
+
+ DBG_LOW(" remove(group=%lx)\n", hpte_group);
+
+ /* pick a random entry to start at */
+ slot_offset = mftb() & 0x7;
+
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hptep = htab_address + hpte_group + slot_offset;
+ hpte_v = be64_to_cpu(hptep->v);
+
+ if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
+ /* retry with lock held */
+ native_lock_hpte(hptep);
+ hpte_v = be64_to_cpu(hptep->v);
+ if ((hpte_v & HPTE_V_VALID)
+ && !(hpte_v & HPTE_V_BOLTED))
+ break;
+ native_unlock_hpte(hptep);
+ }
+
+ slot_offset++;
+ slot_offset &= 0x7;
+ }
+
+ if (i == HPTES_PER_GROUP)
+ return -1;
+
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->v = 0;
+
+ return i;
+}
+
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+ unsigned long vpn, int bpsize,
+ int apsize, int ssize, unsigned long flags)
+{
+ struct hash_pte *hptep = htab_address + slot;
+ unsigned long hpte_v, want_v;
+ int ret = 0, local = 0;
+
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
+
+ DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
+ vpn, want_v & HPTE_V_AVPN, slot, newpp);
+
+ hpte_v = hpte_get_old_v(hptep);
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
+ DBG_LOW(" -> miss\n");
+ ret = -1;
+ } else {
+ native_lock_hpte(hptep);
+ /* recheck with locks held */
+ hpte_v = hpte_get_old_v(hptep);
+ if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
+ !(hpte_v & HPTE_V_VALID))) {
+ ret = -1;
+ } else {
+ DBG_LOW(" -> hit\n");
+ /* Update the HPTE */
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PPP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PPP | HPTE_R_N |
+ HPTE_R_C)));
+ }
+ native_unlock_hpte(hptep);
+ }
+
+ if (flags & HPTE_LOCAL_UPDATE)
+ local = 1;
+ /*
+ * Ensure it is out of the tlb too if it is not a nohpte fault
+ */
+ if (!(flags & HPTE_NOHPTE_UPDATE))
+ tlbie(vpn, bpsize, apsize, ssize, local);
+
+ return ret;
+}
+
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+{
+ struct hash_pte *hptep;
+ unsigned long hash;
+ unsigned long i;
+ long slot;
+ unsigned long want_v, hpte_v;
+
+ hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+ /* Bolted mappings are only ever in the primary group */
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+ hptep = htab_address + slot;
+ hpte_v = hpte_get_old_v(hptep);
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ /* HPTE matches */
+ return slot;
+ ++slot;
+ }
+
+ return -1;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
+ int psize, int ssize)
+{
+ unsigned long vpn;
+ unsigned long vsid;
+ long slot;
+ struct hash_pte *hptep;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ slot = native_hpte_find(vpn, psize, ssize);
+ if (slot == -1)
+ panic("could not find page to bolt\n");
+ hptep = htab_address + slot;
+
+ /* Update the HPTE */
+ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
+ ~(HPTE_R_PPP | HPTE_R_N)) |
+ (newpp & (HPTE_R_PPP | HPTE_R_N)));
+ /*
+ * Ensure it is out of the tlb too. Bolted entries base and
+ * actual page size will be same.
+ */
+ tlbie(vpn, psize, psize, ssize, 0);
+}
+
+/*
+ * Remove a bolted kernel entry. Memory hotplug uses this.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
+{
+ unsigned long vpn;
+ unsigned long vsid;
+ long slot;
+ struct hash_pte *hptep;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
+
+ slot = native_hpte_find(vpn, psize, ssize);
+ if (slot == -1)
+ return -ENOENT;
+
+ hptep = htab_address + slot;
+
+ VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED));
+
+ /* Invalidate the hpte */
+ hptep->v = 0;
+
+ /* Invalidate the TLB */
+ tlbie(vpn, psize, psize, ssize, 0);
+ return 0;
+}
+
+
+static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
+ int bpsize, int apsize, int ssize, int local)
+{
+ struct hash_pte *hptep = htab_address + slot;
+ unsigned long hpte_v;
+ unsigned long want_v;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
+
+ want_v = hpte_encode_avpn(vpn, bpsize, ssize);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ native_lock_hpte(hptep);
+ /* recheck with locks held */
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ /* Invalidate the hpte. NOTE: this also unlocks it */
+ hptep->v = 0;
+ else
+ native_unlock_hpte(hptep);
+ }
+ /*
+ * We need to invalidate the TLB always because hpte_remove doesn't do
+ * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+ * random entry from it. When we do that we don't invalidate the TLB
+ * (hpte_remove) because we assume the old translation is still
+ * technically "valid".
+ */
+ tlbie(vpn, bpsize, apsize, ssize, local);
+
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void native_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ int i;
+ struct hash_pte *hptep;
+ int actual_psize = MMU_PAGE_16M;
+ unsigned int max_hpte_count, valid;
+ unsigned long flags, s_addr = addr;
+ unsigned long hpte_v, want_v, shift;
+ unsigned long hidx, vpn = 0, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ local_irq_save(flags);
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ hptep = htab_address + slot;
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ hpte_v = hpte_get_old_v(hptep);
+
+ /* Even if we miss, we need to invalidate the TLB */
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /* recheck with locks held */
+ native_lock_hpte(hptep);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
+ /*
+ * Invalidate the hpte. NOTE: this also unlocks it
+ */
+
+ hptep->v = 0;
+ } else
+ native_unlock_hpte(hptep);
+ }
+ /*
+ * We need to do tlb invalidate for all the address, tlbie
+ * instruction compares entry_VA in tlb with the VA specified
+ * here
+ */
+ tlbie(vpn, psize, actual_psize, ssize, local);
+ }
+ local_irq_restore(flags);
+}
+#else
+static void native_hugepage_invalidate(unsigned long vsid,
+ unsigned long addr,
+ unsigned char *hpte_slot_array,
+ int psize, int ssize, int local)
+{
+ WARN(1, "%s called without THP support\n", __func__);
+}
+#endif
+
+static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
+ int *psize, int *apsize, int *ssize, unsigned long *vpn)
+{
+ unsigned long avpn, pteg, vpi;
+ unsigned long hpte_v = be64_to_cpu(hpte->v);
+ unsigned long hpte_r = be64_to_cpu(hpte->r);
+ unsigned long vsid, seg_off;
+ int size, a_size, shift;
+ /* Look at the 8 bit LP value */
+ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
+ hpte_r = hpte_new_to_old_r(hpte_r);
+ }
+ if (!(hpte_v & HPTE_V_LARGE)) {
+ size = MMU_PAGE_4K;
+ a_size = MMU_PAGE_4K;
+ } else {
+ size = hpte_page_sizes[lp] & 0xf;
+ a_size = hpte_page_sizes[lp] >> 4;
+ }
+ /* This works for all page sizes, and for 256M and 1T segments */
+ *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
+ shift = mmu_psize_defs[size].shift;
+
+ avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
+ pteg = slot / HPTES_PER_GROUP;
+ if (hpte_v & HPTE_V_SECONDARY)
+ pteg = ~pteg;
+
+ switch (*ssize) {
+ case MMU_SEGSIZE_256M:
+ /* We only have 28 - 23 bits of seg_off in avpn */
+ seg_off = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (shift < 23) {
+ vpi = (vsid ^ pteg) & htab_hash_mask;
+ seg_off |= vpi << shift;
+ }
+ *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
+ case MMU_SEGSIZE_1T:
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ seg_off = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (shift < 23) {
+ vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
+ seg_off |= vpi << shift;
+ }
+ *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+ break;
+ default:
+ *vpn = size = 0;
+ }
+ *psize = size;
+ *apsize = a_size;
+}
+
+/*
+ * clear all mappings on kexec. All cpus are in real mode (or they will
+ * be when they isi), and we are the only one left. We rely on our kernel
+ * mapping being 0xC0's and the hardware ignoring those two real bits.
+ *
+ * This must be called with interrupts disabled.
+ *
+ * Taking the native_tlbie_lock is unsafe here due to the possibility of
+ * lockdep being on. On pre POWER5 hardware, not taking the lock could
+ * cause deadlock. POWER5 and newer not taking the lock is fine. This only
+ * gets called during boot before secondary CPUs have come up and during
+ * crashdump and all bets are off anyway.
+ *
+ * TODO: add batching support when enabled. remember, no dynamic memory here,
+ * although there is the control page available...
+ */
+static void native_hpte_clear(void)
+{
+ unsigned long vpn = 0;
+ unsigned long slot, slots;
+ struct hash_pte *hptep = htab_address;
+ unsigned long hpte_v;
+ unsigned long pteg_count;
+ int psize, apsize, ssize;
+
+ pteg_count = htab_hash_mask + 1;
+
+ slots = pteg_count * HPTES_PER_GROUP;
+
+ for (slot = 0; slot < slots; slot++, hptep++) {
+ /*
+ * we could lock the pte here, but we are the only cpu
+ * running, right? and for crash dump, we probably
+ * don't want to wait for a maybe bad cpu.
+ */
+ hpte_v = be64_to_cpu(hptep->v);
+
+ /*
+ * Call __tlbie() here rather than tlbie() since we can't take the
+ * native_tlbie_lock.
+ */
+ if (hpte_v & HPTE_V_VALID) {
+ hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
+ hptep->v = 0;
+ ___tlbie(vpn, psize, apsize, ssize);
+ }
+ }
+
+ asm volatile("eieio; tlbsync; ptesync":::"memory");
+}
+
+/*
+ * Batched hash table flush, we batch the tlbie's to avoid taking/releasing
+ * the lock all the time
+ */
+static void native_flush_hash_range(unsigned long number, int local)
+{
+ unsigned long vpn = 0;
+ unsigned long hash, index, hidx, shift, slot;
+ struct hash_pte *hptep;
+ unsigned long hpte_v;
+ unsigned long want_v;
+ unsigned long flags;
+ real_pte_t pte;
+ struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+ unsigned long psize = batch->psize;
+ int ssize = batch->ssize;
+ int i;
+ unsigned int use_local;
+
+ use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) &&
+ mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use();
+
+ local_irq_save(flags);
+
+ for (i = 0; i < number; i++) {
+ vpn = batch->vpn[i];
+ pte = batch->pte[i];
+
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ hash = hpt_hash(vpn, shift, ssize);
+ hidx = __rpte_to_hidx(pte, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ hptep = htab_address + slot;
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ continue;
+ /* lock and try again */
+ native_lock_hpte(hptep);
+ hpte_v = hpte_get_old_v(hptep);
+
+ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+ native_unlock_hpte(hptep);
+ else
+ hptep->v = 0;
+
+ } pte_iterate_hashed_end();
+ }
+
+ if (use_local) {
+ asm volatile("ptesync":::"memory");
+ for (i = 0; i < number; i++) {
+ vpn = batch->vpn[i];
+ pte = batch->pte[i];
+
+ pte_iterate_hashed_subpages(pte, psize,
+ vpn, index, shift) {
+ __tlbiel(vpn, psize, psize, ssize);
+ } pte_iterate_hashed_end();
+ }
+ asm volatile("ptesync":::"memory");
+ } else {
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ raw_spin_lock(&native_tlbie_lock);
+
+ asm volatile("ptesync":::"memory");
+ for (i = 0; i < number; i++) {
+ vpn = batch->vpn[i];
+ pte = batch->pte[i];
+
+ pte_iterate_hashed_subpages(pte, psize,
+ vpn, index, shift) {
+ __tlbie(vpn, psize, psize, ssize);
+ } pte_iterate_hashed_end();
+ }
+ /*
+ * Just do one more with the last used values.
+ */
+ fixup_tlbie(vpn, psize, psize, ssize);
+ asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+ if (lock_tlbie)
+ raw_spin_unlock(&native_tlbie_lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+void __init hpte_init_native(void)
+{
+ mmu_hash_ops.hpte_invalidate = native_hpte_invalidate;
+ mmu_hash_ops.hpte_updatepp = native_hpte_updatepp;
+ mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
+ mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
+ mmu_hash_ops.hpte_insert = native_hpte_insert;
+ mmu_hash_ops.hpte_remove = native_hpte_remove;
+ mmu_hash_ops.hpte_clear_all = native_hpte_clear;
+ mmu_hash_ops.flush_hash_range = native_flush_hash_range;
+ mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate;
+}
--- /dev/null
+/*
+ * Copyright 2005, Paul Mackerras, IBM Corporation.
+ * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/mmu.h>
+#include <asm/tlb.h>
+
+#include <mm/mmu_decl.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
+#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * vmemmap is the starting address of the virtual address space where
+ * struct pages are allocated for all possible PFNs present on the system
+ * including holes and bad memory (hence sparse). These virtual struct
+ * pages are stored in sequence in this virtual address space irrespective
+ * of the fact whether the corresponding PFN is valid or not. This achieves
+ * constant relationship between address of struct page and its PFN.
+ *
+ * During boot or memory hotplug operation when a new memory section is
+ * added, physical memory allocation (including hash table bolting) will
+ * be performed for the set of struct pages which are part of the memory
+ * section. This saves memory by not allocating struct pages for PFNs
+ * which are not valid.
+ *
+ * ----------------------------------------------
+ * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
+ * ----------------------------------------------
+ *
+ * f000000000000000 c000000000000000
+ * vmemmap +--------------+ +--------------+
+ * + | page struct | +--------------> | page struct |
+ * | +--------------+ +--------------+
+ * | | page struct | +--------------> | page struct |
+ * | +--------------+ | +--------------+
+ * | | page struct | + +------> | page struct |
+ * | +--------------+ | +--------------+
+ * | | page struct | | +--> | page struct |
+ * | +--------------+ | | +--------------+
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | | |
+ * | +--------------+ | |
+ * | | page struct | +-------+ |
+ * | +--------------+ |
+ * | | page struct | +-----------+
+ * | +--------------+
+ * | | page struct | No mapping
+ * | +--------------+
+ * | | page struct | No mapping
+ * v +--------------+
+ *
+ * -----------------------------------------
+ * | RELATION BETWEEN STRUCT PAGES AND PFNS|
+ * -----------------------------------------
+ *
+ * vmemmap +--------------+ +---------------+
+ * + | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | |
+ * | +--------------+
+ * | | |
+ * | +--------------+
+ * | | |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | |
+ * | +--------------+
+ * | | |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * | +--------------+ +---------------+
+ * | | page struct | +-------------> | PFN |
+ * v +--------------+ +---------------+
+ */
+/*
+ * On hash-based CPUs, the vmemmap is bolted in the hash table.
+ *
+ */
+int __meminit hash__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ int rc;
+
+ if ((start + page_size) >= H_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ rc = htab_bolt_mapping(start, start + page_size, phys,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize, mmu_kernel_ssize);
+ if (rc < 0) {
+ int rc2 = htab_remove_mapping(start, start + page_size,
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON(rc2 && (rc2 != -ENOENT));
+ }
+ return rc;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void hash__vmemmap_remove_mapping(unsigned long start,
+ unsigned long page_size)
+{
+ int rc = htab_remove_mapping(start, start + page_size,
+ mmu_vmemmap_psize,
+ mmu_kernel_ssize);
+ BUG_ON((rc < 0) && (rc != -ENOENT));
+ WARN_ON(rc == -ENOENT);
+}
+#endif
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * map_kernel_page currently only called by __ioremap
+ * map_kernel_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
+{
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
+ if (slab_is_available()) {
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
+ } else {
+ /*
+ * If the mm subsystem is not fully up, we cannot create a
+ * linux page table entry for this mapping. Simply bolt an
+ * entry in the hardware page table.
+ *
+ */
+ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
+ mmu_io_psize, mmu_kernel_ssize)) {
+ printk(KERN_ERR "Failed to do bolted mapping IO "
+ "memory at %016lx !\n", pa);
+ return -ENOMEM;
+ }
+ }
+
+ smp_wmb();
+ return 0;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ __be64 old_be, tmp;
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+ __asm__ __volatile__(
+ "1: ldarx %0,0,%3\n\
+ and. %1,%0,%6\n\
+ bne- 1b \n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%7\n\
+ stdcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
+ : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
+ "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
+ : "cc" );
+
+ old = be64_to_cpu(old_be);
+
+ trace_hugepage_update(addr, old, clr, set);
+ if (old & H_PAGE_HASHPTE)
+ hpte_do_hugepage_flush(mm, addr, pmdp, old);
+ return old;
+}
+
+pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+ VM_BUG_ON(pmd_devmap(*pmdp));
+
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ serialize_against_pte_lookup(vma->vm_mm);
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_sem and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+ return pmd;
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+ /*
+ * we store the pgtable in the second half of PMD
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ *pgtable_slot = pgtable;
+ /*
+ * expose the deposited pgtable to other cpus.
+ * before we set the hugepage PTE at pmd level
+ * hash fault code looks at the deposted pgtable
+ * to store hash index values.
+ */
+ smp_wmb();
+}
+
+pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+ pgtable_t *pgtable_slot;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Once we withdraw, mark the entry NULL.
+ */
+ *pgtable_slot = NULL;
+ /*
+ * We store HPTE information in the deposited PTE fragment.
+ * zero out the content on withdraw.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ return pgtable;
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long old_pmd)
+{
+ int ssize;
+ unsigned int psize;
+ unsigned long vsid;
+ unsigned long flags = 0;
+
+ /* get the base page size,vsid and segment size */
+#ifdef CONFIG_DEBUG_VM
+ psize = get_slice_psize(mm, addr);
+ BUG_ON(psize == MMU_PAGE_16M);
+#endif
+ if (old_pmd & H_PAGE_COMBO)
+ psize = MMU_PAGE_4K;
+ else
+ psize = MMU_PAGE_64K;
+
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_user_vsid(&mm->context, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ if (mm_is_thread_local(mm))
+ flags |= HPTE_LOCAL_UPDATE;
+
+ return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
+}
+
+pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ pgtable_t pgtable;
+ unsigned long old;
+ pgtable_t *pgtable_slot;
+
+ old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * We have pmd == none and we are holding page_table_lock.
+ * So we can safely go and clear the pgtable hash
+ * index info.
+ */
+ pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+ pgtable = *pgtable_slot;
+ /*
+ * Let's zero out old valid and hash index details
+ * hash fault look at them.
+ */
+ memset(pgtable, 0, PTE_FRAG_SIZE);
+ /*
+ * Serialize against find_current_mm_pte variants which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_curren_mm_pte to finish.
+ */
+ serialize_against_pte_lookup(mm);
+ return old_pmd;
+}
+
+int hash__has_transparent_hugepage(void)
+{
+
+ if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+ return 0;
+ /*
+ * We support THP only if PMD_SIZE is 16MB.
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+ return 0;
+ /*
+ * We need to make sure that we support 16MB hugepage in a segement
+ * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+ * of 64K.
+ */
+ /*
+ * If we have 64K HPTE, we will be using that by default
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+ (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+ return 0;
+ /*
+ * Ok we only have 4K HPTE
+ */
+ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+static bool hash__change_memory_range(unsigned long start, unsigned long end,
+ unsigned long newpp)
+{
+ unsigned long idx;
+ unsigned int step, shift;
+
+ shift = mmu_psize_defs[mmu_linear_psize].shift;
+ step = 1 << shift;
+
+ start = ALIGN_DOWN(start, step);
+ end = ALIGN(end, step); // aligns up
+
+ if (start >= end)
+ return false;
+
+ pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
+ start, end, newpp, step);
+
+ for (idx = start; idx < end; idx += step)
+ /* Not sure if we can do much with the return value */
+ mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
+ mmu_kernel_ssize);
+
+ return true;
+}
+
+void hash__mark_rodata_ro(void)
+{
+ unsigned long start, end;
+
+ start = (unsigned long)_stext;
+ end = (unsigned long)__init_begin;
+
+ WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
+}
+
+void hash__mark_initmem_nx(void)
+{
+ unsigned long start, end, pp;
+
+ start = (unsigned long)__init_begin;
+ end = (unsigned long)__init_end;
+
+ pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+
+ WARN_ON(!hash__change_memory_range(start, end, pp));
+}
+#endif
--- /dev/null
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ * Derived from arch/ppc64/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Dave Engebretsen <engebret@us.ibm.com>
+ * Rework for PPC64 port.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/bug.h>
+#include <asm/pte-walk.h>
+
+
+#include <trace/events/thp.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/*
+ * A linux PTE was changed and the corresponding hash table entry
+ * neesd to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ */
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long pte, int huge)
+{
+ unsigned long vpn;
+ struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
+ unsigned long vsid;
+ unsigned int psize;
+ int ssize;
+ real_pte_t rpte;
+ int i, offset;
+
+ i = batch->index;
+
+ /*
+ * Get page size (maybe move back to caller).
+ *
+ * NOTE: when using special 64K mappings in 4K environment like
+ * for SPEs, we obtain the page size from the slice, which thus
+ * must still exist (and thus the VMA not reused) at the time
+ * of this call
+ */
+ if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+ psize = get_slice_psize(mm, addr);
+ /* Mask the address for the correct page size */
+ addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+ if (unlikely(psize == MMU_PAGE_16G))
+ offset = PTRS_PER_PUD;
+ else
+ offset = PTRS_PER_PMD;
+#else
+ BUG();
+ psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
+#endif
+ } else {
+ psize = pte_pagesize_index(mm, addr, pte);
+ /*
+ * Mask the address for the standard page size. If we
+ * have a 64k page kernel, but the hardware does not
+ * support 64k pages, this might be different from the
+ * hardware page size encoded in the slice table.
+ */
+ addr &= PAGE_MASK;
+ offset = PTRS_PER_PTE;
+ }
+
+
+ /* Build full vaddr */
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_user_vsid(&mm->context, addr, ssize);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+ WARN_ON(vsid == 0);
+ vpn = hpt_vpn(addr, vsid, ssize);
+ rpte = __real_pte(__pte(pte), ptep, offset);
+
+ /*
+ * Check if we have an active batch on this CPU. If not, just
+ * flush now and return.
+ */
+ if (!batch->active) {
+ flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
+ put_cpu_var(ppc64_tlb_batch);
+ return;
+ }
+
+ /*
+ * This can happen when we are in the middle of a TLB batch and
+ * we encounter memory pressure (eg copy_page_range when it tries
+ * to allocate a new pte). If we have to reclaim memory and end
+ * up scanning and resetting referenced bits then our batch context
+ * will change mid stream.
+ *
+ * We also need to ensure only one page size is present in a given
+ * batch
+ */
+ if (i != 0 && (mm != batch->mm || batch->psize != psize ||
+ batch->ssize != ssize)) {
+ __flush_tlb_pending(batch);
+ i = 0;
+ }
+ if (i == 0) {
+ batch->mm = mm;
+ batch->psize = psize;
+ batch->ssize = ssize;
+ }
+ batch->pte[i] = rpte;
+ batch->vpn[i] = vpn;
+ batch->index = ++i;
+ if (i >= PPC64_TLB_BATCH_NR)
+ __flush_tlb_pending(batch);
+ put_cpu_var(ppc64_tlb_batch);
+}
+
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+ int i, local;
+
+ i = batch->index;
+ local = mm_is_thread_local(batch->mm);
+ if (i == 1)
+ flush_hash_page(batch->vpn[0], batch->pte[0],
+ batch->psize, batch->ssize, local);
+ else
+ flush_hash_range(i, local);
+ batch->index = 0;
+}
+
+void hash__tlb_flush(struct mmu_gather *tlb)
+{
+ struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
+
+ /*
+ * If there's a TLB batch pending, then we must flush it because the
+ * pages are going to be freed and we really don't want to have a CPU
+ * access a freed page because it has a stale TLB
+ */
+ if (tlbbatch->index)
+ __flush_tlb_pending(tlbbatch);
+
+ put_cpu_var(ppc64_tlb_batch);
+}
+
+/**
+ * __flush_hash_table_range - Flush all HPTEs for a given address range
+ * from the hash table (and the TLB). But keeps
+ * the linux PTEs intact.
+ *
+ * @mm : mm_struct of the target address space (generally init_mm)
+ * @start : starting address
+ * @end : ending address (not included in the flush)
+ *
+ * This function is mostly to be used by some IO hotplug code in order
+ * to remove all hash entries from a given address range used to map IO
+ * space on a removed PCI-PCI bidge without tearing down the full mapping
+ * since 64K pages may overlap with other bridges when using 64K pages
+ * with 4K HW pages on IO space.
+ *
+ * Because of that usage pattern, it is implemented for small size rather
+ * than speed.
+ */
+void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ bool is_thp;
+ int hugepage_shift;
+ unsigned long flags;
+
+ start = _ALIGN_DOWN(start, PAGE_SIZE);
+ end = _ALIGN_UP(end, PAGE_SIZE);
+
+ BUG_ON(!mm->pgd);
+
+ /*
+ * Note: Normally, we should only ever use a batch within a
+ * PTE locked section. This violates the rule, but will work
+ * since we don't actually modify the PTEs, we just flush the
+ * hash while leaving the PTEs intact (including their reference
+ * to being hashed). This is not the most performance oriented
+ * way to do things but is fine for our needs here.
+ */
+ local_irq_save(flags);
+ arch_enter_lazy_mmu_mode();
+ for (; start < end; start += PAGE_SIZE) {
+ pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
+ &hugepage_shift);
+ unsigned long pte;
+
+ if (ptep == NULL)
+ continue;
+ pte = pte_val(*ptep);
+ if (is_thp)
+ trace_hugepage_invalidate(start, pte);
+ if (!(pte & H_PAGE_HASHPTE))
+ continue;
+ if (unlikely(is_thp))
+ hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
+ else
+ hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+ }
+ arch_leave_lazy_mmu_mode();
+ local_irq_restore(flags);
+}
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+ pte_t *start_pte;
+ unsigned long flags;
+
+ addr = _ALIGN_DOWN(addr, PMD_SIZE);
+ /*
+ * Note: Normally, we should only ever use a batch within a
+ * PTE locked section. This violates the rule, but will work
+ * since we don't actually modify the PTEs, we just flush the
+ * hash while leaving the PTEs intact (including their reference
+ * to being hashed). This is not the most performance oriented
+ * way to do things but is fine for our needs here.
+ */
+ local_irq_save(flags);
+ arch_enter_lazy_mmu_mode();
+ start_pte = pte_offset_map(pmd, addr);
+ for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+ unsigned long pteval = pte_val(*pte);
+ if (pteval & H_PAGE_HASHPTE)
+ hpte_need_flush(mm, addr, pte, pteval, 0);
+ addr += PAGE_SIZE;
+ }
+ arch_leave_lazy_mmu_mode();
+ local_irq_restore(flags);
+}
--- /dev/null
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ * {mikejc|engebret}@us.ibm.com
+ *
+ * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * Module name: htab.c
+ *
+ * Description:
+ * PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+#undef DEBUG_LOW
+
+#define pr_fmt(fmt) "hash-mmu: " fmt
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/export.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/memblock.h>
+#include <linux/context_tracking.h>
+#include <linux/libfdt.h>
+#include <linux/pkeys.h>
+
+#include <asm/debugfs.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/copro.h>
+#include <asm/udbg.h>
+#include <asm/code-patching.h>
+#include <asm/fadump.h>
+#include <asm/firmware.h>
+#include <asm/tm.h>
+#include <asm/trace.h>
+#include <asm/ps3.h>
+#include <asm/pte-walk.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define KB (1024)
+#define MB (1024*KB)
+#define GB (1024L*MB)
+
+/*
+ * Note: pte --> Linux PTE
+ * HPTE --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ * htab_initialize is called with the MMU off (of course), but
+ * the kernel has been copied down to zero so it can directly
+ * reference global data. At this point it is very difficult
+ * to print debug info.
+ *
+ */
+
+static unsigned long _SDR1;
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
+struct hash_pte *htab_address;
+unsigned long htab_size_bytes;
+unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
+int mmu_linear_psize = MMU_PAGE_4K;
+EXPORT_SYMBOL_GPL(mmu_linear_psize);
+int mmu_virtual_psize = MMU_PAGE_4K;
+int mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
+int mmu_io_psize = MMU_PAGE_4K;
+int mmu_kernel_ssize = MMU_SEGSIZE_256M;
+EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
+int mmu_highuser_ssize = MMU_SEGSIZE_256M;
+u16 mmu_slb_size = 64;
+EXPORT_SYMBOL_GPL(mmu_slb_size);
+#ifdef CONFIG_PPC_64K_PAGES
+int mmu_ci_restrictions;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_SPINLOCK(linear_map_hash_lock);
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+struct mmu_hash_ops mmu_hash_ops;
+EXPORT_SYMBOL(mmu_hash_ops);
+
+/*
+ * These are definitions of page sizes arrays to be used when none
+ * is provided by the firmware.
+ */
+
+/*
+ * Fallback (4k pages only)
+ */
+static struct mmu_psize_def mmu_psize_defaults[] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .sllp = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
+ .avpnm = 0,
+ .tlbiel = 0,
+ },
+};
+
+/*
+ * POWER4, GPUL, POWER5
+ *
+ * Support for 16Mb large pages
+ */
+static struct mmu_psize_def mmu_psize_defaults_gp[] = {
+ [MMU_PAGE_4K] = {
+ .shift = 12,
+ .sllp = 0,
+ .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
+ .avpnm = 0,
+ .tlbiel = 1,
+ },
+ [MMU_PAGE_16M] = {
+ .shift = 24,
+ .sllp = SLB_VSID_L,
+ .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
+ [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
+ .avpnm = 0x1UL,
+ .tlbiel = 0,
+ },
+};
+
+/*
+ * 'R' and 'C' update notes:
+ * - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
+ * create writeable HPTEs without C set, because the hcall H_PROTECT
+ * that we use in that case will not update C
+ * - The above is however not a problem, because we also don't do that
+ * fancy "no flush" variant of eviction and we use H_REMOVE which will
+ * do the right thing and thus we don't have the race I described earlier
+ *
+ * - Under bare metal, we do have the race, so we need R and C set
+ * - We make sure R is always set and never lost
+ * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
+ */
+unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+ unsigned long rflags = 0;
+
+ /* _PAGE_EXEC -> NOEXEC */
+ if ((pteflags & _PAGE_EXEC) == 0)
+ rflags |= HPTE_R_N;
+ /*
+ * PPP bits:
+ * Linux uses slb key 0 for kernel and 1 for user.
+ * kernel RW areas are mapped with PPP=0b000
+ * User area is mapped with PPP=0b010 for read/write
+ * or PPP=0b011 for read-only (including writeable but clean pages).
+ */
+ if (pteflags & _PAGE_PRIVILEGED) {
+ /*
+ * Kernel read only mapped with ppp bits 0b110
+ */
+ if (!(pteflags & _PAGE_WRITE)) {
+ if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+ rflags |= (HPTE_R_PP0 | 0x2);
+ else
+ rflags |= 0x3;
+ }
+ } else {
+ if (pteflags & _PAGE_RWX)
+ rflags |= 0x2;
+ if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
+ rflags |= 0x1;
+ }
+ /*
+ * We can't allow hardware to update hpte bits. Hence always
+ * set 'R' bit and set 'C' if it is a write fault
+ */
+ rflags |= HPTE_R_R;
+
+ if (pteflags & _PAGE_DIRTY)
+ rflags |= HPTE_R_C;
+ /*
+ * Add in WIG bits
+ */
+
+ if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
+ rflags |= HPTE_R_I;
+ else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
+ rflags |= (HPTE_R_I | HPTE_R_G);
+ else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
+ rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
+ else
+ /*
+ * Add memory coherence if cache inhibited is not set
+ */
+ rflags |= HPTE_R_M;
+
+ rflags |= pte_to_hpte_pkey_bits(pteflags);
+ return rflags;
+}
+
+int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+ unsigned long pstart, unsigned long prot,
+ int psize, int ssize)
+{
+ unsigned long vaddr, paddr;
+ unsigned int step, shift;
+ int ret = 0;
+
+ shift = mmu_psize_defs[psize].shift;
+ step = 1 << shift;
+
+ prot = htab_convert_pte_flags(prot);
+
+ DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+ vstart, vend, pstart, prot, psize, ssize);
+
+ for (vaddr = vstart, paddr = pstart; vaddr < vend;
+ vaddr += step, paddr += step) {
+ unsigned long hash, hpteg;
+ unsigned long vsid = get_kernel_vsid(vaddr, ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
+ unsigned long tprot = prot;
+
+ /*
+ * If we hit a bad address return error.
+ */
+ if (!vsid)
+ return -1;
+ /* Make kernel text executable */
+ if (overlaps_kernel_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /* Make kvm guest trampolines executable */
+ if (overlaps_kvm_tmp(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ /*
+ * If relocatable, check if it overlaps interrupt vectors that
+ * are copied down to real 0. For relocatable kernel
+ * (e.g. kdump case) we copy interrupt vectors down to real
+ * address 0. Mark that region as executable. This is
+ * because on p8 system with relocation on exception feature
+ * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
+ * in order to execute the interrupt handlers in virtual
+ * mode the vector region need to be marked as executable.
+ */
+ if ((PHYSICAL_START > MEMORY_START) &&
+ overlaps_interrupt_vector_text(vaddr, vaddr + step))
+ tprot &= ~HPTE_R_N;
+
+ hash = hpt_hash(vpn, shift, ssize);
+ hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+ BUG_ON(!mmu_hash_ops.hpte_insert);
+ ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
+ HPTE_V_BOLTED, psize, psize,
+ ssize);
+
+ if (ret < 0)
+ break;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (debug_pagealloc_enabled() &&
+ (paddr >> PAGE_SHIFT) < linear_map_hash_count)
+ linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+ }
+ return ret < 0 ? ret : 0;
+}
+
+int htab_remove_mapping(unsigned long vstart, unsigned long vend,
+ int psize, int ssize)
+{
+ unsigned long vaddr;
+ unsigned int step, shift;
+ int rc;
+ int ret = 0;
+
+ shift = mmu_psize_defs[psize].shift;
+ step = 1 << shift;
+
+ if (!mmu_hash_ops.hpte_removebolted)
+ return -ENODEV;
+
+ for (vaddr = vstart; vaddr < vend; vaddr += step) {
+ rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
+ if (rc == -ENOENT) {
+ ret = -ENOENT;
+ continue;
+ }
+ if (rc < 0)
+ return rc;
+ }
+
+ return ret;
+}
+
+static bool disable_1tb_segments = false;
+
+static int __init parse_disable_1tb_segments(char *p)
+{
+ disable_1tb_segments = true;
+ return 0;
+}
+early_param("disable_1tb_segments", parse_disable_1tb_segments);
+
+static int __init htab_dt_scan_seg_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
+ if (prop == NULL)
+ return 0;
+ for (; size >= 4; size -= 4, ++prop) {
+ if (be32_to_cpu(prop[0]) == 40) {
+ DBG("1T segment support detected\n");
+
+ if (disable_1tb_segments) {
+ DBG("1T segments disabled by command line\n");
+ break;
+ }
+
+ cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
+ return 1;
+ }
+ }
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+ return 0;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x14:
+ idx = MMU_PAGE_1M;
+ break;
+ case 0x18:
+ idx = MMU_PAGE_16M;
+ break;
+ case 0x22:
+ idx = MMU_PAGE_16G;
+ break;
+ }
+ return idx;
+}
+
+static int __init htab_dt_scan_page_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int size = 0;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
+ if (!prop)
+ return 0;
+
+ pr_info("Page sizes from device-tree:\n");
+ size /= 4;
+ cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
+ while(size > 0) {
+ unsigned int base_shift = be32_to_cpu(prop[0]);
+ unsigned int slbenc = be32_to_cpu(prop[1]);
+ unsigned int lpnum = be32_to_cpu(prop[2]);
+ struct mmu_psize_def *def;
+ int idx, base_idx;
+
+ size -= 3; prop += 3;
+ base_idx = get_idx_from_shift(base_shift);
+ if (base_idx < 0) {
+ /* skip the pte encoding also */
+ prop += lpnum * 2; size -= lpnum * 2;
+ continue;
+ }
+ def = &mmu_psize_defs[base_idx];
+ if (base_idx == MMU_PAGE_16M)
+ cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
+
+ def->shift = base_shift;
+ if (base_shift <= 23)
+ def->avpnm = 0;
+ else
+ def->avpnm = (1 << (base_shift - 23)) - 1;
+ def->sllp = slbenc;
+ /*
+ * We don't know for sure what's up with tlbiel, so
+ * for now we only set it for 4K and 64K pages
+ */
+ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
+ def->tlbiel = 1;
+ else
+ def->tlbiel = 0;
+
+ while (size > 0 && lpnum) {
+ unsigned int shift = be32_to_cpu(prop[0]);
+ int penc = be32_to_cpu(prop[1]);
+
+ prop += 2; size -= 2;
+ lpnum--;
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ if (penc == -1)
+ pr_err("Invalid penc for base_shift=%d "
+ "shift=%d\n", base_shift, shift);
+
+ def->penc[idx] = penc;
+ pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
+ " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
+ base_shift, shift, def->sllp,
+ def->avpnm, def->tlbiel, def->penc[idx]);
+ }
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+ const char *uname, int depth,
+ void *data) {
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be64 *addr_prop;
+ const __be32 *page_count_prop;
+ unsigned int expected_pages;
+ long unsigned int phys_addr;
+ long unsigned int block_size;
+
+ /* We are scanning "memory" nodes only */
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /*
+ * This property is the log base 2 of the number of virtual pages that
+ * will represent this memory block.
+ */
+ page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+ if (page_count_prop == NULL)
+ return 0;
+ expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
+ addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+ if (addr_prop == NULL)
+ return 0;
+ phys_addr = be64_to_cpu(addr_prop[0]);
+ block_size = be64_to_cpu(addr_prop[1]);
+ if (block_size != (16 * GB))
+ return 0;
+ printk(KERN_INFO "Huge page(16GB) memory: "
+ "addr = 0x%lX size = 0x%lX pages = %d\n",
+ phys_addr, block_size, expected_pages);
+ if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
+ memblock_reserve(phys_addr, block_size * expected_pages);
+ pseries_add_gpage(phys_addr, block_size, expected_pages);
+ }
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static void mmu_psize_set_default_penc(void)
+{
+ int bpsize, apsize;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
+ mmu_psize_defs[bpsize].penc[apsize] = -1;
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+
+static bool might_have_hea(void)
+{
+ /*
+ * The HEA ethernet adapter requires awareness of the
+ * GX bus. Without that awareness we can easily assume
+ * we will never see an HEA ethernet device.
+ */
+#ifdef CONFIG_IBMEBUS
+ return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ firmware_has_feature(FW_FEATURE_SPLPAR);
+#else
+ return false;
+#endif
+}
+
+#endif /* #ifdef CONFIG_PPC_64K_PAGES */
+
+static void __init htab_scan_page_sizes(void)
+{
+ int rc;
+
+ /* se the invalid penc to -1 */
+ mmu_psize_set_default_penc();
+
+ /* Default to 4K pages only */
+ memcpy(mmu_psize_defs, mmu_psize_defaults,
+ sizeof(mmu_psize_defaults));
+
+ /*
+ * Try to find the available page sizes in the device-tree
+ */
+ rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
+ if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
+ /*
+ * Nothing in the device-tree, but the CPU supports 16M pages,
+ * so let's fallback on a known size list for 16M capable CPUs.
+ */
+ memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
+ sizeof(mmu_psize_defaults_gp));
+ }
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (!hugetlb_disabled) {
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+ }
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations. Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE. For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+ long int ap, bp;
+ long int shift, penc;
+
+ for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+ if (!mmu_psize_defs[bp].shift)
+ continue; /* not a supported page size */
+ for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+ penc = mmu_psize_defs[bp].penc[ap];
+ if (penc == -1 || !mmu_psize_defs[ap].shift)
+ continue;
+ shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+ if (shift <= 0)
+ continue; /* should never happen */
+ /*
+ * For page sizes less than 1MB, this loop
+ * replicates the entry for all possible values
+ * of the rrrr bits.
+ */
+ while (penc < (1 << LP_BITS)) {
+ hpte_page_sizes[penc] = (ap << 4) | bp;
+ penc += 1 << shift;
+ }
+ }
+ }
+}
+
+static void __init htab_init_page_sizes(void)
+{
+ init_hpte_page_sizes();
+
+ if (!debug_pagealloc_enabled()) {
+ /*
+ * Pick a size for the linear mapping. Currently, we only
+ * support 16M, 1M and 4K which is the default
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ mmu_linear_psize = MMU_PAGE_16M;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ mmu_linear_psize = MMU_PAGE_1M;
+ }
+
+#ifdef CONFIG_PPC_64K_PAGES
+ /*
+ * Pick a size for the ordinary pages. Default is 4K, we support
+ * 64K for user mappings and vmalloc if supported by the processor.
+ * We only use 64k for ioremap if the processor
+ * (and firmware) support cache-inhibited large pages.
+ * If not, we use 4k and set mmu_ci_restrictions so that
+ * hash_page knows to switch processes that use cache-inhibited
+ * mappings to 4k pages.
+ */
+ if (mmu_psize_defs[MMU_PAGE_64K].shift) {
+ mmu_virtual_psize = MMU_PAGE_64K;
+ mmu_vmalloc_psize = MMU_PAGE_64K;
+ if (mmu_linear_psize == MMU_PAGE_4K)
+ mmu_linear_psize = MMU_PAGE_64K;
+ if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
+ /*
+ * When running on pSeries using 64k pages for ioremap
+ * would stop us accessing the HEA ethernet. So if we
+ * have the chance of ever seeing one, stay at 4k.
+ */
+ if (!might_have_hea())
+ mmu_io_psize = MMU_PAGE_64K;
+ } else
+ mmu_ci_restrictions = 1;
+ }
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ /*
+ * We try to use 16M pages for vmemmap if that is supported
+ * and we have at least 1G of RAM at boot
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift &&
+ memblock_phys_mem_size() >= 0x40000000)
+ mmu_vmemmap_psize = MMU_PAGE_16M;
+ else if (mmu_psize_defs[MMU_PAGE_64K].shift)
+ mmu_vmemmap_psize = MMU_PAGE_64K;
+ else
+ mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+ printk(KERN_DEBUG "Page orders: linear mapping = %d, "
+ "virtual = %d, io = %d"
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ ", vmemmap = %d"
+#endif
+ "\n",
+ mmu_psize_defs[mmu_linear_psize].shift,
+ mmu_psize_defs[mmu_virtual_psize].shift,
+ mmu_psize_defs[mmu_io_psize].shift
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ ,mmu_psize_defs[mmu_vmemmap_psize].shift
+#endif
+ );
+}
+
+static int __init htab_dt_scan_pftsize(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+ if (prop != NULL) {
+ /* pft_size[0] is the NUMA CEC cookie */
+ ppc64_pft_size = be32_to_cpu(prop[1]);
+ return 1;
+ }
+ return 0;
+}
+
+unsigned htab_shift_for_mem_size(unsigned long mem_size)
+{
+ unsigned memshift = __ilog2(mem_size);
+ unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
+ unsigned pteg_shift;
+
+ /* round mem_size up to next power of 2 */
+ if ((1UL << memshift) < mem_size)
+ memshift += 1;
+
+ /* aim for 2 pages / pteg */
+ pteg_shift = memshift - (pshift + 1);
+
+ /*
+ * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab
+ * size permitted by the architecture.
+ */
+ return max(pteg_shift + 7, 18U);
+}
+
+static unsigned long __init htab_get_table_size(void)
+{
+ /*
+ * If hash size isn't already provided by the platform, we try to
+ * retrieve it from the device-tree. If it's not there neither, we
+ * calculate it now based on the total RAM size
+ */
+ if (ppc64_pft_size == 0)
+ of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
+ if (ppc64_pft_size)
+ return 1UL << ppc64_pft_size;
+
+ return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int resize_hpt_for_hotplug(unsigned long new_mem_size)
+{
+ unsigned target_hpt_shift;
+
+ if (!mmu_hash_ops.resize_hpt)
+ return 0;
+
+ target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
+
+ /*
+ * To avoid lots of HPT resizes if memory size is fluctuating
+ * across a boundary, we deliberately have some hysterisis
+ * here: we immediately increase the HPT size if the target
+ * shift exceeds the current shift, but we won't attempt to
+ * reduce unless the target shift is at least 2 below the
+ * current shift
+ */
+ if (target_hpt_shift > ppc64_pft_size ||
+ target_hpt_shift < ppc64_pft_size - 1)
+ return mmu_hash_ops.resize_hpt(target_hpt_shift);
+
+ return 0;
+}
+
+int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+ int rc;
+
+ if (end >= H_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ rc = htab_bolt_mapping(start, end, __pa(start),
+ pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ mmu_kernel_ssize);
+
+ if (rc < 0) {
+ int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
+ mmu_kernel_ssize);
+ BUG_ON(rc2 && (rc2 != -ENOENT));
+ }
+ return rc;
+}
+
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
+{
+ int rc = htab_remove_mapping(start, end, mmu_linear_psize,
+ mmu_kernel_ssize);
+ WARN_ON(rc < 0);
+ return rc;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static void __init hash_init_partition_table(phys_addr_t hash_table,
+ unsigned long htab_size)
+{
+ mmu_partition_table_init();
+
+ /*
+ * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
+ * For now, UPRT is 0 and we have no segment table.
+ */
+ htab_size = __ilog2(htab_size) - 18;
+ mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
+ pr_info("Partition table %p\n", partition_tb);
+}
+
+static void __init htab_initialize(void)
+{
+ unsigned long table;
+ unsigned long pteg_count;
+ unsigned long prot;
+ unsigned long base = 0, size = 0;
+ struct memblock_region *reg;
+
+ DBG(" -> htab_initialize()\n");
+
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+ mmu_kernel_ssize = MMU_SEGSIZE_1T;
+ mmu_highuser_ssize = MMU_SEGSIZE_1T;
+ printk(KERN_INFO "Using 1TB segments\n");
+ }
+
+ /*
+ * Calculate the required size of the htab. We want the number of
+ * PTEGs to equal one half the number of real pages.
+ */
+ htab_size_bytes = htab_get_table_size();
+ pteg_count = htab_size_bytes >> 7;
+
+ htab_hash_mask = pteg_count - 1;
+
+ if (firmware_has_feature(FW_FEATURE_LPAR) ||
+ firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+ /* Using a hypervisor which owns the htab */
+ htab_address = NULL;
+ _SDR1 = 0;
+ /*
+ * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
+ * to inform the hypervisor that we wish to use the HPT.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ register_process_table(0, 0, 0);
+#ifdef CONFIG_FA_DUMP
+ /*
+ * If firmware assisted dump is active firmware preserves
+ * the contents of htab along with entire partition memory.
+ * Clear the htab if firmware assisted dump is active so
+ * that we dont end up using old mappings.
+ */
+ if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
+ mmu_hash_ops.hpte_clear_all();
+#endif
+ } else {
+ unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE;
+
+#ifdef CONFIG_PPC_CELL
+ /*
+ * Cell may require the hash table down low when using the
+ * Axon IOMMU in order to fit the dynamic region over it, see
+ * comments in cell/iommu.c
+ */
+ if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) {
+ limit = 0x80000000;
+ pr_info("Hash table forced below 2G for Axon IOMMU\n");
+ }
+#endif /* CONFIG_PPC_CELL */
+
+ table = memblock_phys_alloc_range(htab_size_bytes,
+ htab_size_bytes,
+ 0, limit);
+ if (!table)
+ panic("ERROR: Failed to allocate %pa bytes below %pa\n",
+ &htab_size_bytes, &limit);
+
+ DBG("Hash table allocated at %lx, size: %lx\n", table,
+ htab_size_bytes);
+
+ htab_address = __va(table);
+
+ /* htab absolute addr + encoded htabsize */
+ _SDR1 = table + __ilog2(htab_size_bytes) - 18;
+
+ /* Initialize the HPT with no entries */
+ memset((void *)table, 0, htab_size_bytes);
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ /* Set SDR1 */
+ mtspr(SPRN_SDR1, _SDR1);
+ else
+ hash_init_partition_table(table, htab_size_bytes);
+ }
+
+ prot = pgprot_val(PAGE_KERNEL);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (debug_pagealloc_enabled()) {
+ linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ linear_map_hash_slots = memblock_alloc_try_nid(
+ linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!linear_map_hash_slots)
+ panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
+ __func__, linear_map_hash_count, &ppc64_rma_size);
+ }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+ /* create bolted the linear mapping in the hash table */
+ for_each_memblock(memory, reg) {
+ base = (unsigned long)__va(reg->base);
+ size = reg->size;
+
+ DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
+ base, size, prot);
+
+ if ((base + size) >= H_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ continue;
+ }
+
+ BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
+ prot, mmu_linear_psize, mmu_kernel_ssize));
+ }
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+ /*
+ * If we have a memory_limit and we've allocated TCEs then we need to
+ * explicitly map the TCE area at the top of RAM. We also cope with the
+ * case that the TCEs start below memory_limit.
+ * tce_alloc_start/end are 16MB aligned so the mapping should work
+ * for either 4K or 16MB pages.
+ */
+ if (tce_alloc_start) {
+ tce_alloc_start = (unsigned long)__va(tce_alloc_start);
+ tce_alloc_end = (unsigned long)__va(tce_alloc_end);
+
+ if (base + size >= tce_alloc_start)
+ tce_alloc_start = base + size + 1;
+
+ BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
+ __pa(tce_alloc_start), prot,
+ mmu_linear_psize, mmu_kernel_ssize));
+ }
+
+
+ DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+
+void __init hash__early_init_devtree(void)
+{
+ /* Initialize segment sizes */
+ of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
+
+ /* Initialize page sizes */
+ htab_scan_page_sizes();
+}
+
+struct hash_mm_context init_hash_mm_context;
+void __init hash__early_init_mmu(void)
+{
+#ifndef CONFIG_PPC_64K_PAGES
+ /*
+ * We have code in __hash_page_4K() and elsewhere, which assumes it can
+ * do the following:
+ * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+ *
+ * Where the slot number is between 0-15, and values of 8-15 indicate
+ * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+ * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+ * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+ * with a BUILD_BUG_ON().
+ */
+ BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ htab_init_page_sizes();
+
+ /*
+ * initialize page table size
+ */
+ __pte_frag_nr = H_PTE_FRAG_NR;
+ __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = H_PMD_FRAG_NR;
+ __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
+
+ __pte_index_size = H_PTE_INDEX_SIZE;
+ __pmd_index_size = H_PMD_INDEX_SIZE;
+ __pud_index_size = H_PUD_INDEX_SIZE;
+ __pgd_index_size = H_PGD_INDEX_SIZE;
+ __pud_cache_index = H_PUD_CACHE_INDEX;
+ __pte_table_size = H_PTE_TABLE_SIZE;
+ __pmd_table_size = H_PMD_TABLE_SIZE;
+ __pud_table_size = H_PUD_TABLE_SIZE;
+ __pgd_table_size = H_PGD_TABLE_SIZE;
+ /*
+ * 4k use hugepd format, so for hash set then to
+ * zero
+ */
+ __pmd_val_bits = HASH_PMD_VAL_BITS;
+ __pud_val_bits = HASH_PUD_VAL_BITS;
+ __pgd_val_bits = HASH_PGD_VAL_BITS;
+
+ __kernel_virt_start = H_KERN_VIRT_START;
+ __vmalloc_start = H_VMALLOC_START;
+ __vmalloc_end = H_VMALLOC_END;
+ __kernel_io_start = H_KERN_IO_START;
+ __kernel_io_end = H_KERN_IO_END;
+ vmemmap = (struct page *)H_VMEMMAP_START;
+ ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+ pci_io_base = ISA_IO_BASE;
+#endif
+
+ /* Select appropriate backend */
+ if (firmware_has_feature(FW_FEATURE_PS3_LV1))
+ ps3_early_mm_init();
+ else if (firmware_has_feature(FW_FEATURE_LPAR))
+ hpte_init_pseries();
+ else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+ hpte_init_native();
+
+ if (!mmu_hash_ops.hpte_insert)
+ panic("hash__early_init_mmu: No MMU hash ops defined!\n");
+
+ /*
+ * Initialize the MMU Hash table and create the linear mapping
+ * of memory. Has to be done before SLB initialization as this is
+ * currently where the page size encoding is obtained.
+ */
+ htab_initialize();
+
+ init_mm.context.hash_context = &init_hash_mm_context;
+ init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
+
+ pr_info("Initializing hash mmu with SLB\n");
+ /* Initialize SLB management */
+ slb_initialize();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_206)
+ && cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_all();
+}
+
+#ifdef CONFIG_SMP
+void hash__early_init_mmu_secondary(void)
+{
+ /* Initialize hash table for that CPU */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ mtspr(SPRN_SDR1, _SDR1);
+ else
+ mtspr(SPRN_PTCR,
+ __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+ }
+ /* Initialize SLB */
+ slb_initialize();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_206)
+ && cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_all();
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+ struct page *page;
+
+ if (!pfn_valid(pte_pfn(pte)))
+ return pp;
+
+ page = pte_page(pte);
+
+ /* page is dirty */
+ if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+ if (trap == 0x400) {
+ flush_dcache_icache_page(page);
+ set_bit(PG_arch_1, &page->flags);
+ } else
+ pp |= HPTE_R_N;
+ }
+ return pp;
+}
+
+#ifdef CONFIG_PPC_MM_SLICES
+static unsigned int get_paca_psize(unsigned long addr)
+{
+ unsigned char *psizes;
+ unsigned long index, mask_index;
+
+ if (addr < SLICE_LOW_TOP) {
+ psizes = get_paca()->mm_ctx_low_slices_psize;
+ index = GET_LOW_SLICE_INDEX(addr);
+ } else {
+ psizes = get_paca()->mm_ctx_high_slices_psize;
+ index = GET_HIGH_SLICE_INDEX(addr);
+ }
+ mask_index = index & 0x1;
+ return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+ return get_paca()->mm_ctx_user_psize;
+}
+#endif
+
+/*
+ * Demote a segment to using 4k pages.
+ * For now this makes the whole process use 4k pages.
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
+{
+ if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
+ return;
+ slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
+ copro_flush_all_slbs(mm);
+ if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
+
+ copy_mm_to_paca(mm);
+ slb_flush_and_restore_bolted();
+ }
+}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
+ * Userspace sets the subpage permissions using the subpage_prot system call.
+ *
+ * Result is 0: full permissions, _PAGE_RW: read-only,
+ * _PAGE_RWX: no access.
+ */
+static int subpage_protection(struct mm_struct *mm, unsigned long ea)
+{
+ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+ u32 spp = 0;
+ u32 **sbpm, *sbpp;
+
+ if (!spt)
+ return 0;
+
+ if (ea >= spt->maxaddr)
+ return 0;
+ if (ea < 0x100000000UL) {
+ /* addresses below 4GB use spt->low_prot */
+ sbpm = spt->low_prot;
+ } else {
+ sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
+ if (!sbpm)
+ return 0;
+ }
+ sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+ if (!sbpp)
+ return 0;
+ spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
+
+ /* extract 2-bit bitfield for this 4k subpage */
+ spp >>= 30 - 2 * ((ea >> 12) & 0xf);
+
+ /*
+ * 0 -> full premission
+ * 1 -> Read only
+ * 2 -> no access.
+ * We return the flag that need to be cleared.
+ */
+ spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
+ return spp;
+}
+
+#else /* CONFIG_PPC_SUBPAGE_PROT */
+static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
+{
+ return 0;
+}
+#endif
+
+void hash_failure_debug(unsigned long ea, unsigned long access,
+ unsigned long vsid, unsigned long trap,
+ int ssize, int psize, int lpsize, unsigned long pte)
+{
+ if (!printk_ratelimit())
+ return;
+ pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
+ ea, access, current->comm);
+ pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
+ trap, vsid, ssize, psize, lpsize, pte);
+}
+
+static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
+ int psize, bool user_region)
+{
+ if (user_region) {
+ if (psize != get_paca_psize(ea)) {
+ copy_mm_to_paca(mm);
+ slb_flush_and_restore_bolted();
+ }
+ } else if (get_paca()->vmalloc_sllp !=
+ mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+ get_paca()->vmalloc_sllp =
+ mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ slb_vmalloc_update();
+ }
+}
+
+/*
+ * Result code is:
+ * 0 - handled
+ * 1 - normal page fault
+ * -1 - critical hash insertion error
+ * -2 - access not permitted by subpage protection mechanism
+ */
+int hash_page_mm(struct mm_struct *mm, unsigned long ea,
+ unsigned long access, unsigned long trap,
+ unsigned long flags)
+{
+ bool is_thp;
+ enum ctx_state prev_state = exception_enter();
+ pgd_t *pgdir;
+ unsigned long vsid;
+ pte_t *ptep;
+ unsigned hugeshift;
+ int rc, user_region = 0;
+ int psize, ssize;
+
+ DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
+ ea, access, trap);
+ trace_hash_fault(ea, access, trap);
+
+ /* Get region & vsid */
+ switch (get_region_id(ea)) {
+ case USER_REGION_ID:
+ user_region = 1;
+ if (! mm) {
+ DBG_LOW(" user region with no mm !\n");
+ rc = 1;
+ goto bail;
+ }
+ psize = get_slice_psize(mm, ea);
+ ssize = user_segment_size(ea);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
+ break;
+ case VMALLOC_REGION_ID:
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ psize = mmu_vmalloc_psize;
+ ssize = mmu_kernel_ssize;
+ break;
+
+ case IO_REGION_ID:
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ psize = mmu_io_psize;
+ ssize = mmu_kernel_ssize;
+ break;
+ default:
+ /*
+ * Not a valid range
+ * Send the problem up to do_page_fault()
+ */
+ rc = 1;
+ goto bail;
+ }
+ DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+
+ /* Bad address. */
+ if (!vsid) {
+ DBG_LOW("Bad address!\n");
+ rc = 1;
+ goto bail;
+ }
+ /* Get pgdir */
+ pgdir = mm->pgd;
+ if (pgdir == NULL) {
+ rc = 1;
+ goto bail;
+ }
+
+ /* Check CPU locality */
+ if (user_region && mm_is_thread_local(mm))
+ flags |= HPTE_LOCAL_UPDATE;
+
+#ifndef CONFIG_PPC_64K_PAGES
+ /*
+ * If we use 4K pages and our psize is not 4K, then we might
+ * be hitting a special driver mapping, and need to align the
+ * address before we fetch the PTE.
+ *
+ * It could also be a hugepage mapping, in which case this is
+ * not necessary, but it's not harmful, either.
+ */
+ if (psize != MMU_PAGE_4K)
+ ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ /* Get PTE and page size from page tables */
+ ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
+ if (ptep == NULL || !pte_present(*ptep)) {
+ DBG_LOW(" no PTE !\n");
+ rc = 1;
+ goto bail;
+ }
+
+ /* Add _PAGE_PRESENT to the required access perm */
+ access |= _PAGE_PRESENT;
+
+ /*
+ * Pre-check access permissions (will be re-checked atomically
+ * in __hash_page_XX but this pre-check is a fast path
+ */
+ if (!check_pte_access(access, pte_val(*ptep))) {
+ DBG_LOW(" no access !\n");
+ rc = 1;
+ goto bail;
+ }
+
+ if (hugeshift) {
+ if (is_thp)
+ rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+ trap, flags, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+ else
+ rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+ flags, ssize, hugeshift, psize);
+#else
+ else {
+ /*
+ * if we have hugeshift, and is not transhuge with
+ * hugetlb disabled, something is really wrong.
+ */
+ rc = 1;
+ WARN_ON(1);
+ }
+#endif
+ if (current->mm == mm)
+ check_paca_psize(ea, mm, psize, user_region);
+
+ goto bail;
+ }
+
+#ifndef CONFIG_PPC_64K_PAGES
+ DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+ DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+ pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+ /* Do actual hashing */
+#ifdef CONFIG_PPC_64K_PAGES
+ /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
+ if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+ demote_segment_4k(mm, ea);
+ psize = MMU_PAGE_4K;
+ }
+
+ /*
+ * If this PTE is non-cacheable and we have restrictions on
+ * using non cacheable large pages, then we switch to 4k
+ */
+ if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
+ if (user_region) {
+ demote_segment_4k(mm, ea);
+ psize = MMU_PAGE_4K;
+ } else if (ea < VMALLOC_END) {
+ /*
+ * some driver did a non-cacheable mapping
+ * in vmalloc space, so switch vmalloc
+ * to 4k pages
+ */
+ printk(KERN_ALERT "Reducing vmalloc segment "
+ "to 4kB pages because of "
+ "non-cacheable mapping\n");
+ psize = mmu_vmalloc_psize = MMU_PAGE_4K;
+ copro_flush_all_slbs(mm);
+ }
+ }
+
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ if (current->mm == mm)
+ check_paca_psize(ea, mm, psize, user_region);
+
+#ifdef CONFIG_PPC_64K_PAGES
+ if (psize == MMU_PAGE_64K)
+ rc = __hash_page_64K(ea, access, vsid, ptep, trap,
+ flags, ssize);
+ else
+#endif /* CONFIG_PPC_64K_PAGES */
+ {
+ int spp = subpage_protection(mm, ea);
+ if (access & spp)
+ rc = -2;
+ else
+ rc = __hash_page_4K(ea, access, vsid, ptep, trap,
+ flags, ssize, spp);
+ }
+
+ /*
+ * Dump some info in case of hash insertion failure, they should
+ * never happen so it is really useful to know if/when they do
+ */
+ if (rc == -1)
+ hash_failure_debug(ea, access, vsid, trap, ssize, psize,
+ psize, pte_val(*ptep));
+#ifndef CONFIG_PPC_64K_PAGES
+ DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
+#else
+ DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
+ pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+ DBG_LOW(" -> rc=%d\n", rc);
+
+bail:
+ exception_exit(prev_state);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(hash_page_mm);
+
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
+ unsigned long dsisr)
+{
+ unsigned long flags = 0;
+ struct mm_struct *mm = current->mm;
+
+ if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
+ (get_region_id(ea) == IO_REGION_ID))
+ mm = &init_mm;
+
+ if (dsisr & DSISR_NOHPTE)
+ flags |= HPTE_NOHPTE_UPDATE;
+
+ return hash_page_mm(mm, ea, access, trap, flags);
+}
+EXPORT_SYMBOL_GPL(hash_page);
+
+int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
+ unsigned long dsisr)
+{
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ;
+ unsigned long flags = 0;
+ struct mm_struct *mm = current->mm;
+ unsigned int region_id = get_region_id(ea);
+
+ if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
+ mm = &init_mm;
+
+ if (dsisr & DSISR_NOHPTE)
+ flags |= HPTE_NOHPTE_UPDATE;
+
+ if (dsisr & DSISR_ISSTORE)
+ access |= _PAGE_WRITE;
+ /*
+ * We set _PAGE_PRIVILEGED only when
+ * kernel mode access kernel space.
+ *
+ * _PAGE_PRIVILEGED is NOT set
+ * 1) when kernel mode access user space
+ * 2) user space access kernel space.
+ */
+ access |= _PAGE_PRIVILEGED;
+ if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
+ access &= ~_PAGE_PRIVILEGED;
+
+ if (trap == 0x400)
+ access |= _PAGE_EXEC;
+
+ return hash_page_mm(mm, ea, access, trap, flags);
+}
+
+#ifdef CONFIG_PPC_MM_SLICES
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ int psize = get_slice_psize(mm, ea);
+
+ /* We only prefault standard pages for now */
+ if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
+ return false;
+
+ /*
+ * Don't prefault if subpage protection is enabled for the EA.
+ */
+ if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
+ return false;
+
+ return true;
+}
+#else
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ return true;
+}
+#endif
+
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+ bool is_exec, unsigned long trap)
+{
+ int hugepage_shift;
+ unsigned long vsid;
+ pgd_t *pgdir;
+ pte_t *ptep;
+ unsigned long flags;
+ int rc, ssize, update_flags = 0;
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
+
+ BUG_ON(get_region_id(ea) != USER_REGION_ID);
+
+ if (!should_hash_preload(mm, ea))
+ return;
+
+ DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
+ " trap=%lx\n", mm, mm->pgd, ea, access, trap);
+
+ /* Get Linux PTE if available */
+ pgdir = mm->pgd;
+ if (pgdir == NULL)
+ return;
+
+ /* Get VSID */
+ ssize = user_segment_size(ea);
+ vsid = get_user_vsid(&mm->context, ea, ssize);
+ if (!vsid)
+ return;
+ /*
+ * Hash doesn't like irqs. Walking linux page table with irq disabled
+ * saves us from holding multiple locks.
+ */
+ local_irq_save(flags);
+
+ /*
+ * THP pages use update_mmu_cache_pmd. We don't do
+ * hash preload there. Hence can ignore THP here
+ */
+ ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
+ if (!ptep)
+ goto out_exit;
+
+ WARN_ON(hugepage_shift);
+#ifdef CONFIG_PPC_64K_PAGES
+ /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
+ * a 64K kernel), then we don't preload, hash_page() will take
+ * care of it once we actually try to access the page.
+ * That way we don't have to duplicate all of the logic for segment
+ * page size demotion here
+ */
+ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
+ goto out_exit;
+#endif /* CONFIG_PPC_64K_PAGES */
+
+ /* Is that local to this CPU ? */
+ if (mm_is_thread_local(mm))
+ update_flags |= HPTE_LOCAL_UPDATE;
+
+ /* Hash it in */
+#ifdef CONFIG_PPC_64K_PAGES
+ if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
+ rc = __hash_page_64K(ea, access, vsid, ptep, trap,
+ update_flags, ssize);
+ else
+#endif /* CONFIG_PPC_64K_PAGES */
+ rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
+ ssize, subpage_protection(mm, ea));
+
+ /* Dump some info in case of hash insertion failure, they should
+ * never happen so it is really useful to know if/when they do
+ */
+ if (rc == -1)
+ hash_failure_debug(ea, access, vsid, trap, ssize,
+ mm_ctx_user_psize(&mm->context),
+ mm_ctx_user_psize(&mm->context),
+ pte_val(*ptep));
+out_exit:
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PPC_MEM_KEYS
+/*
+ * Return the protection key associated with the given address and the
+ * mm_struct.
+ */
+u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
+{
+ pte_t *ptep;
+ u16 pkey = 0;
+ unsigned long flags;
+
+ if (!mm || !mm->pgd)
+ return 0;
+
+ local_irq_save(flags);
+ ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
+ if (ptep)
+ pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
+ local_irq_restore(flags);
+
+ return pkey;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void tm_flush_hash_page(int local)
+{
+ /*
+ * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a
+ * page back to a block device w/PIO could pick up transactional data
+ * (bad!) so we force an abort here. Before the sync the page will be
+ * made read-only, which will flush_hash_page. BIG ISSUE here: if the
+ * kernel uses a page from userspace without unmapping it first, it may
+ * see the speculated version.
+ */
+ if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+ MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ tm_enable();
+ tm_abort(TM_CAUSE_TLBI);
+ }
+}
+#else
+static inline void tm_flush_hash_page(int local)
+{
+}
+#endif
+
+/*
+ * Return the global hash slot, corresponding to the given PTE, which contains
+ * the HPTE.
+ */
+unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
+ int ssize, real_pte_t rpte, unsigned int subpg_index)
+{
+ unsigned long hash, gslot, hidx;
+
+ hash = hpt_hash(vpn, shift, ssize);
+ hidx = __rpte_to_hidx(rpte, subpg_index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ gslot += hidx & _PTEIDX_GROUP_IX;
+ return gslot;
+}
+
+/*
+ * WARNING: This is called from hash_low_64.S, if you change this prototype,
+ * do not forget to update the assembly call site !
+ */
+void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
+ unsigned long flags)
+{
+ unsigned long index, shift, gslot;
+ int local = flags & HPTE_LOCAL_UPDATE;
+
+ DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
+ pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+ gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
+ DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
+ /*
+ * We use same base page size and actual psize, because we don't
+ * use these functions for hugepage
+ */
+ mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
+ ssize, local);
+ } pte_iterate_hashed_end();
+
+ tm_flush_hash_page(local);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
+ pmd_t *pmdp, unsigned int psize, int ssize,
+ unsigned long flags)
+{
+ int i, max_hpte_count, valid;
+ unsigned long s_addr;
+ unsigned char *hpte_slot_array;
+ unsigned long hidx, shift, vpn, hash, slot;
+ int local = flags & HPTE_LOCAL_UPDATE;
+
+ s_addr = addr & HPAGE_PMD_MASK;
+ hpte_slot_array = get_hpte_slot_array(pmdp);
+ /*
+ * IF we try to do a HUGE PTE update after a withdraw is done.
+ * we will find the below NULL. This happens when we do
+ * split_huge_page_pmd
+ */
+ if (!hpte_slot_array)
+ return;
+
+ if (mmu_hash_ops.hugepage_invalidate) {
+ mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
+ psize, ssize, local);
+ goto tm_abort;
+ }
+ /*
+ * No bluk hpte removal support, invalidate each entry
+ */
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = HPAGE_PMD_SIZE >> shift;
+ for (i = 0; i < max_hpte_count; i++) {
+ /*
+ * 8 bits per each hpte entries
+ * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+ */
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
+ MMU_PAGE_16M, ssize, local);
+ }
+tm_abort:
+ tm_flush_hash_page(local);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void flush_hash_range(unsigned long number, int local)
+{
+ if (mmu_hash_ops.flush_hash_range)
+ mmu_hash_ops.flush_hash_range(number, local);
+ else {
+ int i;
+ struct ppc64_tlb_batch *batch =
+ this_cpu_ptr(&ppc64_tlb_batch);
+
+ for (i = 0; i < number; i++)
+ flush_hash_page(batch->vpn[i], batch->pte[i],
+ batch->psize, batch->ssize, local);
+ }
+}
+
+/*
+ * low_hash_fault is called when we the low level hash code failed
+ * to instert a PTE due to an hypervisor error
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
+{
+ enum ctx_state prev_state = exception_enter();
+
+ if (user_mode(regs)) {
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ if (rc == -2)
+ _exception(SIGSEGV, regs, SEGV_ACCERR, address);
+ else
+#endif
+ _exception(SIGBUS, regs, BUS_ADRERR, address);
+ } else
+ bad_page_fault(regs, address, SIGBUS);
+
+ exception_exit(prev_state);
+}
+
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+ unsigned long pa, unsigned long rflags,
+ unsigned long vflags, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ long slot;
+
+repeat:
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+ /* Insert into the hash table, primary slot */
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
+ psize, psize, ssize);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
+ vflags | HPTE_V_SECONDARY,
+ psize, psize, ssize);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = (hash & htab_hash_mask) *
+ HPTES_PER_GROUP;
+
+ mmu_hash_ops.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ return slot;
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+ unsigned long hash;
+ unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+ unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
+ long ret;
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+ /* Don't create HPTE entries for bad address */
+ if (!vsid)
+ return;
+
+ ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+ HPTE_V_BOLTED,
+ mmu_linear_psize, mmu_kernel_ssize);
+
+ BUG_ON (ret < 0);
+ spin_lock(&linear_map_hash_lock);
+ BUG_ON(linear_map_hash_slots[lmi] & 0x80);
+ linear_map_hash_slots[lmi] = ret | 0x80;
+ spin_unlock(&linear_map_hash_lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+ unsigned long hash, hidx, slot;
+ unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+ unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+
+ hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+ spin_lock(&linear_map_hash_lock);
+ BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+ hidx = linear_map_hash_slots[lmi] & 0x7f;
+ linear_map_hash_slots[lmi] = 0;
+ spin_unlock(&linear_map_hash_lock);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+ mmu_linear_psize,
+ mmu_kernel_ssize, 0);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long flags, vaddr, lmi;
+ int i;
+
+ local_irq_save(flags);
+ for (i = 0; i < numpages; i++, page++) {
+ vaddr = (unsigned long)page_address(page);
+ lmi = __pa(vaddr) >> PAGE_SHIFT;
+ if (lmi >= linear_map_hash_count)
+ continue;
+ if (enable)
+ kernel_map_linear_page(vaddr, lmi);
+ else
+ kernel_unmap_linear_page(vaddr, lmi);
+ }
+ local_irq_restore(flags);
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /*
+ * We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /*
+ * On virtualized systems the first entry is our RMA region aka VRMA,
+ * non-virtualized 64-bit hash MMU systems don't have a limitation
+ * on real mode access.
+ *
+ * For guests on platforms before POWER9, we clamp the it limit to 1G
+ * to avoid some funky things such as RTAS bugs etc...
+ */
+ if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
+ ppc64_rma_size = first_memblock_size;
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
+ ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
+
+ /* Finally limit subsequent allocations */
+ memblock_set_current_limit(ppc64_rma_size);
+ } else {
+ ppc64_rma_size = ULONG_MAX;
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int hpt_order_get(void *data, u64 *val)
+{
+ *val = ppc64_pft_size;
+ return 0;
+}
+
+static int hpt_order_set(void *data, u64 val)
+{
+ if (!mmu_hash_ops.resize_hpt)
+ return -ENODEV;
+
+ return mmu_hash_ops.resize_hpt(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+
+static int __init hash64_debugfs(void)
+{
+ if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
+ NULL, &fops_hpt_order)) {
+ pr_err("lpar: unable to create hpt_order debugsfs file\n");
+ }
+
+ return 0;
+}
+machine_device_initcall(pseries, hash64_debugfs);
+#endif /* CONFIG_DEBUG_FS */
--- /dev/null
+/*
+ * IOMMU helpers in MMU context.
+ *
+ * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/sizes.h>
+#include <asm/mmu_context.h>
+#include <asm/pte-walk.h>
+#include <linux/mm_inline.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
+
+struct mm_iommu_table_group_mem_t {
+ struct list_head next;
+ struct rcu_head rcu;
+ unsigned long used;
+ atomic64_t mapped;
+ unsigned int pageshift;
+ u64 ua; /* userspace address */
+ u64 entries; /* number of entries in hpas/hpages[] */
+ /*
+ * in mm_iommu_get we temporarily use this to store
+ * struct page address.
+ *
+ * We need to convert ua to hpa in real mode. Make it
+ * simpler by storing physical address.
+ */
+ union {
+ struct page **hpages; /* vmalloc'ed */
+ phys_addr_t *hpas;
+ };
+#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
+ u64 dev_hpa; /* Device memory base address */
+};
+
+static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+ unsigned long npages, bool incr)
+{
+ long ret = 0, locked, lock_limit;
+
+ if (!npages)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+
+ if (incr) {
+ locked = mm->locked_vm + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ ret = -ENOMEM;
+ else
+ mm->locked_vm += npages;
+ } else {
+ if (WARN_ON_ONCE(npages > mm->locked_vm))
+ npages = mm->locked_vm;
+ mm->locked_vm -= npages;
+ }
+
+ pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
+ current ? current->pid : 0,
+ incr ? '+' : '-',
+ npages << PAGE_SHIFT,
+ mm->locked_vm << PAGE_SHIFT,
+ rlimit(RLIMIT_MEMLOCK));
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+bool mm_iommu_preregistered(struct mm_struct *mm)
+{
+ return !list_empty(&mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ long i, ret, locked_entries = 0;
+ unsigned int pageshift;
+
+ mutex_lock(&mem_list_mutex);
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
+ next) {
+ /* Overlap? */
+ if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+ (ua < (mem->ua +
+ (mem->entries << PAGE_SHIFT)))) {
+ ret = -EINVAL;
+ goto unlock_exit;
+ }
+
+ }
+
+ if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+ ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+ if (ret)
+ goto unlock_exit;
+
+ locked_entries = entries;
+ }
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem) {
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+
+ if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
+ mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+ mem->dev_hpa = dev_hpa;
+ goto good_exit;
+ }
+ mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
+
+ /*
+ * For a starting point for a maximum page size calculation
+ * we use @ua and @entries natural alignment to allow IOMMU pages
+ * smaller than huge pages but still bigger than PAGE_SIZE.
+ */
+ mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
+ mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
+ if (!mem->hpas) {
+ kfree(mem);
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+ up_read(&mm->mmap_sem);
+ if (ret != entries) {
+ /* free the reference taken */
+ for (i = 0; i < ret; i++)
+ put_page(mem->hpages[i]);
+
+ vfree(mem->hpas);
+ kfree(mem);
+ ret = -EFAULT;
+ goto unlock_exit;
+ }
+
+ pageshift = PAGE_SHIFT;
+ for (i = 0; i < entries; ++i) {
+ struct page *page = mem->hpages[i];
+
+ /*
+ * Allow to use larger than 64k IOMMU pages. Only do that
+ * if we are backed by hugetlb.
+ */
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
+ struct page *head = compound_head(page);
+
+ pageshift = compound_order(head) + PAGE_SHIFT;
+ }
+ mem->pageshift = min(mem->pageshift, pageshift);
+ /*
+ * We don't need struct page reference any more, switch
+ * to physical address.
+ */
+ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+ }
+
+good_exit:
+ ret = 0;
+ atomic64_set(&mem->mapped, 1);
+ mem->used = 1;
+ mem->ua = ua;
+ mem->entries = entries;
+ *pmem = mem;
+
+ list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
+
+unlock_exit:
+ if (locked_entries && ret)
+ mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+
+ mutex_unlock(&mem_list_mutex);
+
+ return ret;
+}
+
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
+ pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_new);
+
+long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_newdev);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+ long i;
+ struct page *page = NULL;
+
+ if (!mem->hpas)
+ return;
+
+ for (i = 0; i < mem->entries; ++i) {
+ if (!mem->hpas[i])
+ continue;
+
+ page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+ if (!page)
+ continue;
+
+ if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+ SetPageDirty(page);
+
+ put_page(page);
+ mem->hpas[i] = 0;
+ }
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+ mm_iommu_unpin(mem);
+ vfree(mem->hpas);
+ kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+ struct mm_iommu_table_group_mem_t *mem = container_of(head,
+ struct mm_iommu_table_group_mem_t, rcu);
+
+ mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+ list_del_rcu(&mem->next);
+ call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
+{
+ long ret = 0;
+ unsigned long entries, dev_hpa;
+
+ mutex_lock(&mem_list_mutex);
+
+ if (mem->used == 0) {
+ ret = -ENOENT;
+ goto unlock_exit;
+ }
+
+ --mem->used;
+ /* There are still users, exit */
+ if (mem->used)
+ goto unlock_exit;
+
+ /* Are there still mappings? */
+ if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
+ ++mem->used;
+ ret = -EBUSY;
+ goto unlock_exit;
+ }
+
+ /* @mapped became 0 so now mappings are disabled, release the region */
+ entries = mem->entries;
+ dev_hpa = mem->dev_hpa;
+ mm_iommu_release(mem);
+
+ if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+ mm_iommu_adjust_locked_vm(mm, entries, false);
+
+unlock_exit:
+ mutex_unlock(&mem_list_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+ unsigned long ua, unsigned long size)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if ((mem->ua <= ua) &&
+ (ua + size <= mem->ua +
+ (mem->entries << PAGE_SHIFT))) {
+ ret = mem;
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+ unsigned long ua, unsigned long size)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+ next) {
+ if ((mem->ua <= ua) &&
+ (ua + size <= mem->ua +
+ (mem->entries << PAGE_SHIFT))) {
+ ret = mem;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
+ unsigned long ua, unsigned long entries)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ mutex_lock(&mem_list_mutex);
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if ((mem->ua == ua) && (mem->entries == entries)) {
+ ret = mem;
+ ++mem->used;
+ break;
+ }
+ }
+
+ mutex_unlock(&mem_list_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+ u64 *va;
+
+ if (entry >= mem->entries)
+ return -EFAULT;
+
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
+ if (!mem->hpas) {
+ *hpa = mem->dev_hpa + (ua - mem->ua);
+ return 0;
+ }
+
+ va = &mem->hpas[entry];
+ *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+ unsigned long *pa;
+
+ if (entry >= mem->entries)
+ return -EFAULT;
+
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
+ if (!mem->hpas) {
+ *hpa = mem->dev_hpa + (ua - mem->ua);
+ return 0;
+ }
+
+ pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
+ if (!pa)
+ return -EFAULT;
+
+ *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+ return 0;
+}
+
+extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ long entry;
+ void *va;
+ unsigned long *pa;
+
+ mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
+ if (!mem)
+ return;
+
+ if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
+ return;
+
+ entry = (ua - mem->ua) >> PAGE_SHIFT;
+ va = &mem->hpas[entry];
+
+ pa = (void *) vmalloc_to_phys(va);
+ if (!pa)
+ return;
+
+ *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+}
+
+bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+ unsigned int pageshift, unsigned long *size)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ unsigned long end;
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+ continue;
+
+ end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+ if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+ /*
+ * Since the IOMMU page size might be bigger than
+ * PAGE_SIZE, the amount of preregistered memory
+ * starting from @hpa might be smaller than 1<<pageshift
+ * and the caller needs to distinguish this situation.
+ */
+ *size = min(1UL << pageshift, end - hpa);
+ return true;
+ }
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+ if (atomic64_inc_not_zero(&mem->mapped))
+ return 0;
+
+ /* Last mm_iommu_put() has been called, no more mappings allowed() */
+ return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+ atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+}
--- /dev/null
+/*
+ * MMU context allocation for 64-bit kernels.
+ *
+ * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/pkeys.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+
+static DEFINE_IDA(mmu_context_ida);
+
+static int alloc_context_id(int min_id, int max_id)
+{
+ return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
+}
+
+void hash__reserve_context_id(int id)
+{
+ int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
+
+ WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+ unsigned long max;
+
+ if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+ max = MAX_USER_CONTEXT;
+ else
+ max = MAX_USER_CONTEXT_65BIT_VA;
+
+ return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+void slb_setup_new_exec(void);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+ int index;
+
+ index = hash__alloc_context_id();
+ if (index < 0)
+ return index;
+
+ mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
+ GFP_KERNEL);
+ if (!mm->context.hash_context) {
+ ida_free(&mmu_context_ida, index);
+ return -ENOMEM;
+ }
+
+ /*
+ * The old code would re-promote on fork, we don't do that when using
+ * slices as it could cause problem promoting slices that have been
+ * forced down to 4K.
+ *
+ * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+ * explicitly against context.id == 0. This ensures that we properly
+ * initialize context slice details for newly allocated mm's (which will
+ * have id == 0) and don't alter context slice inherited via fork (which
+ * will have id != 0).
+ *
+ * We should not be calling init_new_context() on init_mm. Hence a
+ * check against 0 is OK.
+ */
+ if (mm->context.id == 0) {
+ memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
+ slice_init_new_context_exec(mm);
+ } else {
+ /* This is fork. Copy hash_context details from current->mm */
+ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ /* inherit subpage prot detalis if we have one. */
+ if (current->mm->context.hash_context->spt) {
+ mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
+ GFP_KERNEL);
+ if (!mm->context.hash_context->spt) {
+ ida_free(&mmu_context_ida, index);
+ kfree(mm->context.hash_context);
+ return -ENOMEM;
+ }
+ }
+#endif
+
+ }
+
+ pkey_mm_init(mm);
+ return index;
+}
+
+void hash__setup_new_exec(void)
+{
+ slice_setup_new_exec();
+
+ slb_setup_new_exec();
+}
+
+static int radix__init_new_context(struct mm_struct *mm)
+{
+ unsigned long rts_field;
+ int index, max_id;
+
+ max_id = (1 << mmu_pid_bits) - 1;
+ index = alloc_context_id(mmu_base_pid, max_id);
+ if (index < 0)
+ return index;
+
+ /*
+ * set the process table entry,
+ */
+ rts_field = radix__get_tree_size();
+ process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
+
+ /*
+ * Order the above store with subsequent update of the PID
+ * register (at which point HW can start loading/caching
+ * the entry) and the corresponding load by the MMU from
+ * the L2 cache.
+ */
+ asm volatile("ptesync;isync" : : : "memory");
+
+ mm->context.npu_context = NULL;
+ mm->context.hash_context = NULL;
+
+ return index;
+}
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ int index;
+
+ if (radix_enabled())
+ index = radix__init_new_context(mm);
+ else
+ index = hash__init_new_context(mm);
+
+ if (index < 0)
+ return index;
+
+ mm->context.id = index;
+
+ mm->context.pte_frag = NULL;
+ mm->context.pmd_frag = NULL;
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+ mm_iommu_init(mm);
+#endif
+ atomic_set(&mm->context.active_cpus, 0);
+ atomic_set(&mm->context.copros, 0);
+
+ return 0;
+}
+
+void __destroy_context(int context_id)
+{
+ ida_free(&mmu_context_ida, context_id);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
+
+static void destroy_contexts(mm_context_t *ctx)
+{
+ int index, context_id;
+
+ for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
+ context_id = ctx->extended_id[index];
+ if (context_id)
+ ida_free(&mmu_context_ida, context_id);
+ }
+ kfree(ctx->hash_context);
+}
+
+static void pmd_frag_destroy(void *pmd_frag)
+{
+ int count;
+ struct page *page;
+
+ page = virt_to_page(pmd_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
+ pgtable_pmd_page_dtor(page);
+ __free_page(page);
+ }
+}
+
+static void destroy_pagetable_cache(struct mm_struct *mm)
+{
+ void *frag;
+
+ frag = mm->context.pte_frag;
+ if (frag)
+ pte_frag_destroy(frag);
+
+ frag = mm->context.pmd_frag;
+ if (frag)
+ pmd_frag_destroy(frag);
+ return;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+ WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
+#endif
+ if (radix_enabled())
+ WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+ else
+ subpage_prot_free(mm);
+ destroy_contexts(&mm->context);
+ mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+ destroy_pagetable_cache(mm);
+
+ if (radix_enabled()) {
+ /*
+ * Radix doesn't have a valid bit in the process table
+ * entries. However we know that at least P9 implementation
+ * will avoid caching an entry with an invalid RTS field,
+ * and 0 is invalid. So this will do.
+ *
+ * This runs before the "fullmm" tlb flush in exit_mmap,
+ * which does a RIC=2 tlbie to clear the process table
+ * entry. See the "fullmm" comments in tlb-radix.c.
+ *
+ * No barrier required here after the store because
+ * this process will do the invalidate, which starts with
+ * ptesync.
+ */
+ process_tb[mm->context.id].prtb0 = 0;
+ }
+}
+
+#ifdef CONFIG_PPC_RADIX_MMU
+void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
+{
+ mtspr(SPRN_PID, next->context.id);
+ isync();
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+#include <linux/memblock.h>
+#include <misc/cxl-base.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/trace.h>
+#include <asm/powernv.h>
+
+#include <mm/mmu_decl.h>
+#include <trace/events/thp.h>
+
+unsigned long __pmd_frag_nr;
+EXPORT_SYMBOL(__pmd_frag_nr);
+unsigned long __pmd_frag_size_shift;
+EXPORT_SYMBOL(__pmd_frag_size_shift);
+
+int (*register_process_table)(unsigned long base, unsigned long page_size,
+ unsigned long tbl_size);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp, pmd_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
+#endif
+ changed = !pmd_same(*(pmdp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_2M here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
+ pmd_pte(entry), address, MMU_PAGE_2M);
+ }
+ return changed;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+ WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd)));
+#endif
+ trace_hugepage_set_pmd(addr, pmd_val(pmd));
+ return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+static void do_nothing(void *unused)
+{
+
+}
+/*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+ smp_mb();
+ smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ */
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ unsigned long old_pmd;
+
+ old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * This ensures that generic code that rely on IRQ disabling
+ * to prevent a parallel THP split work as expected.
+ */
+ serialize_against_pte_lookup(vma->vm_mm);
+ return __pmd(old_pmd);
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+ return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pmdv;
+
+ pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+ return pmd_set_protbits(__pmd(pmdv), pgprot);
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ unsigned long pmdv;
+
+ pmdv = pmd_val(pmd);
+ pmdv &= _HPAGE_CHG_MASK;
+ return pmd_set_protbits(__pmd(pmdv), newprot);
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ if (radix_enabled())
+ prefetch((void *)addr);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/* For use by kexec */
+void mmu_cleanup_all(void)
+{
+ if (radix_enabled())
+ radix__mmu_cleanup_all();
+ else if (mmu_hash_ops.hpte_clear_all)
+ mmu_hash_ops.hpte_clear_all();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+ if (radix_enabled())
+ return radix__create_section_mapping(start, end, nid);
+
+ return hash__create_section_mapping(start, end, nid);
+}
+
+int __meminit remove_section_mapping(unsigned long start, unsigned long end)
+{
+ if (radix_enabled())
+ return radix__remove_section_mapping(start, end);
+
+ return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void __init mmu_partition_table_init(void)
+{
+ unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+ unsigned long ptcr;
+
+ BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+ /* Initialize the Partition Table with no entries */
+ partition_tb = memblock_alloc(patb_size, patb_size);
+ if (!partition_tb)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, patb_size, patb_size);
+
+ /*
+ * update partition table control register,
+ * 64 K size.
+ */
+ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+ mtspr(SPRN_PTCR, ptcr);
+ powernv_set_nmmu_ptcr(ptcr);
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+ unsigned long dw1)
+{
+ unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+ partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+ partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+ /*
+ * Global flush of TLBs and partition table caches for this lpid.
+ * The type of flush (hash or radix) depends on what the previous
+ * use of this partition ID was, not the new use.
+ */
+ asm volatile("ptesync" : : : "memory");
+ if (old & PATB_HR) {
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
+ } else {
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+ }
+ /* do we need fixup here ?*/
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+
+static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
+{
+ void *pmd_frag, *ret;
+
+ if (PMD_FRAG_NR == 1)
+ return NULL;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pmd_frag;
+ if (ret) {
+ pmd_frag = ret + PMD_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
+ pmd_frag = NULL;
+ mm->context.pmd_frag = pmd_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pmd_t *)ret;
+}
+
+static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
+{
+ void *ret = NULL;
+ struct page *page;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ page = alloc_page(gfp);
+ if (!page)
+ return NULL;
+ if (!pgtable_pmd_page_ctor(page)) {
+ __free_pages(page, 0);
+ return NULL;
+ }
+
+ atomic_set(&page->pt_frag_refcount, 1);
+
+ ret = page_address(page);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PMD_FRAG_NR == 1)
+ return ret;
+
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pmd_frag)) {
+ atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pmd_t *)ret;
+}
+
+pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
+{
+ pmd_t *pmd;
+
+ pmd = get_pmd_from_cache(mm);
+ if (pmd)
+ return pmd;
+
+ return __alloc_for_pmdcache(mm);
+}
+
+void pmd_fragment_free(unsigned long *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+
+ BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+ pgtable_pmd_page_dtor(page);
+ __free_page(page);
+ }
+}
+
+static inline void pgtable_free(void *table, int index)
+{
+ switch (index) {
+ case PTE_INDEX:
+ pte_fragment_free(table, 0);
+ break;
+ case PMD_INDEX:
+ pmd_fragment_free(table);
+ break;
+ case PUD_INDEX:
+ kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+ break;
+#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
+ /* 16M hugepd directory at pud level */
+ case HTLB_16M_INDEX:
+ BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
+ kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
+ break;
+ /* 16G hugepd directory at the pgd level */
+ case HTLB_16G_INDEX:
+ BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
+ kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
+ break;
+#endif
+ /* We don't free pgd table via RCU callback */
+ default:
+ BUG();
+ }
+}
+
+#ifdef CONFIG_SMP
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= index;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ return pgtable_free(table, index);
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ return pgtable_free(table, index);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+ /*
+ * Hash maps the memory with one size mmu_linear_psize.
+ * So don't bother to print these on hash
+ */
+ if (!radix_enabled())
+ return;
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
+ seq_printf(m, "DirectMap64k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
+}
+#endif /* CONFIG_PROC_FS */
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
+{
+ unsigned long pte_val;
+
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+ return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (radix_enabled())
+ return radix__ptep_modify_prot_commit(vma, addr,
+ ptep, old_pte, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
+/*
+ * For hash translation mode, we use the deposited table to store hash slot
+ * information and they are stored at PTRS_PER_PMD offset from related pmd
+ * location. Hence a pmd move requires deposit and withdraw.
+ *
+ * For radix translation with split pmd ptl, we store the deposited table in the
+ * pmd page. Hence if we have different pmd page we need to withdraw during pmd
+ * move.
+ *
+ * With hash we use deposited table always irrespective of anon or not.
+ * With radix we use deposited table only for anonymous mapping.
+ */
+int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+ struct spinlock *old_pmd_ptl,
+ struct vm_area_struct *vma)
+{
+ if (radix_enabled())
+ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+
+ return true;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PowerPC Memory Protection Keys management
+ *
+ * Copyright 2017, Ram Pai, IBM Corporation.
+ */
+
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mmu.h>
+#include <asm/setup.h>
+#include <linux/pkeys.h>
+#include <linux/of_device.h>
+
+DEFINE_STATIC_KEY_TRUE(pkey_disabled);
+int pkeys_total; /* Total pkeys as per device tree */
+u32 initial_allocation_mask; /* Bits set for the initially allocated keys */
+u32 reserved_allocation_mask; /* Bits set for reserved keys */
+static bool pkey_execute_disable_supported;
+static bool pkeys_devtree_defined; /* property exported by device tree */
+static u64 pkey_amr_mask; /* Bits in AMR not to be touched */
+static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
+static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
+static int execute_only_key = 2;
+
+#define AMR_BITS_PER_PKEY 2
+#define AMR_RD_BIT 0x1UL
+#define AMR_WR_BIT 0x2UL
+#define IAMR_EX_BIT 0x1UL
+#define PKEY_REG_BITS (sizeof(u64)*8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
+
+static void scan_pkey_feature(void)
+{
+ u32 vals[2];
+ struct device_node *cpu;
+
+ cpu = of_find_node_by_type(NULL, "cpu");
+ if (!cpu)
+ return;
+
+ if (of_property_read_u32_array(cpu,
+ "ibm,processor-storage-keys", vals, 2))
+ return;
+
+ /*
+ * Since any pkey can be used for data or execute, we will just treat
+ * all keys as equal and track them as one entity.
+ */
+ pkeys_total = vals[0];
+ pkeys_devtree_defined = true;
+}
+
+static inline bool pkey_mmu_enabled(void)
+{
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return pkeys_total;
+ else
+ return cpu_has_feature(CPU_FTR_PKEY);
+}
+
+static int pkey_initialize(void)
+{
+ int os_reserved, i;
+
+ /*
+ * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
+ * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
+ * Ensure that the bits a distinct.
+ */
+ BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
+ (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ /*
+ * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
+ * in the vmaflag. Make sure that is really the case.
+ */
+ BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
+ __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
+ != (sizeof(u64) * BITS_PER_BYTE));
+
+ /* scan the device tree for pkey feature */
+ scan_pkey_feature();
+
+ /*
+ * Let's assume 32 pkeys on P8 bare metal, if its not defined by device
+ * tree. We make this exception since skiboot forgot to expose this
+ * property on power8.
+ */
+ if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
+ cpu_has_feature(CPU_FTRS_POWER8))
+ pkeys_total = 32;
+
+ /*
+ * Adjust the upper limit, based on the number of bits supported by
+ * arch-neutral code.
+ */
+ pkeys_total = min_t(int, pkeys_total,
+ ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
+
+ if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
+ static_branch_enable(&pkey_disabled);
+ else
+ static_branch_disable(&pkey_disabled);
+
+ if (static_branch_likely(&pkey_disabled))
+ return 0;
+
+ /*
+ * The device tree cannot be relied to indicate support for
+ * execute_disable support. Instead we use a PVR check.
+ */
+ if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
+ pkey_execute_disable_supported = false;
+ else
+ pkey_execute_disable_supported = true;
+
+#ifdef CONFIG_PPC_4K_PAGES
+ /*
+ * The OS can manage only 8 pkeys due to its inability to represent them
+ * in the Linux 4K PTE.
+ */
+ os_reserved = pkeys_total - 8;
+#else
+ os_reserved = 0;
+#endif
+ /* Bits are in LE format. */
+ reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
+
+ /* register mask is in BE format */
+ pkey_amr_mask = ~0x0ul;
+ pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
+
+ pkey_iamr_mask = ~0x0ul;
+ pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
+ pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+
+ pkey_uamor_mask = ~0x0ul;
+ pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
+ pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
+
+ /* mark the rest of the keys as reserved and hence unavailable */
+ for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
+ reserved_allocation_mask |= (0x1 << i);
+ pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
+ }
+ initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
+
+ if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
+ /*
+ * Insufficient number of keys to support
+ * execute only key. Mark it unavailable.
+ * Any AMR, UAMOR, IAMR bit set for
+ * this key is irrelevant since this key
+ * can never be allocated.
+ */
+ execute_only_key = -1;
+ }
+
+ return 0;
+}
+
+arch_initcall(pkey_initialize);
+
+void pkey_mm_init(struct mm_struct *mm)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+ mm_pkey_allocation_map(mm) = initial_allocation_mask;
+ mm->context.execute_only_pkey = execute_only_key;
+}
+
+static inline u64 read_amr(void)
+{
+ return mfspr(SPRN_AMR);
+}
+
+static inline void write_amr(u64 value)
+{
+ mtspr(SPRN_AMR, value);
+}
+
+static inline u64 read_iamr(void)
+{
+ if (!likely(pkey_execute_disable_supported))
+ return 0x0UL;
+
+ return mfspr(SPRN_IAMR);
+}
+
+static inline void write_iamr(u64 value)
+{
+ if (!likely(pkey_execute_disable_supported))
+ return;
+
+ mtspr(SPRN_IAMR, value);
+}
+
+static inline u64 read_uamor(void)
+{
+ return mfspr(SPRN_UAMOR);
+}
+
+static inline void write_uamor(u64 value)
+{
+ mtspr(SPRN_UAMOR, value);
+}
+
+static bool is_pkey_enabled(int pkey)
+{
+ u64 uamor = read_uamor();
+ u64 pkey_bits = 0x3ul << pkeyshift(pkey);
+ u64 uamor_pkey_bits = (uamor & pkey_bits);
+
+ /*
+ * Both the bits in UAMOR corresponding to the key should be set or
+ * reset.
+ */
+ WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
+ return !!(uamor_pkey_bits);
+}
+
+static inline void init_amr(int pkey, u8 init_bits)
+{
+ u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
+ u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
+
+ write_amr(old_amr | new_amr_bits);
+}
+
+static inline void init_iamr(int pkey, u8 init_bits)
+{
+ u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
+ u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
+
+ write_iamr(old_iamr | new_iamr_bits);
+}
+
+/*
+ * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that
+ * specified in @init_val.
+ */
+int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ u64 new_amr_bits = 0x0ul;
+ u64 new_iamr_bits = 0x0ul;
+
+ if (!is_pkey_enabled(pkey))
+ return -EINVAL;
+
+ if (init_val & PKEY_DISABLE_EXECUTE) {
+ if (!pkey_execute_disable_supported)
+ return -EINVAL;
+ new_iamr_bits |= IAMR_EX_BIT;
+ }
+ init_iamr(pkey, new_iamr_bits);
+
+ /* Set the bits we need in AMR: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
+ else if (init_val & PKEY_DISABLE_WRITE)
+ new_amr_bits |= AMR_WR_BIT;
+
+ init_amr(pkey, new_amr_bits);
+ return 0;
+}
+
+void thread_pkey_regs_save(struct thread_struct *thread)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ /*
+ * TODO: Skip saving registers if @thread hasn't used any keys yet.
+ */
+ thread->amr = read_amr();
+ thread->iamr = read_iamr();
+ thread->uamor = read_uamor();
+}
+
+void thread_pkey_regs_restore(struct thread_struct *new_thread,
+ struct thread_struct *old_thread)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ if (old_thread->amr != new_thread->amr)
+ write_amr(new_thread->amr);
+ if (old_thread->iamr != new_thread->iamr)
+ write_iamr(new_thread->iamr);
+ if (old_thread->uamor != new_thread->uamor)
+ write_uamor(new_thread->uamor);
+}
+
+void thread_pkey_regs_init(struct thread_struct *thread)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ thread->amr = pkey_amr_mask;
+ thread->iamr = pkey_iamr_mask;
+ thread->uamor = pkey_uamor_mask;
+
+ write_uamor(pkey_uamor_mask);
+ write_amr(pkey_amr_mask);
+ write_iamr(pkey_iamr_mask);
+}
+
+static inline bool pkey_allows_readwrite(int pkey)
+{
+ int pkey_shift = pkeyshift(pkey);
+
+ if (!is_pkey_enabled(pkey))
+ return true;
+
+ return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
+}
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+ return mm->context.execute_only_pkey;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+ /* Do this check first since the vm_flags should be hot */
+ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ return false;
+
+ return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
+}
+
+/*
+ * This should only be called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
+ int pkey)
+{
+ /*
+ * If the currently associated pkey is execute-only, but the requested
+ * protection is not execute-only, move it back to the default pkey.
+ */
+ if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
+ return 0;
+
+ /*
+ * The requested protection is execute-only. Hence let's use an
+ * execute-only pkey.
+ */
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(vma->vm_mm);
+ if (pkey > 0)
+ return pkey;
+ }
+
+ /* Nothing to override. */
+ return vma_pkey(vma);
+}
+
+static bool pkey_access_permitted(int pkey, bool write, bool execute)
+{
+ int pkey_shift;
+ u64 amr;
+
+ if (!is_pkey_enabled(pkey))
+ return true;
+
+ pkey_shift = pkeyshift(pkey);
+ if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
+ return true;
+
+ amr = read_amr(); /* Delay reading amr until absolutely needed */
+ return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
+ (write && !(amr & (AMR_WR_BIT << pkey_shift))));
+}
+
+bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return true;
+
+ return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
+}
+
+/*
+ * We only want to enforce protection keys on the current thread because we
+ * effectively have no access to AMR/IAMR for other threads or any way to tell
+ * which AMR/IAMR in a threaded process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current mm, or if we are
+ * in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+ if (!current->mm)
+ return true;
+
+ /* if it is not our ->mm, it has to be foreign */
+ if (current->mm != vma->vm_mm)
+ return true;
+
+ return false;
+}
+
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+ bool execute, bool foreign)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return true;
+ /*
+ * Do not enforce our key-permissions on a foreign vma.
+ */
+ if (foreign || vma_is_foreign(vma))
+ return true;
+
+ return pkey_access_permitted(vma_pkey(vma), write, execute);
+}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/security.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ int psize;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ psize = hstate_get_psize(hstate);
+ radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+ int psize;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ psize = hstate_get_psize(hstate);
+ radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
+}
+
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ int psize;
+ struct hstate *hstate = hstate_file(vma->vm_file);
+
+ psize = hstate_get_psize(hstate);
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+}
+
+/*
+ * A vairant of hugetlb_get_unmapped_area doing topdown search
+ * FIXME!! should we do as x86 does or non hugetlb area does ?
+ * ie, use topdown or not based on mmap_is_legacy check ?
+ */
+unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct hstate *h = hstate_file(file);
+ int fixed = (flags & MAP_FIXED);
+ unsigned long high_limit;
+ struct vm_unmapped_area_info info;
+
+ high_limit = DEFAULT_MAP_WINDOW;
+ if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+ high_limit = TASK_SIZE;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (len > high_limit)
+ return -ENOMEM;
+
+ if (fixed) {
+ if (addr > high_limit - len)
+ return -ENOMEM;
+ if (prepare_hugepage_range(file, addr, len))
+ return -EINVAL;
+ return addr;
+ }
+
+ if (addr) {
+ addr = ALIGN(addr, huge_page_size(h));
+ vma = find_vma(mm, addr);
+ if (high_limit - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ return addr;
+ }
+ /*
+ * We are always doing an topdown search here. Slice code
+ * does that too.
+ */
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+
+ return vm_unmapped_area(&info);
+}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid NMMU hang while relaxing access we need to flush the tlb before
+ * we set the new value.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_hugetlb_page(vma, addr);
+
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
--- /dev/null
+/*
+ * Page table handling routines for radix page table.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "radix-mmu: " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched/mm.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/mm.h>
+#include <linux/string_helpers.h>
+#include <linux/stop_machine.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/firmware.h>
+#include <asm/powernv.h>
+#include <asm/sections.h>
+#include <asm/trace.h>
+#include <asm/uaccess.h>
+
+#include <trace/events/thp.h>
+
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
+static int native_register_process_table(unsigned long base, unsigned long pg_sz,
+ unsigned long table_size)
+{
+ unsigned long patb0, patb1;
+
+ patb0 = be64_to_cpu(partition_tb[0].patb0);
+ patb1 = base | table_size | PATB_GR;
+
+ mmu_partition_table_set_entry(0, patb0, patb1);
+
+ return 0;
+}
+
+static __ref void *early_alloc_pgtable(unsigned long size, int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+ phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
+ void *ptr;
+
+ if (region_start)
+ min_addr = region_start;
+ if (region_end)
+ max_addr = region_end;
+
+ ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
+ __func__, size, size, nid, &min_addr, &max_addr);
+
+ return ptr;
+}
+
+static int early_map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size,
+ int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ unsigned long pfn = pa >> PAGE_SHIFT;
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pgdp = pgd_offset_k(ea);
+ if (pgd_none(*pgdp)) {
+ pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+ region_start, region_end);
+ pgd_populate(&init_mm, pgdp, pudp);
+ }
+ pudp = pud_offset(pgdp, ea);
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ if (pud_none(*pudp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
+ region_start, region_end);
+ pud_populate(&init_mm, pudp, pmdp);
+ }
+ pmdp = pmd_offset(pudp, ea);
+ if (map_page_size == PMD_SIZE) {
+ ptep = pmdp_ptep(pmdp);
+ goto set_the_pte;
+ }
+ if (!pmd_present(*pmdp)) {
+ ptep = early_alloc_pgtable(PAGE_SIZE, nid,
+ region_start, region_end);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+ }
+ ptep = pte_offset_kernel(pmdp, ea);
+
+set_the_pte:
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+ smp_wmb();
+ return 0;
+}
+
+/*
+ * nid, region_start, and region_end are hints to try to place the page
+ * table memory in the same node or region.
+ */
+static int __map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size,
+ int nid,
+ unsigned long region_start, unsigned long region_end)
+{
+ unsigned long pfn = pa >> PAGE_SHIFT;
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ /*
+ * Make sure task size is correct as per the max adddr
+ */
+ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+
+#ifdef CONFIG_PPC_64K_PAGES
+ BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
+#endif
+
+ if (unlikely(!slab_is_available()))
+ return early_map_kernel_page(ea, pa, flags, map_page_size,
+ nid, region_start, region_end);
+
+ /*
+ * Should make page table allocation functions be able to take a
+ * node, so we can place kernel page tables on the right nodes after
+ * boot.
+ */
+ pgdp = pgd_offset_k(ea);
+ pudp = pud_alloc(&init_mm, pgdp, ea);
+ if (!pudp)
+ return -ENOMEM;
+ if (map_page_size == PUD_SIZE) {
+ ptep = (pte_t *)pudp;
+ goto set_the_pte;
+ }
+ pmdp = pmd_alloc(&init_mm, pudp, ea);
+ if (!pmdp)
+ return -ENOMEM;
+ if (map_page_size == PMD_SIZE) {
+ ptep = pmdp_ptep(pmdp);
+ goto set_the_pte;
+ }
+ ptep = pte_alloc_kernel(pmdp, ea);
+ if (!ptep)
+ return -ENOMEM;
+
+set_the_pte:
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
+ smp_wmb();
+ return 0;
+}
+
+int radix__map_kernel_page(unsigned long ea, unsigned long pa,
+ pgprot_t flags,
+ unsigned int map_page_size)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
+}
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void radix__change_memory_range(unsigned long start, unsigned long end,
+ unsigned long clear)
+{
+ unsigned long idx;
+ pgd_t *pgdp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ end = PAGE_ALIGN(end); // aligns up
+
+ pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
+ start, end, clear);
+
+ for (idx = start; idx < end; idx += PAGE_SIZE) {
+ pgdp = pgd_offset_k(idx);
+ pudp = pud_alloc(&init_mm, pgdp, idx);
+ if (!pudp)
+ continue;
+ if (pud_huge(*pudp)) {
+ ptep = (pte_t *)pudp;
+ goto update_the_pte;
+ }
+ pmdp = pmd_alloc(&init_mm, pudp, idx);
+ if (!pmdp)
+ continue;
+ if (pmd_huge(*pmdp)) {
+ ptep = pmdp_ptep(pmdp);
+ goto update_the_pte;
+ }
+ ptep = pte_alloc_kernel(pmdp, idx);
+ if (!ptep)
+ continue;
+update_the_pte:
+ radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
+ }
+
+ radix__flush_tlb_kernel_range(start, end);
+}
+
+void radix__mark_rodata_ro(void)
+{
+ unsigned long start, end;
+
+ start = (unsigned long)_stext;
+ end = (unsigned long)__init_begin;
+
+ radix__change_memory_range(start, end, _PAGE_WRITE);
+}
+
+void radix__mark_initmem_nx(void)
+{
+ unsigned long start = (unsigned long)__init_begin;
+ unsigned long end = (unsigned long)__init_end;
+
+ radix__change_memory_range(start, end, _PAGE_EXEC);
+}
+#endif /* CONFIG_STRICT_KERNEL_RWX */
+
+static inline void __meminit
+print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
+{
+ char buf[10];
+
+ if (end <= start)
+ return;
+
+ string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
+
+ pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
+ exec ? " (exec)" : "");
+}
+
+static unsigned long next_boundary(unsigned long addr, unsigned long end)
+{
+#ifdef CONFIG_STRICT_KERNEL_RWX
+ if (addr < __pa_symbol(__init_begin))
+ return __pa_symbol(__init_begin);
+#endif
+ return end;
+}
+
+static int __meminit create_physical_mapping(unsigned long start,
+ unsigned long end,
+ int nid)
+{
+ unsigned long vaddr, addr, mapping_size = 0;
+ bool prev_exec, exec = false;
+ pgprot_t prot;
+ int psize;
+
+ start = _ALIGN_UP(start, PAGE_SIZE);
+ for (addr = start; addr < end; addr += mapping_size) {
+ unsigned long gap, previous_size;
+ int rc;
+
+ gap = next_boundary(addr, end) - addr;
+ previous_size = mapping_size;
+ prev_exec = exec;
+
+ if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_1G].shift) {
+ mapping_size = PUD_SIZE;
+ psize = MMU_PAGE_1G;
+ } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+ mmu_psize_defs[MMU_PAGE_2M].shift) {
+ mapping_size = PMD_SIZE;
+ psize = MMU_PAGE_2M;
+ } else {
+ mapping_size = PAGE_SIZE;
+ psize = mmu_virtual_psize;
+ }
+
+ vaddr = (unsigned long)__va(addr);
+
+ if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
+ overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
+ prot = PAGE_KERNEL_X;
+ exec = true;
+ } else {
+ prot = PAGE_KERNEL;
+ exec = false;
+ }
+
+ if (mapping_size != previous_size || exec != prev_exec) {
+ print_mapping(start, addr, previous_size, prev_exec);
+ start = addr;
+ }
+
+ rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
+ if (rc)
+ return rc;
+
+ update_page_count(psize, 1);
+ }
+
+ print_mapping(start, addr, mapping_size, exec);
+ return 0;
+}
+
+void __init radix_init_pgtable(void)
+{
+ unsigned long rts_field;
+ struct memblock_region *reg;
+
+ /* We don't support slb for radix */
+ mmu_slb_size = 0;
+ /*
+ * Create the linear mapping, using standard page size for now
+ */
+ for_each_memblock(memory, reg) {
+ /*
+ * The memblock allocator is up at this point, so the
+ * page tables will be allocated within the range. No
+ * need or a node (which we don't have yet).
+ */
+
+ if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ continue;
+ }
+
+ WARN_ON(create_physical_mapping(reg->base,
+ reg->base + reg->size,
+ -1));
+ }
+
+ /* Find out how many PID bits are supported */
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
+ if (!mmu_pid_bits)
+ mmu_pid_bits = 20;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /*
+ * When KVM is possible, we only use the top half of the
+ * PID space to avoid collisions between host and guest PIDs
+ * which can cause problems due to prefetch when exiting the
+ * guest with AIL=3
+ */
+ mmu_base_pid = 1 << (mmu_pid_bits - 1);
+#else
+ mmu_base_pid = 1;
+#endif
+ } else {
+ /* The guest uses the bottom half of the PID space */
+ if (!mmu_pid_bits)
+ mmu_pid_bits = 19;
+ mmu_base_pid = 1;
+ }
+
+ /*
+ * Allocate Partition table and process table for the
+ * host.
+ */
+ BUG_ON(PRTB_SIZE_SHIFT > 36);
+ process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
+ /*
+ * Fill in the process table.
+ */
+ rts_field = radix__get_tree_size();
+ process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
+ /*
+ * Fill in the partition table. We are suppose to use effective address
+ * of process table here. But our linear mapping also enable us to use
+ * physical address here.
+ */
+ register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
+ pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
+ asm volatile("ptesync" : : : "memory");
+ asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+ trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
+
+ /*
+ * The init_mm context is given the first available (non-zero) PID,
+ * which is the "guard PID" and contains no page table. PIDR should
+ * never be set to zero because that duplicates the kernel address
+ * space at the 0x0... offset (quadrant 0)!
+ *
+ * An arbitrary PID that may later be allocated by the PID allocator
+ * for userspace processes must not be used either, because that
+ * would cause stale user mappings for that PID on CPUs outside of
+ * the TLB invalidation scheme (because it won't be in mm_cpumask).
+ *
+ * So permanently carve out one PID for the purpose of a guard PID.
+ */
+ init_mm.context.id = mmu_base_pid;
+ mmu_base_pid++;
+}
+
+static void __init radix_init_partition_table(void)
+{
+ unsigned long rts_field, dw0;
+
+ mmu_partition_table_init();
+ rts_field = radix__get_tree_size();
+ dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
+ mmu_partition_table_set_entry(0, dw0, 0);
+
+ pr_info("Initializing Radix MMU\n");
+ pr_info("Partition table %p\n", partition_tb);
+}
+
+void __init radix_init_native(void)
+{
+ register_process_table = native_register_process_table;
+}
+
+static int __init get_idx_from_shift(unsigned int shift)
+{
+ int idx = -1;
+
+ switch (shift) {
+ case 0xc:
+ idx = MMU_PAGE_4K;
+ break;
+ case 0x10:
+ idx = MMU_PAGE_64K;
+ break;
+ case 0x15:
+ idx = MMU_PAGE_2M;
+ break;
+ case 0x1e:
+ idx = MMU_PAGE_1G;
+ break;
+ }
+ return idx;
+}
+
+static int __init radix_dt_scan_page_sizes(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ int size = 0;
+ int shift, idx;
+ unsigned int ap;
+ const __be32 *prop;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
+
+ /* Find MMU PID size */
+ prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+ if (prop && size == 4)
+ mmu_pid_bits = be32_to_cpup(prop);
+
+ /* Grab page size encodings */
+ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
+ if (!prop)
+ return 0;
+
+ pr_info("Page sizes from device-tree:\n");
+ for (; size >= 4; size -= 4, ++prop) {
+
+ struct mmu_psize_def *def;
+
+ /* top 3 bit is AP encoding */
+ shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
+ ap = be32_to_cpu(prop[0]) >> 29;
+ pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
+
+ idx = get_idx_from_shift(shift);
+ if (idx < 0)
+ continue;
+
+ def = &mmu_psize_defs[idx];
+ def->shift = shift;
+ def->ap = ap;
+ }
+
+ /* needed ? */
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
+ return 1;
+}
+
+void __init radix__early_init_devtree(void)
+{
+ int rc;
+
+ /*
+ * Try to find the available page sizes in the device-tree
+ */
+ rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
+ if (rc != 0) /* Found */
+ goto found;
+ /*
+ * let's assume we have page 4k and 64k support
+ */
+ mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+ mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+
+ mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+ mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+found:
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ if (mmu_psize_defs[MMU_PAGE_2M].shift) {
+ /*
+ * map vmemmap using 2M if available
+ */
+ mmu_vmemmap_psize = MMU_PAGE_2M;
+ }
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+ return;
+}
+
+static void radix_init_amor(void)
+{
+ /*
+ * In HV mode, we init AMOR (Authority Mask Override Register) so that
+ * the hypervisor and guest can setup IAMR (Instruction Authority Mask
+ * Register), enable key 0 and set it to 1.
+ *
+ * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
+ */
+ mtspr(SPRN_AMOR, (3ul << 62));
+}
+
+#ifdef CONFIG_PPC_KUEP
+void setup_kuep(bool disabled)
+{
+ if (disabled || !early_radix_enabled())
+ return;
+
+ if (smp_processor_id() == boot_cpuid)
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+
+ /*
+ * Radix always uses key0 of the IAMR to determine if an access is
+ * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
+ * fetch.
+ */
+ mtspr(SPRN_IAMR, (1ul << 62));
+}
+#endif
+
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
+{
+ if (disabled || !early_radix_enabled())
+ return;
+
+ if (smp_processor_id() == boot_cpuid) {
+ pr_info("Activating Kernel Userspace Access Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
+ }
+
+ /* Make sure userspace can't change the AMR */
+ mtspr(SPRN_UAMOR, 0);
+ mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
+ isync();
+}
+#endif
+
+void __init radix__early_init_mmu(void)
+{
+ unsigned long lpcr;
+
+#ifdef CONFIG_PPC_64K_PAGES
+ /* PAGE_SIZE mappings */
+ mmu_virtual_psize = MMU_PAGE_64K;
+#else
+ mmu_virtual_psize = MMU_PAGE_4K;
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ /* vmemmap mapping */
+ mmu_vmemmap_psize = mmu_virtual_psize;
+#endif
+ /*
+ * initialize page table size
+ */
+ __pte_index_size = RADIX_PTE_INDEX_SIZE;
+ __pmd_index_size = RADIX_PMD_INDEX_SIZE;
+ __pud_index_size = RADIX_PUD_INDEX_SIZE;
+ __pgd_index_size = RADIX_PGD_INDEX_SIZE;
+ __pud_cache_index = RADIX_PUD_INDEX_SIZE;
+ __pte_table_size = RADIX_PTE_TABLE_SIZE;
+ __pmd_table_size = RADIX_PMD_TABLE_SIZE;
+ __pud_table_size = RADIX_PUD_TABLE_SIZE;
+ __pgd_table_size = RADIX_PGD_TABLE_SIZE;
+
+ __pmd_val_bits = RADIX_PMD_VAL_BITS;
+ __pud_val_bits = RADIX_PUD_VAL_BITS;
+ __pgd_val_bits = RADIX_PGD_VAL_BITS;
+
+ __kernel_virt_start = RADIX_KERN_VIRT_START;
+ __vmalloc_start = RADIX_VMALLOC_START;
+ __vmalloc_end = RADIX_VMALLOC_END;
+ __kernel_io_start = RADIX_KERN_IO_START;
+ __kernel_io_end = RADIX_KERN_IO_END;
+ vmemmap = (struct page *)RADIX_VMEMMAP_START;
+ ioremap_bot = IOREMAP_BASE;
+
+#ifdef CONFIG_PCI
+ pci_io_base = ISA_IO_BASE;
+#endif
+ __pte_frag_nr = RADIX_PTE_FRAG_NR;
+ __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = RADIX_PMD_FRAG_NR;
+ __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ radix_init_native();
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+ radix_init_partition_table();
+ radix_init_amor();
+ } else {
+ radix_init_pseries();
+ }
+
+ memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+ radix_init_pgtable();
+ /* Switch to the guard PID before turning on MMU */
+ radix__switch_mmu_context(NULL, &init_mm);
+ if (cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_all();
+}
+
+void radix__early_init_mmu_secondary(void)
+{
+ unsigned long lpcr;
+ /*
+ * update partition table control register and UPRT
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
+
+ mtspr(SPRN_PTCR,
+ __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+ radix_init_amor();
+ }
+
+ radix__switch_mmu_context(NULL, &init_mm);
+ if (cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_all();
+}
+
+void radix__mmu_cleanup_all(void)
+{
+ unsigned long lpcr;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
+ mtspr(SPRN_PTCR, 0);
+ powernv_set_nmmu_ptcr(0);
+ radix__flush_tlb_all();
+ }
+}
+
+void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /*
+ * We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /*
+ * Radix mode is not limited by RMA / VRMA addressing.
+ */
+ ppc64_rma_size = ULONG_MAX;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, pte_start);
+ pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, pmd_start);
+ pud_clear(pud);
+}
+
+struct change_mapping_params {
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long aligned_start;
+ unsigned long aligned_end;
+};
+
+static int __meminit stop_machine_change_mapping(void *data)
+{
+ struct change_mapping_params *params =
+ (struct change_mapping_params *)data;
+
+ if (!data)
+ return -1;
+
+ spin_unlock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, params->aligned_start, params->pte);
+ create_physical_mapping(params->aligned_start, params->start, -1);
+ create_physical_mapping(params->end, params->aligned_end, -1);
+ spin_lock(&init_mm.page_table_lock);
+ return 0;
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pte_t *pte;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+ /*
+ * The vmemmap_free() and remove_section_mapping()
+ * codepaths call us with aligned addresses.
+ */
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+/*
+ * clear the pte and potentially split the mapping helper
+ */
+static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
+ unsigned long size, pte_t *pte)
+{
+ unsigned long mask = ~(size - 1);
+ unsigned long aligned_start = addr & mask;
+ unsigned long aligned_end = addr + size;
+ struct change_mapping_params params;
+ bool split_region = false;
+
+ if ((end - addr) < size) {
+ /*
+ * We're going to clear the PTE, but not flushed
+ * the mapping, time to remap and flush. The
+ * effects if visible outside the processor or
+ * if we are running in code close to the
+ * mapping we cleared, we are in trouble.
+ */
+ if (overlaps_kernel_text(aligned_start, addr) ||
+ overlaps_kernel_text(end, aligned_end)) {
+ /*
+ * Hack, just return, don't pte_clear
+ */
+ WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
+ "text, not splitting\n", addr, end);
+ return;
+ }
+ split_region = true;
+ }
+
+ if (split_region) {
+ params.pte = pte;
+ params.start = addr;
+ params.end = end;
+ params.aligned_start = addr & ~(size - 1);
+ params.aligned_end = min_t(unsigned long, aligned_end,
+ (unsigned long)__va(memblock_end_of_DRAM()));
+ stop_machine(stop_machine_change_mapping, ¶ms, NULL);
+ return;
+ }
+
+ pte_clear(&init_mm, addr, pte);
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pte_t *pte_base;
+ pmd_t *pmd;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_huge(*pmd)) {
+ split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next);
+ free_pte_table(pte_base, pmd);
+ }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+ pmd_t *pmd_base;
+ pud_t *pud;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_huge(*pud)) {
+ split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next);
+ free_pmd_table(pmd_base, pud);
+ }
+}
+
+static void __meminit remove_pagetable(unsigned long start, unsigned long end)
+{
+ unsigned long addr, next;
+ pud_t *pud_base;
+ pgd_t *pgd;
+
+ spin_lock(&init_mm.page_table_lock);
+
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (pgd_huge(*pgd)) {
+ split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+ continue;
+ }
+
+ pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud_base, addr, next);
+ }
+
+ spin_unlock(&init_mm.page_table_lock);
+ radix__flush_tlb_kernel_range(start, end);
+}
+
+int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+{
+ if (end >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ return create_physical_mapping(start, end, nid);
+}
+
+int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end);
+ return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
+ pgprot_t flags, unsigned int map_page_size,
+ int nid)
+{
+ return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
+}
+
+int __meminit radix__vmemmap_create_mapping(unsigned long start,
+ unsigned long page_size,
+ unsigned long phys)
+{
+ /* Create a PTE encoding */
+ unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
+ int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
+ int ret;
+
+ if ((start + page_size) >= RADIX_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
+ BUG_ON(ret);
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
+{
+ remove_pagetable(start, start + page_size);
+}
+#endif
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+#endif
+
+ old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
+ trace_hugepage_update(addr, old, clr, set);
+
+ return old;
+}
+
+pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
+ VM_BUG_ON(pmd_devmap(*pmdp));
+ /*
+ * khugepaged calls this for normal pmd
+ */
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+
+ /*FIXME!! Verify whether we need this kick below */
+ serialize_against_pte_lookup(vma->vm_mm);
+
+ radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
+
+ return pmd;
+}
+
+/*
+ * For us pgtable_t is pte_t *. Inorder to save the deposisted
+ * page table, we consider the allocated page table as a list
+ * head. On withdraw we need to make sure we zero out the used
+ * list_head memory area.
+ */
+void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ struct list_head *lh = (struct list_head *) pgtable;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(lh);
+ else
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pte_t *ptep;
+ pgtable_t pgtable;
+ struct list_head *lh;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ lh = (struct list_head *) pgtable;
+ if (list_empty(lh))
+ pmd_huge_pte(mm, pmdp) = NULL;
+ else {
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+ list_del(lh);
+ }
+ ptep = (pte_t *) pgtable;
+ *ptep = __pte(0);
+ ptep++;
+ *ptep = __pte(0);
+ return pgtable;
+}
+
+pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old_pmd;
+ unsigned long old;
+
+ old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
+ old_pmd = __pmd(old);
+ /*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+ serialize_against_pte_lookup(mm);
+ return old_pmd;
+}
+
+int radix__has_transparent_hugepage(void)
+{
+ /* For radix 2M at PMD level means thp */
+ if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
+ return 1;
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+ pte_t entry, unsigned long address, int psize)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+ _PAGE_RW | _PAGE_EXEC);
+
+ unsigned long change = pte_val(entry) ^ pte_val(*ptep);
+ /*
+ * To avoid NMMU hang while relaxing access, we need mark
+ * the pte invalid in between.
+ */
+ if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
+ unsigned long old_pte, new_pte;
+
+ old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
+ /*
+ * new value of pte
+ */
+ new_pte = old_pte | set;
+ radix__flush_tlb_page_psize(mm, address, psize);
+ __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
+ } else {
+ __radix_pte_update(ptep, 0, set);
+ /*
+ * Book3S does not require a TLB flush when relaxing access
+ * restrictions when the address space is not attached to a
+ * NMMU, because the core MMU will reload the pte after taking
+ * an access fault, which is defined by the architectue.
+ */
+ }
+ /* See ptesync comment in radix__set_pte_at */
+}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid NMMU hang while relaxing access we need to flush the tlb before
+ * we set the new value. We need to do this only for radix, because hash
+ * translation does flush when updating the linux pte.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_tlb_page(vma, addr);
+
+ set_pte_at(mm, addr, ptep, pte);
+}
--- /dev/null
+/*
+ * TLB flush routines for radix kernels.
+ *
+ * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
+
+#include <asm/ppc-opcode.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/trace.h>
+#include <asm/cputhreads.h>
+
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
+/*
+ * tlbiel instruction for radix, set invalidation
+ * i.e., r=1 and is=01 or is=10 or is=11
+ */
+static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
+ unsigned int pid,
+ unsigned int ric, unsigned int prs)
+{
+ unsigned long rb;
+ unsigned long rs;
+
+ rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
+ rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
+ : "memory");
+}
+
+static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
+{
+ unsigned int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and the entire Page Walk Cache
+ * and partition table entries. Then flush the remaining sets of the
+ * TLB.
+ */
+ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+
+ /* Do the same for process scoped entries. */
+ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
+
+ asm volatile("ptesync": : :"memory");
+}
+
+void radix__tlbiel_all(unsigned int action)
+{
+ unsigned int is;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ is = 3;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ is = 2;
+ break;
+ default:
+ BUG();
+ }
+
+ if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+ tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
+ else
+ WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
+
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void __tlbiel_pid(unsigned long pid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie(void)
+{
+ unsigned long pid = 0;
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
+/*
+ * We use 128 set in radix mode and 256 set in hpt mode.
+ */
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+{
+ int set;
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_pid(pid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid(pid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid(pid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie();
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid_guest(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
+}
+
+
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbiel_va(va, pid, ap, ric);
+ asm volatile("ptesync": : :"memory");
+}
+
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ __tlbiel_va_range(start, end, pid, page_size, psize);
+ asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, ric);
+ fixup_tlbie();
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, ric);
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ __tlbie_va_range(start, end, pid, page_size, psize);
+ fixup_tlbie();
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+/*
+ * Base TLB flushing operations:
+ *
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * - local_* variants of page and mm only apply to the current
+ * processor
+ */
+void radix__local_flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned long pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_mm);
+
+#ifndef CONFIG_SMP
+void radix__local_flush_all_mm(struct mm_struct *mm)
+{
+ unsigned long pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__local_flush_all_mm);
+#endif /* CONFIG_SMP */
+
+void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+ int psize)
+{
+ unsigned long pid;
+
+ preempt_disable();
+ pid = mm->context.id;
+ if (pid != MMU_NO_CONTEXT)
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ preempt_enable();
+}
+
+void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ /* need the return fix for nohash.c */
+ if (is_vm_hugetlb_page(vma))
+ return radix__local_flush_hugetlb_page(vma, vmaddr);
+#endif
+ radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+}
+EXPORT_SYMBOL(radix__local_flush_tlb_page);
+
+static bool mm_is_singlethreaded(struct mm_struct *mm)
+{
+ if (atomic_read(&mm->context.copros) > 0)
+ return false;
+ if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
+ return true;
+ return false;
+}
+
+static bool mm_needs_flush_escalation(struct mm_struct *mm)
+{
+ /*
+ * P9 nest MMU has issues with the page walk cache
+ * caching PTEs and not flushing them properly when
+ * RIC = 0 for a PID/LPID invalidate
+ */
+ if (atomic_read(&mm->context.copros) > 0)
+ return true;
+ return false;
+}
+
+#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+ unsigned long pid = mm->context.id;
+
+ if (current->mm == mm)
+ return; /* Local CPU */
+
+ if (current->active_mm == mm) {
+ /*
+ * Must be a kernel thread because sender is single-threaded.
+ */
+ BUG_ON(current->mm);
+ mmgrab(&init_mm);
+ switch_mm(mm, &init_mm, current);
+ current->active_mm = &init_mm;
+ mmdrop(mm);
+ }
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+ /*
+ * Would be nice if this was async so it could be run in
+ * parallel with our local flush, but generic code does not
+ * give a good API for it. Could extend the generic code or
+ * make a special powerpc IPI for flushing TLBs.
+ * For now it's not too performance critical.
+ */
+ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+ (void *)mm, 1);
+ mm_reset_thread_local(mm);
+}
+
+void radix__flush_tlb_mm(struct mm_struct *mm)
+{
+ unsigned long pid;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ /*
+ * Order loads of mm_cpumask vs previous stores to clear ptes before
+ * the invalidate. See barrier in switch_mm_irqs_off
+ */
+ smp_mb();
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+
+ if (mm_needs_flush_escalation(mm))
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ } else {
+local:
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(radix__flush_tlb_mm);
+
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+{
+ unsigned long pid;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (!fullmm) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+ }
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ } else {
+local:
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ }
+ preempt_enable();
+}
+void radix__flush_all_mm(struct mm_struct *mm)
+{
+ __flush_all_mm(mm, false);
+}
+EXPORT_SYMBOL(radix__flush_all_mm);
+
+void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
+{
+ tlb->need_flush_all = 1;
+}
+EXPORT_SYMBOL(radix__flush_tlb_pwc);
+
+void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
+ int psize)
+{
+ unsigned long pid;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+ _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ } else {
+local:
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ }
+ preempt_enable();
+}
+
+void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_page(vma, vmaddr);
+#endif
+ radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+}
+EXPORT_SYMBOL(radix__flush_tlb_page);
+
+#else /* CONFIG_SMP */
+#define radix__flush_all_mm radix__local_flush_all_mm
+#endif /* CONFIG_SMP */
+
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ _tlbie_pid(0, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+
+#define TLB_FLUSH_ALL -1UL
+
+/*
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool flush_all_sizes)
+
+{
+ unsigned long pid;
+ unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool local, full;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (end != TLB_FLUSH_ALL) {
+ exit_flush_lazy_tlbs(mm);
+ goto is_local;
+ }
+ }
+ local = false;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_single_page_flush_ceiling);
+ } else {
+is_local:
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
+ }
+
+ if (full) {
+ if (local) {
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ } else {
+ if (mm_needs_flush_escalation(mm))
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ }
+ } else {
+ bool hflush = flush_all_sizes;
+ bool gflush = flush_all_sizes;
+ unsigned long hstart, hend;
+ unsigned long gstart, gend;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ hflush = true;
+
+ if (hflush) {
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ if (hstart == hend)
+ hflush = false;
+ }
+
+ if (gflush) {
+ gstart = (start + PUD_SIZE - 1) & PUD_MASK;
+ gend = end & PUD_MASK;
+ if (gstart == gend)
+ gflush = false;
+ }
+
+ asm volatile("ptesync": : :"memory");
+ if (local) {
+ __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbiel_va_range(hstart, hend, pid,
+ PMD_SIZE, MMU_PAGE_2M);
+ if (gflush)
+ __tlbiel_va_range(gstart, gend, pid,
+ PUD_SIZE, MMU_PAGE_1G);
+ asm volatile("ptesync": : :"memory");
+ } else {
+ __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbie_va_range(hstart, hend, pid,
+ PMD_SIZE, MMU_PAGE_2M);
+ if (gflush)
+ __tlbie_va_range(gstart, gend, pid,
+ PUD_SIZE, MMU_PAGE_1G);
+ fixup_tlbie();
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ }
+ }
+ preempt_enable();
+}
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+
+ __radix__flush_tlb_range(vma->vm_mm, start, end, false);
+}
+EXPORT_SYMBOL(radix__flush_tlb_range);
+
+static int radix_get_mmu_psize(int page_size)
+{
+ int psize;
+
+ if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
+ psize = mmu_virtual_psize;
+ else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
+ psize = MMU_PAGE_2M;
+ else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
+ psize = MMU_PAGE_1G;
+ else
+ return -1;
+ return psize;
+}
+
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+ unsigned long addr,
+ unsigned long page_size)
+{
+ int psize = radix_get_mmu_psize(page_size);
+
+ _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * Important difference, the guest normally manages its own translations,
+ * but some cases e.g., vCPU CPU migration require KVM to flush.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+ _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize);
+
+void radix__tlb_flush(struct mmu_gather *tlb)
+{
+ int psize = 0;
+ struct mm_struct *mm = tlb->mm;
+ int page_size = tlb->page_size;
+ unsigned long start = tlb->start;
+ unsigned long end = tlb->end;
+
+ /*
+ * if page size is not something we understand, do a full mm flush
+ *
+ * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+ * that flushes the process table entry cache upon process teardown.
+ * See the comment for radix in arch_exit_mmap().
+ */
+ if (tlb->fullmm) {
+ __flush_all_mm(mm, true);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+ } else if (mm_tlb_flush_nested(mm)) {
+ /*
+ * If there is a concurrent invalidation that is clearing ptes,
+ * then it's possible this invalidation will miss one of those
+ * cleared ptes and miss flushing the TLB. If this invalidate
+ * returns before the other one flushes TLBs, that can result
+ * in it returning while there are still valid TLBs inside the
+ * range to be invalidated.
+ *
+ * See mm/memory.c:tlb_finish_mmu() for more details.
+ *
+ * The solution to this is ensure the entire range is always
+ * flushed here. The problem for powerpc is that the flushes
+ * are page size specific, so this "forced flush" would not
+ * do the right thing if there are a mix of page sizes in
+ * the range to be invalidated. So use __flush_tlb_range
+ * which invalidates all possible page sizes in the range.
+ *
+ * PWC flush probably is not be required because the core code
+ * shouldn't free page tables in this path, but accounting
+ * for the possibility makes us a bit more robust.
+ *
+ * need_flush_all is an uncommon case because page table
+ * teardown should be done with exclusive locks held (but
+ * after locks are dropped another invalidate could come
+ * in), it could be optimized further if necessary.
+ */
+ if (!tlb->need_flush_all)
+ __radix__flush_tlb_range(mm, start, end, true);
+ else
+ radix__flush_all_mm(mm);
+#endif
+ } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+ if (!tlb->need_flush_all)
+ radix__flush_tlb_mm(mm);
+ else
+ radix__flush_all_mm(mm);
+ } else {
+ if (!tlb->need_flush_all)
+ radix__flush_tlb_range_psize(mm, start, end, psize);
+ else
+ radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+ }
+ tlb->need_flush_all = 0;
+}
+
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ int psize, bool also_pwc)
+{
+ unsigned long pid;
+ unsigned int page_shift = mmu_psize_defs[psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool local, full;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (end != TLB_FLUSH_ALL) {
+ exit_flush_lazy_tlbs(mm);
+ goto is_local;
+ }
+ }
+ local = false;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_single_page_flush_ceiling);
+ } else {
+is_local:
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
+ }
+
+ if (full) {
+ if (local) {
+ _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ } else {
+ if (mm_needs_flush_escalation(mm))
+ also_pwc = true;
+
+ _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ }
+ } else {
+ if (local)
+ _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
+ else
+ _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+ }
+ preempt_enable();
+}
+
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ __radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long pid, end;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ /* 4k page size, just blow the world */
+ if (PAGE_SIZE == 0x1000) {
+ radix__flush_all_mm(mm);
+ return;
+ }
+
+ end = addr + HPAGE_PMD_SIZE;
+
+ /* Otherwise first do the PWC, then iterate the pages. */
+ preempt_disable();
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ } else {
+local:
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ }
+
+ preempt_enable();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
+}
+EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+ unsigned long rb,prs,r,rs;
+ unsigned long ric = RIC_FLUSH_ALL;
+
+ rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+ asm volatile("ptesync": : :"memory");
+ /*
+ * now flush guest entries by passing PRS = 1 and LPID != 0
+ */
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+ /*
+ * now flush host entires by passing PRS = 0 and LPID == 0
+ */
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+ unsigned long pid = mm->context.id;
+
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ /*
+ * If this context hasn't run on that CPU before and KVM is
+ * around, there's a slim chance that the guest on another
+ * CPU just brought in obsolete translation into the TLB of
+ * this CPU due to a bad prefetch using the guest PID on
+ * the way into the hypervisor.
+ *
+ * We work around this here. If KVM is possible, we check if
+ * any sibling thread is in KVM. If it is, the window may exist
+ * and thus we flush that PID from the core.
+ *
+ * A potential future improvement would be to mark which PIDs
+ * have never been used on the system and avoid it if the PID
+ * is new and the process has no other cpumask bit set.
+ */
+ if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+ int cpu = smp_processor_id();
+ int sib = cpu_first_thread_sibling(cpu);
+ bool flush = false;
+
+ for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+ if (sib == cpu)
+ continue;
+ if (!cpu_possible(sib))
+ continue;
+ if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
+ flush = true;
+ }
+ if (flush)
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ }
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
--- /dev/null
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/asm-prototypes.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
+#include <linux/compiler.h>
+#include <linux/context_tracking.h>
+#include <linux/mm_types.h>
+
+#include <asm/udbg.h>
+#include <asm/code-patching.h>
+
+enum slb_index {
+ LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
+ KSTACK_INDEX = 1, /* Kernel stack map */
+};
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
+
+#define slb_esid_mask(ssize) \
+ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+ enum slb_index index)
+{
+ return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
+}
+
+static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
+ unsigned long flags)
+{
+ return (vsid << slb_vsid_shift(ssize)) | flags |
+ ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+ unsigned long flags)
+{
+ return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
+}
+
+static void assert_slb_presence(bool present, unsigned long ea)
+{
+#ifdef CONFIG_DEBUG_VM
+ unsigned long tmp;
+
+ WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
+
+ /*
+ * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
+ * ignores all other bits from 0-27, so just clear them all.
+ */
+ ea &= ~((1UL << 28) - 1);
+ asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
+
+ WARN_ON(present == (tmp == 0));
+#endif
+}
+
+static inline void slb_shadow_update(unsigned long ea, int ssize,
+ unsigned long flags,
+ enum slb_index index)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ /*
+ * Clear the ESID first so the entry is not valid while we are
+ * updating it. No write barriers are needed here, provided
+ * we only update the current CPU's SLB shadow buffer.
+ */
+ WRITE_ONCE(p->save_area[index].esid, 0);
+ WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+ WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
+}
+
+static inline void slb_shadow_clear(enum slb_index index)
+{
+ WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
+}
+
+static inline void create_shadowed_slbe(unsigned long ea, int ssize,
+ unsigned long flags,
+ enum slb_index index)
+{
+ /*
+ * Updating the shadow buffer before writing the SLB ensures
+ * we don't get a stale entry here if we get preempted by PHYP
+ * between these two statements.
+ */
+ slb_shadow_update(ea, ssize, flags, index);
+
+ assert_slb_presence(false, ea);
+ asm volatile("slbmte %0,%1" :
+ : "r" (mk_vsid_data(ea, ssize, flags)),
+ "r" (mk_esid_data(ea, ssize, index))
+ : "memory" );
+}
+
+/*
+ * Insert bolted entries into SLB (which may not be empty, so don't clear
+ * slb_cache_ptr).
+ */
+void __slb_restore_bolted_realmode(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ enum slb_index index;
+
+ /* No isync needed because realmode. */
+ for (index = 0; index < SLB_NUM_BOLTED; index++) {
+ asm volatile("slbmte %0,%1" :
+ : "r" (be64_to_cpu(p->save_area[index].vsid)),
+ "r" (be64_to_cpu(p->save_area[index].esid)));
+ }
+
+ assert_slb_presence(true, local_paca->kstack);
+}
+
+/*
+ * Insert the bolted entries into an empty SLB.
+ */
+void slb_restore_bolted_realmode(void)
+{
+ __slb_restore_bolted_realmode();
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+/*
+ * This flushes non-bolted entries, it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ asm volatile("isync\n"
+ "slbia\n"
+ "slbmte %0, %1\n"
+ "isync\n"
+ :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
+ "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
+ : "memory");
+ assert_slb_presence(true, get_paca()->kstack);
+
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+ int i;
+ unsigned long e, v;
+
+ /* Save slb_cache_ptr value. */
+ get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+ if (!slb_ptr)
+ return;
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+ asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+ slb_ptr->esid = e;
+ slb_ptr->vsid = v;
+ slb_ptr++;
+ }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+ int i, n;
+ unsigned long e, v;
+ unsigned long llp;
+
+ if (!slb_ptr)
+ return;
+
+ pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+ pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ e = slb_ptr->esid;
+ v = slb_ptr->vsid;
+ slb_ptr++;
+
+ if (!e && !v)
+ continue;
+
+ pr_err("%02d %016lx %016lx\n", i, e, v);
+
+ if (!(e & SLB_ESID_V)) {
+ pr_err("\n");
+ continue;
+ }
+ llp = v & SLB_VSID_LLP;
+ if (v & SLB_VSID_B_1T) {
+ pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID_1T(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+ } else {
+ pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+ }
+ }
+ pr_err("----------------------------------\n");
+
+ /* Dump slb cache entires as well. */
+ pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+ pr_err("Valid SLB cache entries:\n");
+ n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+ for (i = 0; i < n; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ pr_err("Rest of SLB cache entries:\n");
+ for (i = n; i < SLB_CACHE_ENTRIES; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+}
+
+void slb_vmalloc_update(void)
+{
+ /*
+ * vmalloc is not bolted, so just have to flush non-bolted.
+ */
+ slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+ unsigned char i;
+
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ if (esid == ti->slb_preload_esid[idx])
+ return true;
+ }
+ return false;
+}
+
+static bool preload_add(struct thread_info *ti, unsigned long ea)
+{
+ unsigned char idx;
+ unsigned long esid;
+
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+ /* EAs are stored >> 28 so 256MB segments don't need clearing */
+ if (ea & ESID_MASK_1T)
+ ea &= ESID_MASK_1T;
+ }
+
+ esid = ea >> SID_SHIFT;
+
+ if (preload_hit(ti, esid))
+ return false;
+
+ idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+ ti->slb_preload_esid[idx] = esid;
+ if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+ else
+ ti->slb_preload_nr++;
+
+ return true;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+ if (!ti->slb_preload_nr)
+ return;
+ ti->slb_preload_nr--;
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+void slb_setup_new_exec(void)
+{
+ struct thread_info *ti = current_thread_info();
+ struct mm_struct *mm = current->mm;
+ unsigned long exec = 0x10000000;
+
+ WARN_ON(irqs_disabled());
+
+ /*
+ * preload cache can only be used to determine whether a SLB
+ * entry exists if it does not start to overflow.
+ */
+ if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
+ return;
+
+ hard_irq_disable();
+
+ /*
+ * We have no good place to clear the slb preload cache on exec,
+ * flush_thread is about the earliest arch hook but that happens
+ * after we switch to the mm and have aleady preloaded the SLBEs.
+ *
+ * For the most part that's probably okay to use entries from the
+ * previous exec, they will age out if unused. It may turn out to
+ * be an advantage to clear the cache before switching to it,
+ * however.
+ */
+
+ /*
+ * preload some userspace segments into the SLB.
+ * Almost all 32 and 64bit PowerPC executables are linked at
+ * 0x10000000 so it makes sense to preload this segment.
+ */
+ if (!is_kernel_addr(exec)) {
+ if (preload_add(ti, exec))
+ slb_allocate_user(mm, exec);
+ }
+
+ /* Libraries and mmaps. */
+ if (!is_kernel_addr(mm->mmap_base)) {
+ if (preload_add(ti, mm->mmap_base))
+ slb_allocate_user(mm, mm->mmap_base);
+ }
+
+ /* see switch_slb */
+ asm volatile("isync" : : : "memory");
+
+ local_irq_enable();
+}
+
+void preload_new_slb_context(unsigned long start, unsigned long sp)
+{
+ struct thread_info *ti = current_thread_info();
+ struct mm_struct *mm = current->mm;
+ unsigned long heap = mm->start_brk;
+
+ WARN_ON(irqs_disabled());
+
+ /* see above */
+ if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
+ return;
+
+ hard_irq_disable();
+
+ /* Userspace entry address. */
+ if (!is_kernel_addr(start)) {
+ if (preload_add(ti, start))
+ slb_allocate_user(mm, start);
+ }
+
+ /* Top of stack, grows down. */
+ if (!is_kernel_addr(sp)) {
+ if (preload_add(ti, sp))
+ slb_allocate_user(mm, sp);
+ }
+
+ /* Bottom of heap, grows up. */
+ if (heap && !is_kernel_addr(heap)) {
+ if (preload_add(ti, heap))
+ slb_allocate_user(mm, heap);
+ }
+
+ /* see switch_slb */
+ asm volatile("isync" : : : "memory");
+
+ local_irq_enable();
+}
+
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct thread_info *ti = task_thread_info(tsk);
+ unsigned char i;
+
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possible cause an SLB miss
+ * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+ */
+ hard_irq_disable();
+ asm volatile("isync" : : : "memory");
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+ * associated lookaside structures, which matches what
+ * switch_slb wants. So ARCH_300 does not use the slb
+ * cache.
+ */
+ asm volatile(PPC_SLBIA(3));
+ } else {
+ unsigned long offset = get_paca()->slb_cache_ptr;
+
+ if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+ offset <= SLB_CACHE_ENTRIES) {
+ unsigned long slbie_data = 0;
+
+ for (i = 0; i < offset; i++) {
+ unsigned long ea;
+
+ ea = (unsigned long)
+ get_paca()->slb_cache[i] << SID_SHIFT;
+ /*
+ * Could assert_slb_presence(true) here, but
+ * hypervisor or machine check could have come
+ * in and removed the entry at this point.
+ */
+
+ slbie_data = ea;
+ slbie_data |= user_segment_size(slbie_data)
+ << SLBIE_SSIZE_SHIFT;
+ slbie_data |= SLBIE_C; /* user slbs have C=1 */
+ asm volatile("slbie %0" : : "r" (slbie_data));
+ }
+
+ /* Workaround POWER5 < DD2.1 issue */
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+ asm volatile("slbie %0" : : "r" (slbie_data));
+
+ } else {
+ struct slb_shadow *p = get_slb_shadow();
+ unsigned long ksp_esid_data =
+ be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+ unsigned long ksp_vsid_data =
+ be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+ asm volatile(PPC_SLBIA(1) "\n"
+ "slbmte %0,%1\n"
+ "isync"
+ :: "r"(ksp_vsid_data),
+ "r"(ksp_esid_data));
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ }
+
+ get_paca()->slb_cache_ptr = 0;
+ }
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+ copy_mm_to_paca(mm);
+
+ /*
+ * We gradually age out SLBs after a number of context switches to
+ * reduce reload overhead of unused entries (like we do with FP/VEC
+ * reload). Each time we wrap 256 switches, take an entry out of the
+ * SLB preload cache.
+ */
+ tsk->thread.load_slb++;
+ if (!tsk->thread.load_slb) {
+ unsigned long pc = KSTK_EIP(tsk);
+
+ preload_age(ti);
+ preload_add(ti, pc);
+ }
+
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+ unsigned long ea;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+ slb_allocate_user(mm, ea);
+ }
+
+ /*
+ * Synchronize slbmte preloads with possible subsequent user memory
+ * address accesses by the kernel (user mode won't happen until
+ * rfid, which is safe).
+ */
+ asm volatile("isync" : : : "memory");
+}
+
+void slb_set_size(u16 size)
+{
+ mmu_slb_size = size;
+}
+
+void slb_initialize(void)
+{
+ unsigned long linear_llp, vmalloc_llp, io_llp;
+ unsigned long lflags;
+ static int slb_encoding_inited;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ unsigned long vmemmap_llp;
+#endif
+
+ /* Prepare our SLB miss handler based on our page size */
+ linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+ io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+ vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ if (!slb_encoding_inited) {
+ slb_encoding_inited = 1;
+ pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
+ pr_devel("SLB: io LLP = %04lx\n", io_llp);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+ }
+
+ get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+ lflags = SLB_VSID_KERNEL | linear_llp;
+
+ /* Invalidate the entire SLB (even entry 0) & all the ERATS */
+ asm volatile("isync":::"memory");
+ asm volatile("slbmte %0,%0"::"r" (0) : "memory");
+ asm volatile("isync; slbia; isync":::"memory");
+ create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+
+ /*
+ * For the boot cpu, we're running on the stack in init_thread_union,
+ * which is in the first segment of the linear mapping, and also
+ * get_paca()->kstack hasn't been initialized yet.
+ * For secondary cpus, we need to bolt the kernel stack entry now.
+ */
+ slb_shadow_clear(KSTACK_INDEX);
+ if (raw_smp_processor_id() != boot_cpuid &&
+ (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+ create_shadowed_slbe(get_paca()->kstack,
+ mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+ asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+ int slb_cache_index;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return; /* ISAv3.0B and later does not use slb_cache */
+
+ /*
+ * Now update slb cache entries
+ */
+ slb_cache_index = local_paca->slb_cache_ptr;
+ if (slb_cache_index < SLB_CACHE_ENTRIES) {
+ /*
+ * We have space in slb cache for optimized switch_slb().
+ * Top 36 bits from esid_data as per ISA
+ */
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+ local_paca->slb_cache_ptr++;
+ } else {
+ /*
+ * Our cache is full and the current cache content strictly
+ * doesn't indicate the active SLB conents. Bump the ptr
+ * so that switch_slb() will ignore the cache.
+ */
+ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+ }
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+ enum slb_index index;
+
+ /*
+ * The allocation bitmaps can become out of synch with the SLB
+ * when the _switch code does slbie when bolting a new stack
+ * segment and it must not be anywhere else in the SLB. This leaves
+ * a kernel allocated entry that is unused in the SLB. With very
+ * large systems or small segment sizes, the bitmaps could slowly
+ * fill with these entries. They will eventually be cleared out
+ * by the round robin allocator in that case, so it's probably not
+ * worth accounting for.
+ */
+
+ /*
+ * SLBs beyond 32 entries are allocated with stab_rr only
+ * POWER7/8/9 have 32 SLB entries, this could be expanded if a
+ * future CPU has more.
+ */
+ if (local_paca->slb_used_bitmap != U32_MAX) {
+ index = ffz(local_paca->slb_used_bitmap);
+ local_paca->slb_used_bitmap |= 1U << index;
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ } else {
+ /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+ index = local_paca->stab_rr;
+ if (index < (mmu_slb_size - 1))
+ index++;
+ else
+ index = SLB_NUM_BOLTED;
+ local_paca->stab_rr = index;
+ if (index < 32) {
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ else
+ local_paca->slb_kern_bitmap &= ~(1U << index);
+ }
+ }
+ BUG_ON(index < SLB_NUM_BOLTED);
+
+ return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+ unsigned long flags, int ssize, bool kernel)
+{
+ unsigned long vsid;
+ unsigned long vsid_data, esid_data;
+ enum slb_index index;
+
+ vsid = get_vsid(context, ea, ssize);
+ if (!vsid)
+ return -EFAULT;
+
+ /*
+ * There must not be a kernel SLB fault in alloc_slb_index or before
+ * slbmte here or the allocation bitmaps could get out of whack with
+ * the SLB.
+ *
+ * User SLB faults or preloads take this path which might get inlined
+ * into the caller, so add compiler barriers here to ensure unsafe
+ * memory accesses do not come between.
+ */
+ barrier();
+
+ index = alloc_slb_index(kernel);
+
+ vsid_data = __mk_vsid_data(vsid, ssize, flags);
+ esid_data = mk_esid_data(ea, ssize, index);
+
+ /*
+ * No need for an isync before or after this slbmte. The exception
+ * we enter with and the rfid we exit with are context synchronizing.
+ * User preloads should add isync afterwards in case the kernel
+ * accesses user memory before it returns to userspace with rfid.
+ */
+ assert_slb_presence(false, ea);
+ asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+ barrier();
+
+ if (!kernel)
+ slb_cache_update(esid_data);
+
+ return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+ unsigned long context;
+ unsigned long flags;
+ int ssize;
+
+ if (id == LINEAR_MAP_REGION_ID) {
+
+ /* We only support upto MAX_PHYSMEM_BITS */
+ if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ } else if (id == VMEMMAP_REGION_ID) {
+
+ if (ea >= H_VMEMMAP_END)
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ } else if (id == VMALLOC_REGION_ID) {
+
+ if (ea >= H_VMALLOC_END)
+ return -EFAULT;
+
+ flags = local_paca->vmalloc_sllp;
+
+ } else if (id == IO_REGION_ID) {
+
+ if (ea >= H_KERN_IO_END)
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+ } else {
+ return -EFAULT;
+ }
+
+ ssize = MMU_SEGSIZE_1T;
+ if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+ ssize = MMU_SEGSIZE_256M;
+
+ context = get_kernel_context(ea);
+
+ return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+ unsigned long context;
+ unsigned long flags;
+ int bpsize;
+ int ssize;
+
+ /*
+ * consider this as bad access if we take a SLB miss
+ * on an address above addr limit.
+ */
+ if (ea >= mm_ctx_slb_addr_limit(&mm->context))
+ return -EFAULT;
+
+ context = get_user_context(&mm->context, ea);
+ if (!context)
+ return -EFAULT;
+
+ if (unlikely(ea >= H_PGTABLE_RANGE)) {
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ ssize = user_segment_size(ea);
+
+ bpsize = get_slice_psize(mm, ea);
+ flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+ return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+{
+ unsigned long id = get_region_id(ea);
+
+ /* IRQs are not reconciled here, so can't check irqs_disabled */
+ VM_WARN_ON(mfmsr() & MSR_EE);
+
+ if (unlikely(!(regs->msr & MSR_RI)))
+ return -EINVAL;
+
+ /*
+ * SLB kernel faults must be very careful not to touch anything
+ * that is not bolted. E.g., PACA and global variables are okay,
+ * mm->context stuff is not.
+ *
+ * SLB user faults can access all of kernel memory, but must be
+ * careful not to touch things like IRQ state because it is not
+ * "reconciled" here. The difficulty is that we must use
+ * fast_exception_return to return from kernel SLB faults without
+ * looking at possible non-bolted memory. We could test user vs
+ * kernel faults in the interrupt handler asm and do a full fault,
+ * reconcile, ret_from_except for user faults which would make them
+ * first class kernel code. But for performance it's probably nicer
+ * if they go via fast_exception_return too.
+ */
+ if (id >= LINEAR_MAP_REGION_ID) {
+ long err;
+#ifdef CONFIG_DEBUG_VM
+ /* Catch recursive kernel SLB faults. */
+ BUG_ON(local_paca->in_kernel_slb_handler);
+ local_paca->in_kernel_slb_handler = 1;
+#endif
+ err = slb_allocate_kernel(ea, id);
+#ifdef CONFIG_DEBUG_VM
+ local_paca->in_kernel_slb_handler = 0;
+#endif
+ return err;
+ } else {
+ struct mm_struct *mm = current->mm;
+ long err;
+
+ if (unlikely(!mm))
+ return -EFAULT;
+
+ err = slb_allocate_user(mm, ea);
+ if (!err)
+ preload_add(current_thread_info(), ea);
+
+ return err;
+ }
+}
+
+void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
+{
+ if (err == -EFAULT) {
+ if (user_mode(regs))
+ _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
+ else
+ bad_page_fault(regs, ea, SIGSEGV);
+ } else if (err == -EINVAL) {
+ unrecoverable_exception(regs);
+ } else {
+ BUG();
+ }
+}
--- /dev/null
+/*
+ * Copyright 2007-2008 Paul Mackerras, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+
+/*
+ * Free all pages allocated for subpage protection maps and pointers.
+ * Also makes sure that the subpage_prot_table structure is
+ * reinitialized for the next user.
+ */
+void subpage_prot_free(struct mm_struct *mm)
+{
+ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+ unsigned long i, j, addr;
+ u32 **p;
+
+ if (!spt)
+ return;
+
+ for (i = 0; i < 4; ++i) {
+ if (spt->low_prot[i]) {
+ free_page((unsigned long)spt->low_prot[i]);
+ spt->low_prot[i] = NULL;
+ }
+ }
+ addr = 0;
+ for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
+ p = spt->protptrs[i];
+ if (!p)
+ continue;
+ spt->protptrs[i] = NULL;
+ for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
+ ++j, addr += PAGE_SIZE)
+ if (p[j])
+ free_page((unsigned long)p[j]);
+ free_page((unsigned long)p);
+ }
+ spt->maxaddr = 0;
+ kfree(spt);
+}
+
+static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
+ int npages)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd))
+ return;
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud))
+ return;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return;
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; npages > 0; --npages) {
+ pte_update(mm, addr, pte, 0, 0, 0);
+ addr += PAGE_SIZE;
+ ++pte;
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Clear the subpage protection map for an address range, allowing
+ * all accesses that are allowed by the pte permissions.
+ */
+static void subpage_prot_clear(unsigned long addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ struct subpage_prot_table *spt;
+ u32 **spm, *spp;
+ unsigned long i;
+ size_t nw;
+ unsigned long next, limit;
+
+ down_write(&mm->mmap_sem);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt)
+ goto err_out;
+
+ limit = addr + len;
+ if (limit > spt->maxaddr)
+ limit = spt->maxaddr;
+ for (; addr < limit; addr = next) {
+ next = pmd_addr_end(addr, limit);
+ if (addr < 0x100000000UL) {
+ spm = spt->low_prot;
+ } else {
+ spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+ if (!spm)
+ continue;
+ }
+ spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+ if (!spp)
+ continue;
+ spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+ i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ nw = PTRS_PER_PTE - i;
+ if (addr + (nw << PAGE_SHIFT) > next)
+ nw = (next - addr) >> PAGE_SHIFT;
+
+ memset(spp, 0, nw * sizeof(u32));
+
+ /* now flush any existing HPTEs for the range */
+ hpte_flush_range(mm, addr, nw);
+ }
+
+err_out:
+ up_write(&mm->mmap_sem);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ split_huge_pmd(vma, pmd, addr);
+ return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+ struct mm_walk subpage_proto_walk = {
+ .mm = mm,
+ .pmd_entry = subpage_walk_pmd_entry,
+ };
+
+ /*
+ * We don't try too hard, we just mark all the vma in that range
+ * VM_NOHUGEPAGE and split them.
+ */
+ vma = find_vma(mm, addr);
+ /*
+ * If the range is in unmapped range, just return
+ */
+ if (vma && ((addr + len) <= vma->vm_start))
+ return;
+
+ while (vma) {
+ if (vma->vm_start >= (addr + len))
+ break;
+ vma->vm_flags |= VM_NOHUGEPAGE;
+ walk_page_vma(vma, &subpage_proto_walk);
+ vma = vma->vm_next;
+ }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ return;
+}
+#endif
+
+/*
+ * Copy in a subpage protection map for an address range.
+ * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
+ * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
+ * 2 or 3 to prevent all accesses.
+ * Note that the normal page protections also apply; the subpage
+ * protection mechanism is an additional constraint, so putting 0
+ * in a 2-bit field won't allow writes to a page that is otherwise
+ * write-protected.
+ */
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+ unsigned long, len, u32 __user *, map)
+{
+ struct mm_struct *mm = current->mm;
+ struct subpage_prot_table *spt;
+ u32 **spm, *spp;
+ unsigned long i;
+ size_t nw;
+ unsigned long next, limit;
+ int err;
+
+ if (radix_enabled())
+ return -ENOENT;
+
+ /* Check parameters */
+ if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
+ addr >= mm->task_size || len >= mm->task_size ||
+ addr + len > mm->task_size)
+ return -EINVAL;
+
+ if (is_hugepage_only_range(mm, addr, len))
+ return -EINVAL;
+
+ if (!map) {
+ /* Clear out the protection map for the address range */
+ subpage_prot_clear(addr, len);
+ return 0;
+ }
+
+ if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
+ return -EFAULT;
+
+ down_write(&mm->mmap_sem);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt) {
+ /*
+ * Allocate subpage prot table if not already done.
+ * Do this with mmap_sem held
+ */
+ spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
+ if (!spt) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mm->context.hash_context->spt = spt;
+ }
+
+ subpage_mark_vma_nohuge(mm, addr, len);
+ for (limit = addr + len; addr < limit; addr = next) {
+ next = pmd_addr_end(addr, limit);
+ err = -ENOMEM;
+ if (addr < 0x100000000UL) {
+ spm = spt->low_prot;
+ } else {
+ spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+ if (!spm) {
+ spm = (u32 **)get_zeroed_page(GFP_KERNEL);
+ if (!spm)
+ goto out;
+ spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
+ }
+ }
+ spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
+ spp = *spm;
+ if (!spp) {
+ spp = (u32 *)get_zeroed_page(GFP_KERNEL);
+ if (!spp)
+ goto out;
+ *spm = spp;
+ }
+ spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+ local_irq_disable();
+ demote_segment_4k(mm, addr);
+ local_irq_enable();
+
+ i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ nw = PTRS_PER_PTE - i;
+ if (addr + (nw << PAGE_SHIFT) > next)
+ nw = (next - addr) >> PAGE_SHIFT;
+
+ up_write(&mm->mmap_sem);
+ if (__copy_from_user(spp, map, nw * sizeof(u32)))
+ return -EFAULT;
+ map += nw;
+ down_write(&mm->mmap_sem);
+
+ /* now flush any existing HPTEs for the range */
+ hpte_flush_range(mm, addr, nw);
+ }
+ if (limit > spt->maxaddr)
+ spt->maxaddr = limit;
+ err = 0;
+ out:
+ up_write(&mm->mmap_sem);
+ return err;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/byteorder.h>
+#include "vphn.h"
+
+/*
+ * The associativity domain numbers are returned from the hypervisor as a
+ * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
+ * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
+ * bytes.
+ *
+ * --- 16-bit fields -->
+ * _________________________
+ * | 0 | 1 | 2 | 3 | be_packed[0]
+ * ------+-----+-----+------
+ * _________________________
+ * | 4 | 5 | 6 | 7 | be_packed[1]
+ * -------------------------
+ * ...
+ * _________________________
+ * | 20 | 21 | 22 | 23 | be_packed[5]
+ * -------------------------
+ *
+ * Convert to the sequence they would appear in the ibm,associativity property.
+ */
+int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
+{
+ __be64 be_packed[VPHN_REGISTER_COUNT];
+ int i, nr_assoc_doms = 0;
+ const __be16 *field = (const __be16 *) be_packed;
+ u16 last = 0;
+ bool is_32bit = false;
+
+#define VPHN_FIELD_UNUSED (0xffff)
+#define VPHN_FIELD_MSB (0x8000)
+#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
+
+ /* Let's fix the values returned by plpar_hcall9() */
+ for (i = 0; i < VPHN_REGISTER_COUNT; i++)
+ be_packed[i] = cpu_to_be64(packed[i]);
+
+ for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+ u16 new = be16_to_cpup(field++);
+
+ if (is_32bit) {
+ /*
+ * Let's concatenate the 16 bits of this field to the
+ * 15 lower bits of the previous field
+ */
+ unpacked[++nr_assoc_doms] =
+ cpu_to_be32(last << 16 | new);
+ is_32bit = false;
+ } else if (new == VPHN_FIELD_UNUSED)
+ /* This is the list terminator */
+ break;
+ else if (new & VPHN_FIELD_MSB) {
+ /* Data is in the lower 15 bits of this field */
+ unpacked[++nr_assoc_doms] =
+ cpu_to_be32(new & VPHN_FIELD_MASK);
+ } else {
+ /*
+ * Data is in the lower 15 bits of this field
+ * concatenated with the next 16 bit field
+ */
+ last = new;
+ is_32bit = true;
+ }
+ }
+
+ /* The first cell contains the length of the property */
+ unpacked[0] = cpu_to_be32(nr_assoc_doms);
+
+ return nr_assoc_doms;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_POWERPC_MM_VPHN_H_
+#define _ARCH_POWERPC_MM_VPHN_H_
+
+/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */
+#define VPHN_REGISTER_COUNT 6
+
+/*
+ * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
+ * form the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
+
+extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
+
+#endif
+++ /dev/null
-/*
- * Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-#include <linux/mm.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-
-int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
- pte_t *ptep, unsigned long trap, unsigned long flags,
- int ssize, int subpg_prot)
-{
- real_pte_t rpte;
- unsigned long hpte_group;
- unsigned long rflags, pa;
- unsigned long old_pte, new_pte;
- unsigned long vpn, hash, slot;
- unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
-
- /*
- * atomically mark the linux large page PTE busy and dirty
- */
- do {
- pte_t pte = READ_ONCE(*ptep);
-
- old_pte = pte_val(pte);
- /* If PTE busy, retry the access */
- if (unlikely(old_pte & H_PAGE_BUSY))
- return 0;
- /* If PTE permissions don't match, take page fault */
- if (unlikely(!check_pte_access(access, old_pte)))
- return 1;
- /*
- * Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access. Since this is 4K insert of 64K page size
- * also add H_PAGE_COMBO
- */
- new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_WRITE)
- new_pte |= _PAGE_DIRTY;
- } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
- /*
- * PP bits. _PAGE_USER is already PP bit 0x2, so we only
- * need to add in 0x1 if it's a read-only user page
- */
- rflags = htab_convert_pte_flags(new_pte);
- rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
-
- if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
- !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
- vpn = hpt_vpn(ea, vsid, ssize);
- if (unlikely(old_pte & H_PAGE_HASHPTE)) {
- /*
- * There MIGHT be an HPTE for this pte
- */
- unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize,
- rpte, 0);
-
- if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K,
- MMU_PAGE_4K, ssize, flags) == -1)
- old_pte &= ~_PAGE_HPTEFLAGS;
- }
-
- if (likely(!(old_pte & H_PAGE_HASHPTE))) {
-
- pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
- hash = hpt_hash(vpn, shift, ssize);
-
-repeat:
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- /* Insert into the hash table, primary slot */
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
- MMU_PAGE_4K, MMU_PAGE_4K, ssize);
- /*
- * Primary is full, try the secondary
- */
- if (unlikely(slot == -1)) {
- hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
- rflags,
- HPTE_V_SECONDARY,
- MMU_PAGE_4K,
- MMU_PAGE_4K, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = (hash & htab_hash_mask) *
- HPTES_PER_GROUP;
- mmu_hash_ops.hpte_remove(hpte_group);
- /*
- * FIXME!! Should be try the group from which we removed ?
- */
- goto repeat;
- }
- }
- /*
- * Hypervisor failure. Restore old pte and return -1
- * similar to __hash_page_*
- */
- if (unlikely(slot == -2)) {
- *ptep = __pte(old_pte);
- hash_failure_debug(ea, access, vsid, trap, ssize,
- MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
- return -1;
- }
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
- new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
- }
- *ptep = __pte(new_pte & ~H_PAGE_BUSY);
- return 0;
-}
+++ /dev/null
-/*
- * Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-#include <linux/mm.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-
-/*
- * Return true, if the entry has a slot value which
- * the software considers as invalid.
- */
-static inline bool hpte_soft_invalid(unsigned long hidx)
-{
- return ((hidx & 0xfUL) == 0xfUL);
-}
-
-/*
- * index from 0 - 15
- */
-bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
-{
- return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index)));
-}
-
-int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
- pte_t *ptep, unsigned long trap, unsigned long flags,
- int ssize, int subpg_prot)
-{
- real_pte_t rpte;
- unsigned long hpte_group;
- unsigned int subpg_index;
- unsigned long rflags, pa;
- unsigned long old_pte, new_pte, subpg_pte;
- unsigned long vpn, hash, slot, gslot;
- unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
-
- /*
- * atomically mark the linux large page PTE busy and dirty
- */
- do {
- pte_t pte = READ_ONCE(*ptep);
-
- old_pte = pte_val(pte);
- /* If PTE busy, retry the access */
- if (unlikely(old_pte & H_PAGE_BUSY))
- return 0;
- /* If PTE permissions don't match, take page fault */
- if (unlikely(!check_pte_access(access, old_pte)))
- return 1;
- /*
- * Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access. Since this is 4K insert of 64K page size
- * also add H_PAGE_COMBO
- */
- new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
- if (access & _PAGE_WRITE)
- new_pte |= _PAGE_DIRTY;
- } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
- /*
- * Handle the subpage protection bits
- */
- subpg_pte = new_pte & ~subpg_prot;
- rflags = htab_convert_pte_flags(subpg_pte);
-
- if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
- !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
-
- /*
- * No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case
- */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
- }
-
- subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
- vpn = hpt_vpn(ea, vsid, ssize);
- rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
- /*
- *None of the sub 4k page is hashed
- */
- if (!(old_pte & H_PAGE_HASHPTE))
- goto htab_insert_hpte;
- /*
- * Check if the pte was already inserted into the hash table
- * as a 64k HW page, and invalidate the 64k HPTE if so.
- */
- if (!(old_pte & H_PAGE_COMBO)) {
- flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
- /*
- * clear the old slot details from the old and new pte.
- * On hash insert failure we use old pte value and we don't
- * want slot information there if we have a insert failure.
- */
- old_pte &= ~H_PAGE_HASHPTE;
- new_pte &= ~H_PAGE_HASHPTE;
- goto htab_insert_hpte;
- }
- /*
- * Check for sub page valid and update
- */
- if (__rpte_sub_valid(rpte, subpg_index)) {
- int ret;
-
- gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte,
- subpg_index);
- ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
- MMU_PAGE_4K, MMU_PAGE_4K,
- ssize, flags);
-
- /*
- * If we failed because typically the HPTE wasn't really here
- * we try an insertion.
- */
- if (ret == -1)
- goto htab_insert_hpte;
-
- *ptep = __pte(new_pte & ~H_PAGE_BUSY);
- return 0;
- }
-
-htab_insert_hpte:
-
- /*
- * Initialize all hidx entries to invalid value, the first time
- * the PTE is about to allocate a 4K HPTE.
- */
- if (!(old_pte & H_PAGE_COMBO))
- rpte.hidx = INVALID_RPTE_HIDX;
-
- /*
- * handle H_PAGE_4K_PFN case
- */
- if (old_pte & H_PAGE_4K_PFN) {
- /*
- * All the sub 4k page have the same
- * physical address.
- */
- pa = pte_pfn(__pte(old_pte)) << HW_PAGE_SHIFT;
- } else {
- pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
- pa += (subpg_index << shift);
- }
- hash = hpt_hash(vpn, shift, ssize);
-repeat:
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- /* Insert into the hash table, primary slot */
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
- MMU_PAGE_4K, MMU_PAGE_4K, ssize);
- /*
- * Primary is full, try the secondary
- */
- if (unlikely(slot == -1)) {
- bool soft_invalid;
-
- hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
- rflags, HPTE_V_SECONDARY,
- MMU_PAGE_4K, MMU_PAGE_4K,
- ssize);
-
- soft_invalid = hpte_soft_invalid(slot);
- if (unlikely(soft_invalid)) {
- /*
- * We got a valid slot from a hardware point of view.
- * but we cannot use it, because we use this special
- * value; as defined by hpte_soft_invalid(), to track
- * invalid slots. We cannot use it. So invalidate it.
- */
- gslot = slot & _PTEIDX_GROUP_IX;
- mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn,
- MMU_PAGE_4K, MMU_PAGE_4K,
- ssize, 0);
- }
-
- if (unlikely(slot == -1 || soft_invalid)) {
- /*
- * For soft invalid slot, let's ensure that we release a
- * slot from the primary, with the hope that we will
- * acquire that slot next time we try. This will ensure
- * that we do not get the same soft-invalid slot.
- */
- if (soft_invalid || (mftb() & 0x1))
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- mmu_hash_ops.hpte_remove(hpte_group);
- /*
- * FIXME!! Should be try the group from which we removed ?
- */
- goto repeat;
- }
- }
- /*
- * Hypervisor failure. Restore old pte and return -1
- * similar to __hash_page_*
- */
- if (unlikely(slot == -2)) {
- *ptep = __pte(old_pte);
- hash_failure_debug(ea, access, vsid, trap, ssize,
- MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
- return -1;
- }
-
- new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
- new_pte |= H_PAGE_HASHPTE;
-
- *ptep = __pte(new_pte & ~H_PAGE_BUSY);
- return 0;
-}
-
-int __hash_page_64K(unsigned long ea, unsigned long access,
- unsigned long vsid, pte_t *ptep, unsigned long trap,
- unsigned long flags, int ssize)
-{
- real_pte_t rpte;
- unsigned long hpte_group;
- unsigned long rflags, pa;
- unsigned long old_pte, new_pte;
- unsigned long vpn, hash, slot;
- unsigned long shift = mmu_psize_defs[MMU_PAGE_64K].shift;
-
- /*
- * atomically mark the linux large page PTE busy and dirty
- */
- do {
- pte_t pte = READ_ONCE(*ptep);
-
- old_pte = pte_val(pte);
- /* If PTE busy, retry the access */
- if (unlikely(old_pte & H_PAGE_BUSY))
- return 0;
- /* If PTE permissions don't match, take page fault */
- if (unlikely(!check_pte_access(access, old_pte)))
- return 1;
- /*
- * Check if PTE has the cache-inhibit bit set
- * If so, bail out and refault as a 4k page
- */
- if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
- unlikely(pte_ci(pte)))
- return 0;
- /*
- * Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access.
- */
- new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_WRITE)
- new_pte |= _PAGE_DIRTY;
- } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
- rflags = htab_convert_pte_flags(new_pte);
- rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
-
- if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
- !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
- vpn = hpt_vpn(ea, vsid, ssize);
- if (unlikely(old_pte & H_PAGE_HASHPTE)) {
- unsigned long gslot;
-
- /*
- * There MIGHT be an HPTE for this pte
- */
- gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
- if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
- MMU_PAGE_64K, ssize,
- flags) == -1)
- old_pte &= ~_PAGE_HPTEFLAGS;
- }
-
- if (likely(!(old_pte & H_PAGE_HASHPTE))) {
-
- pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
- hash = hpt_hash(vpn, shift, ssize);
-
-repeat:
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- /* Insert into the hash table, primary slot */
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
- MMU_PAGE_64K, MMU_PAGE_64K,
- ssize);
- /*
- * Primary is full, try the secondary
- */
- if (unlikely(slot == -1)) {
- hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
- rflags,
- HPTE_V_SECONDARY,
- MMU_PAGE_64K,
- MMU_PAGE_64K, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = (hash & htab_hash_mask) *
- HPTES_PER_GROUP;
- mmu_hash_ops.hpte_remove(hpte_group);
- /*
- * FIXME!! Should be try the group from which we removed ?
- */
- goto repeat;
- }
- }
- /*
- * Hypervisor failure. Restore old pte and return -1
- * similar to __hash_page_*
- */
- if (unlikely(slot == -2)) {
- *ptep = __pte(old_pte);
- hash_failure_debug(ea, access, vsid, trap, ssize,
- MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
- return -1;
- }
-
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
- new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
- }
- *ptep = __pte(new_pte & ~H_PAGE_BUSY);
- return 0;
-}
+++ /dev/null
-/*
- * native hashtable management.
- *
- * SMP scalability work:
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#undef DEBUG_LOW
-
-#include <linux/spinlock.h>
-#include <linux/bitops.h>
-#include <linux/of.h>
-#include <linux/processor.h>
-#include <linux/threads.h>
-#include <linux/smp.h>
-
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/trace.h>
-#include <asm/tlb.h>
-#include <asm/cputable.h>
-#include <asm/udbg.h>
-#include <asm/kexec.h>
-#include <asm/ppc-opcode.h>
-#include <asm/feature-fixups.h>
-
-#include <misc/cxl-base.h>
-
-#ifdef DEBUG_LOW
-#define DBG_LOW(fmt...) udbg_printf(fmt)
-#else
-#define DBG_LOW(fmt...)
-#endif
-
-#ifdef __BIG_ENDIAN__
-#define HPTE_LOCK_BIT 3
-#else
-#define HPTE_LOCK_BIT (56+3)
-#endif
-
-DEFINE_RAW_SPINLOCK(native_tlbie_lock);
-
-static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
-{
- unsigned long rb;
-
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-
- asm volatile("tlbiel %0" : : "r" (rb));
-}
-
-/*
- * tlbiel instruction for hash, set invalidation
- * i.e., r=1 and is=01 or is=10 or is=11
- */
-static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
- unsigned int pid,
- unsigned int ric, unsigned int prs)
-{
- unsigned long rb;
- unsigned long rs;
- unsigned int r = 0; /* hash format */
-
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
- rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
- asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
- : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
- : "memory");
-}
-
-
-static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
-{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- for (set = 0; set < num_sets; set++)
- tlbiel_hash_set_isa206(set, is);
-
- asm volatile("ptesync": : :"memory");
-}
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
-{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and any caching of partition table
- * entries. Then flush the remaining sets of the TLB. Hash mode uses
- * partition scoped TLB translations.
- */
- tlbiel_hash_set_isa300(0, is, 0, 2, 0);
- for (set = 1; set < num_sets; set++)
- tlbiel_hash_set_isa300(set, is, 0, 0, 0);
-
- /*
- * Now invalidate the process table cache.
- *
- * From ISA v3.0B p. 1078:
- * The following forms are invalid.
- * * PRS=1, R=0, and RIC!=2 (The only process-scoped
- * HPT caching is of the Process Table.)
- */
- tlbiel_hash_set_isa300(0, is, 0, 2, 1);
-
- asm volatile("ptesync": : :"memory");
-
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-void hash__tlbiel_all(unsigned int action)
-{
- unsigned int is;
-
- switch (action) {
- case TLB_INVAL_SCOPE_GLOBAL:
- is = 3;
- break;
- case TLB_INVAL_SCOPE_LPID:
- is = 2;
- break;
- default:
- BUG();
- }
-
- if (early_cpu_has_feature(CPU_FTR_ARCH_300))
- tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
- else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
- tlbiel_all_isa206(POWER8_TLB_SETS, is);
- else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
- tlbiel_all_isa206(POWER7_TLB_SETS, is);
- else
- WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
-}
-
-static inline unsigned long ___tlbie(unsigned long vpn, int psize,
- int apsize, int ssize)
-{
- unsigned long va;
- unsigned int penc;
- unsigned long sllp;
-
- /*
- * We need 14 to 65 bits of va for a tlibe of 4K page
- * With vpn we ignore the lower VPN_SHIFT bits already.
- * And top two bits are already ignored because we can
- * only accomodate 76 bits in a 64 bit vpn with a VPN_SHIFT
- * of 12.
- */
- va = vpn << VPN_SHIFT;
- /*
- * clear top 16 bits of 64bit va, non SLS segment
- * Older versions of the architecture (2.02 and earler) require the
- * masking of the top 16 bits.
- */
- if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
- va &= ~(0xffffULL << 48);
-
- switch (psize) {
- case MMU_PAGE_4K:
- /* clear out bits after (52) [0....52.....63] */
- va &= ~((1ul << (64 - 52)) - 1);
- va |= ssize << 8;
- sllp = get_sllp_encoding(apsize);
- va |= sllp << 5;
- asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
- : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
- : "memory");
- break;
- default:
- /* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc[apsize];
- va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
- va |= penc << 12;
- va |= ssize << 8;
- /*
- * AVAL bits:
- * We don't need all the bits, but rest of the bits
- * must be ignored by the processor.
- * vpn cover upto 65 bits of va. (0...65) and we need
- * 58..64 bits of va.
- */
- va |= (vpn & 0xfe); /* AVAL */
- va |= 1; /* L */
- asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
- : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
- : "memory");
- break;
- }
- return va;
-}
-
-static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
-{
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
- /* Need the extra ptesync to ensure we don't reorder tlbie*/
- asm volatile("ptesync": : :"memory");
- ___tlbie(vpn, psize, apsize, ssize);
- }
-}
-
-static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
-{
- unsigned long rb;
-
- rb = ___tlbie(vpn, psize, apsize, ssize);
- trace_tlbie(0, 0, rb, 0, 0, 0, 0);
-}
-
-static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
-{
- unsigned long va;
- unsigned int penc;
- unsigned long sllp;
-
- /* VPN_SHIFT can be atmost 12 */
- va = vpn << VPN_SHIFT;
- /*
- * clear top 16 bits of 64 bit va, non SLS segment
- * Older versions of the architecture (2.02 and earler) require the
- * masking of the top 16 bits.
- */
- if (mmu_has_feature(MMU_FTR_TLBIE_CROP_VA))
- va &= ~(0xffffULL << 48);
-
- switch (psize) {
- case MMU_PAGE_4K:
- /* clear out bits after(52) [0....52.....63] */
- va &= ~((1ul << (64 - 52)) - 1);
- va |= ssize << 8;
- sllp = get_sllp_encoding(apsize);
- va |= sllp << 5;
- asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1)
- : : "r" (va), "i" (CPU_FTR_ARCH_206)
- : "memory");
- break;
- default:
- /* We need 14 to 14 + i bits of va */
- penc = mmu_psize_defs[psize].penc[apsize];
- va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1);
- va |= penc << 12;
- va |= ssize << 8;
- /*
- * AVAL bits:
- * We don't need all the bits, but rest of the bits
- * must be ignored by the processor.
- * vpn cover upto 65 bits of va. (0...65) and we need
- * 58..64 bits of va.
- */
- va |= (vpn & 0xfe);
- va |= 1; /* L */
- asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1)
- : : "r" (va), "i" (CPU_FTR_ARCH_206)
- : "memory");
- break;
- }
- trace_tlbie(0, 1, va, 0, 0, 0, 0);
-
-}
-
-static inline void tlbie(unsigned long vpn, int psize, int apsize,
- int ssize, int local)
-{
- unsigned int use_local;
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
- use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use();
-
- if (use_local)
- use_local = mmu_psize_defs[psize].tlbiel;
- if (lock_tlbie && !use_local)
- raw_spin_lock(&native_tlbie_lock);
- asm volatile("ptesync": : :"memory");
- if (use_local) {
- __tlbiel(vpn, psize, apsize, ssize);
- asm volatile("ptesync": : :"memory");
- } else {
- __tlbie(vpn, psize, apsize, ssize);
- fixup_tlbie(vpn, psize, apsize, ssize);
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
- }
- if (lock_tlbie && !use_local)
- raw_spin_unlock(&native_tlbie_lock);
-}
-
-static inline void native_lock_hpte(struct hash_pte *hptep)
-{
- unsigned long *word = (unsigned long *)&hptep->v;
-
- while (1) {
- if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
- break;
- spin_begin();
- while(test_bit(HPTE_LOCK_BIT, word))
- spin_cpu_relax();
- spin_end();
- }
-}
-
-static inline void native_unlock_hpte(struct hash_pte *hptep)
-{
- unsigned long *word = (unsigned long *)&hptep->v;
-
- clear_bit_unlock(HPTE_LOCK_BIT, word);
-}
-
-static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
- unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int apsize, int ssize)
-{
- struct hash_pte *hptep = htab_address + hpte_group;
- unsigned long hpte_v, hpte_r;
- int i;
-
- if (!(vflags & HPTE_V_BOLTED)) {
- DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx,"
- " rflags=%lx, vflags=%lx, psize=%d)\n",
- hpte_group, vpn, pa, rflags, vflags, psize);
- }
-
- for (i = 0; i < HPTES_PER_GROUP; i++) {
- if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
- /* retry with lock held */
- native_lock_hpte(hptep);
- if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
- break;
- native_unlock_hpte(hptep);
- }
-
- hptep++;
- }
-
- if (i == HPTES_PER_GROUP)
- return -1;
-
- hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
- hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
-
- if (!(vflags & HPTE_V_BOLTED)) {
- DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
- i, hpte_v, hpte_r);
- }
-
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
- hpte_v = hpte_old_to_new_v(hpte_v);
- }
-
- hptep->r = cpu_to_be64(hpte_r);
- /* Guarantee the second dword is visible before the valid bit */
- eieio();
- /*
- * Now set the first dword including the valid bit
- * NOTE: this also unlocks the hpte
- */
- hptep->v = cpu_to_be64(hpte_v);
-
- __asm__ __volatile__ ("ptesync" : : : "memory");
-
- return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
-}
-
-static long native_hpte_remove(unsigned long hpte_group)
-{
- struct hash_pte *hptep;
- int i;
- int slot_offset;
- unsigned long hpte_v;
-
- DBG_LOW(" remove(group=%lx)\n", hpte_group);
-
- /* pick a random entry to start at */
- slot_offset = mftb() & 0x7;
-
- for (i = 0; i < HPTES_PER_GROUP; i++) {
- hptep = htab_address + hpte_group + slot_offset;
- hpte_v = be64_to_cpu(hptep->v);
-
- if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
- /* retry with lock held */
- native_lock_hpte(hptep);
- hpte_v = be64_to_cpu(hptep->v);
- if ((hpte_v & HPTE_V_VALID)
- && !(hpte_v & HPTE_V_BOLTED))
- break;
- native_unlock_hpte(hptep);
- }
-
- slot_offset++;
- slot_offset &= 0x7;
- }
-
- if (i == HPTES_PER_GROUP)
- return -1;
-
- /* Invalidate the hpte. NOTE: this also unlocks it */
- hptep->v = 0;
-
- return i;
-}
-
-static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
- unsigned long vpn, int bpsize,
- int apsize, int ssize, unsigned long flags)
-{
- struct hash_pte *hptep = htab_address + slot;
- unsigned long hpte_v, want_v;
- int ret = 0, local = 0;
-
- want_v = hpte_encode_avpn(vpn, bpsize, ssize);
-
- DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
- vpn, want_v & HPTE_V_AVPN, slot, newpp);
-
- hpte_v = hpte_get_old_v(hptep);
- /*
- * We need to invalidate the TLB always because hpte_remove doesn't do
- * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
- * random entry from it. When we do that we don't invalidate the TLB
- * (hpte_remove) because we assume the old translation is still
- * technically "valid".
- */
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
- DBG_LOW(" -> miss\n");
- ret = -1;
- } else {
- native_lock_hpte(hptep);
- /* recheck with locks held */
- hpte_v = hpte_get_old_v(hptep);
- if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
- !(hpte_v & HPTE_V_VALID))) {
- ret = -1;
- } else {
- DBG_LOW(" -> hit\n");
- /* Update the HPTE */
- hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
- ~(HPTE_R_PPP | HPTE_R_N)) |
- (newpp & (HPTE_R_PPP | HPTE_R_N |
- HPTE_R_C)));
- }
- native_unlock_hpte(hptep);
- }
-
- if (flags & HPTE_LOCAL_UPDATE)
- local = 1;
- /*
- * Ensure it is out of the tlb too if it is not a nohpte fault
- */
- if (!(flags & HPTE_NOHPTE_UPDATE))
- tlbie(vpn, bpsize, apsize, ssize, local);
-
- return ret;
-}
-
-static long native_hpte_find(unsigned long vpn, int psize, int ssize)
-{
- struct hash_pte *hptep;
- unsigned long hash;
- unsigned long i;
- long slot;
- unsigned long want_v, hpte_v;
-
- hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_avpn(vpn, psize, ssize);
-
- /* Bolted mappings are only ever in the primary group */
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- for (i = 0; i < HPTES_PER_GROUP; i++) {
-
- hptep = htab_address + slot;
- hpte_v = hpte_get_old_v(hptep);
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
- /* HPTE matches */
- return slot;
- ++slot;
- }
-
- return -1;
-}
-
-/*
- * Update the page protection bits. Intended to be used to create
- * guard pages for kernel data structures on pages which are bolted
- * in the HPT. Assumes pages being operated on will not be stolen.
- *
- * No need to lock here because we should be the only user.
- */
-static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
- int psize, int ssize)
-{
- unsigned long vpn;
- unsigned long vsid;
- long slot;
- struct hash_pte *hptep;
-
- vsid = get_kernel_vsid(ea, ssize);
- vpn = hpt_vpn(ea, vsid, ssize);
-
- slot = native_hpte_find(vpn, psize, ssize);
- if (slot == -1)
- panic("could not find page to bolt\n");
- hptep = htab_address + slot;
-
- /* Update the HPTE */
- hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
- ~(HPTE_R_PPP | HPTE_R_N)) |
- (newpp & (HPTE_R_PPP | HPTE_R_N)));
- /*
- * Ensure it is out of the tlb too. Bolted entries base and
- * actual page size will be same.
- */
- tlbie(vpn, psize, psize, ssize, 0);
-}
-
-/*
- * Remove a bolted kernel entry. Memory hotplug uses this.
- *
- * No need to lock here because we should be the only user.
- */
-static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
-{
- unsigned long vpn;
- unsigned long vsid;
- long slot;
- struct hash_pte *hptep;
-
- vsid = get_kernel_vsid(ea, ssize);
- vpn = hpt_vpn(ea, vsid, ssize);
-
- slot = native_hpte_find(vpn, psize, ssize);
- if (slot == -1)
- return -ENOENT;
-
- hptep = htab_address + slot;
-
- VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED));
-
- /* Invalidate the hpte */
- hptep->v = 0;
-
- /* Invalidate the TLB */
- tlbie(vpn, psize, psize, ssize, 0);
- return 0;
-}
-
-
-static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
- int bpsize, int apsize, int ssize, int local)
-{
- struct hash_pte *hptep = htab_address + slot;
- unsigned long hpte_v;
- unsigned long want_v;
- unsigned long flags;
-
- local_irq_save(flags);
-
- DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
-
- want_v = hpte_encode_avpn(vpn, bpsize, ssize);
- hpte_v = hpte_get_old_v(hptep);
-
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
- native_lock_hpte(hptep);
- /* recheck with locks held */
- hpte_v = hpte_get_old_v(hptep);
-
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
- /* Invalidate the hpte. NOTE: this also unlocks it */
- hptep->v = 0;
- else
- native_unlock_hpte(hptep);
- }
- /*
- * We need to invalidate the TLB always because hpte_remove doesn't do
- * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
- * random entry from it. When we do that we don't invalidate the TLB
- * (hpte_remove) because we assume the old translation is still
- * technically "valid".
- */
- tlbie(vpn, bpsize, apsize, ssize, local);
-
- local_irq_restore(flags);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void native_hugepage_invalidate(unsigned long vsid,
- unsigned long addr,
- unsigned char *hpte_slot_array,
- int psize, int ssize, int local)
-{
- int i;
- struct hash_pte *hptep;
- int actual_psize = MMU_PAGE_16M;
- unsigned int max_hpte_count, valid;
- unsigned long flags, s_addr = addr;
- unsigned long hpte_v, want_v, shift;
- unsigned long hidx, vpn = 0, hash, slot;
-
- shift = mmu_psize_defs[psize].shift;
- max_hpte_count = 1U << (PMD_SHIFT - shift);
-
- local_irq_save(flags);
- for (i = 0; i < max_hpte_count; i++) {
- valid = hpte_valid(hpte_slot_array, i);
- if (!valid)
- continue;
- hidx = hpte_hash_index(hpte_slot_array, i);
-
- /* get the vpn */
- addr = s_addr + (i * (1ul << shift));
- vpn = hpt_vpn(addr, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
-
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
-
- hptep = htab_address + slot;
- want_v = hpte_encode_avpn(vpn, psize, ssize);
- hpte_v = hpte_get_old_v(hptep);
-
- /* Even if we miss, we need to invalidate the TLB */
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
- /* recheck with locks held */
- native_lock_hpte(hptep);
- hpte_v = hpte_get_old_v(hptep);
-
- if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) {
- /*
- * Invalidate the hpte. NOTE: this also unlocks it
- */
-
- hptep->v = 0;
- } else
- native_unlock_hpte(hptep);
- }
- /*
- * We need to do tlb invalidate for all the address, tlbie
- * instruction compares entry_VA in tlb with the VA specified
- * here
- */
- tlbie(vpn, psize, actual_psize, ssize, local);
- }
- local_irq_restore(flags);
-}
-#else
-static void native_hugepage_invalidate(unsigned long vsid,
- unsigned long addr,
- unsigned char *hpte_slot_array,
- int psize, int ssize, int local)
-{
- WARN(1, "%s called without THP support\n", __func__);
-}
-#endif
-
-static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
- int *psize, int *apsize, int *ssize, unsigned long *vpn)
-{
- unsigned long avpn, pteg, vpi;
- unsigned long hpte_v = be64_to_cpu(hpte->v);
- unsigned long hpte_r = be64_to_cpu(hpte->r);
- unsigned long vsid, seg_off;
- int size, a_size, shift;
- /* Look at the 8 bit LP value */
- unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
- hpte_r = hpte_new_to_old_r(hpte_r);
- }
- if (!(hpte_v & HPTE_V_LARGE)) {
- size = MMU_PAGE_4K;
- a_size = MMU_PAGE_4K;
- } else {
- size = hpte_page_sizes[lp] & 0xf;
- a_size = hpte_page_sizes[lp] >> 4;
- }
- /* This works for all page sizes, and for 256M and 1T segments */
- *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
- shift = mmu_psize_defs[size].shift;
-
- avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
- pteg = slot / HPTES_PER_GROUP;
- if (hpte_v & HPTE_V_SECONDARY)
- pteg = ~pteg;
-
- switch (*ssize) {
- case MMU_SEGSIZE_256M:
- /* We only have 28 - 23 bits of seg_off in avpn */
- seg_off = (avpn & 0x1f) << 23;
- vsid = avpn >> 5;
- /* We can find more bits from the pteg value */
- if (shift < 23) {
- vpi = (vsid ^ pteg) & htab_hash_mask;
- seg_off |= vpi << shift;
- }
- *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
- break;
- case MMU_SEGSIZE_1T:
- /* We only have 40 - 23 bits of seg_off in avpn */
- seg_off = (avpn & 0x1ffff) << 23;
- vsid = avpn >> 17;
- if (shift < 23) {
- vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
- seg_off |= vpi << shift;
- }
- *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
- break;
- default:
- *vpn = size = 0;
- }
- *psize = size;
- *apsize = a_size;
-}
-
-/*
- * clear all mappings on kexec. All cpus are in real mode (or they will
- * be when they isi), and we are the only one left. We rely on our kernel
- * mapping being 0xC0's and the hardware ignoring those two real bits.
- *
- * This must be called with interrupts disabled.
- *
- * Taking the native_tlbie_lock is unsafe here due to the possibility of
- * lockdep being on. On pre POWER5 hardware, not taking the lock could
- * cause deadlock. POWER5 and newer not taking the lock is fine. This only
- * gets called during boot before secondary CPUs have come up and during
- * crashdump and all bets are off anyway.
- *
- * TODO: add batching support when enabled. remember, no dynamic memory here,
- * although there is the control page available...
- */
-static void native_hpte_clear(void)
-{
- unsigned long vpn = 0;
- unsigned long slot, slots;
- struct hash_pte *hptep = htab_address;
- unsigned long hpte_v;
- unsigned long pteg_count;
- int psize, apsize, ssize;
-
- pteg_count = htab_hash_mask + 1;
-
- slots = pteg_count * HPTES_PER_GROUP;
-
- for (slot = 0; slot < slots; slot++, hptep++) {
- /*
- * we could lock the pte here, but we are the only cpu
- * running, right? and for crash dump, we probably
- * don't want to wait for a maybe bad cpu.
- */
- hpte_v = be64_to_cpu(hptep->v);
-
- /*
- * Call __tlbie() here rather than tlbie() since we can't take the
- * native_tlbie_lock.
- */
- if (hpte_v & HPTE_V_VALID) {
- hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
- hptep->v = 0;
- ___tlbie(vpn, psize, apsize, ssize);
- }
- }
-
- asm volatile("eieio; tlbsync; ptesync":::"memory");
-}
-
-/*
- * Batched hash table flush, we batch the tlbie's to avoid taking/releasing
- * the lock all the time
- */
-static void native_flush_hash_range(unsigned long number, int local)
-{
- unsigned long vpn = 0;
- unsigned long hash, index, hidx, shift, slot;
- struct hash_pte *hptep;
- unsigned long hpte_v;
- unsigned long want_v;
- unsigned long flags;
- real_pte_t pte;
- struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
- unsigned long psize = batch->psize;
- int ssize = batch->ssize;
- int i;
- unsigned int use_local;
-
- use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) &&
- mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use();
-
- local_irq_save(flags);
-
- for (i = 0; i < number; i++) {
- vpn = batch->vpn[i];
- pte = batch->pte[i];
-
- pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
- hash = hpt_hash(vpn, shift, ssize);
- hidx = __rpte_to_hidx(pte, index);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- hptep = htab_address + slot;
- want_v = hpte_encode_avpn(vpn, psize, ssize);
- hpte_v = hpte_get_old_v(hptep);
-
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
- continue;
- /* lock and try again */
- native_lock_hpte(hptep);
- hpte_v = hpte_get_old_v(hptep);
-
- if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
- native_unlock_hpte(hptep);
- else
- hptep->v = 0;
-
- } pte_iterate_hashed_end();
- }
-
- if (use_local) {
- asm volatile("ptesync":::"memory");
- for (i = 0; i < number; i++) {
- vpn = batch->vpn[i];
- pte = batch->pte[i];
-
- pte_iterate_hashed_subpages(pte, psize,
- vpn, index, shift) {
- __tlbiel(vpn, psize, psize, ssize);
- } pte_iterate_hashed_end();
- }
- asm volatile("ptesync":::"memory");
- } else {
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
-
- asm volatile("ptesync":::"memory");
- for (i = 0; i < number; i++) {
- vpn = batch->vpn[i];
- pte = batch->pte[i];
-
- pte_iterate_hashed_subpages(pte, psize,
- vpn, index, shift) {
- __tlbie(vpn, psize, psize, ssize);
- } pte_iterate_hashed_end();
- }
- /*
- * Just do one more with the last used values.
- */
- fixup_tlbie(vpn, psize, psize, ssize);
- asm volatile("eieio; tlbsync; ptesync":::"memory");
-
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
- }
-
- local_irq_restore(flags);
-}
-
-void __init hpte_init_native(void)
-{
- mmu_hash_ops.hpte_invalidate = native_hpte_invalidate;
- mmu_hash_ops.hpte_updatepp = native_hpte_updatepp;
- mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
- mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
- mmu_hash_ops.hpte_insert = native_hpte_insert;
- mmu_hash_ops.hpte_remove = native_hpte_remove;
- mmu_hash_ops.hpte_clear_all = native_hpte_clear;
- mmu_hash_ops.flush_hash_range = native_flush_hash_range;
- mmu_hash_ops.hugepage_invalidate = native_hugepage_invalidate;
-}
+++ /dev/null
-/*
- * PowerPC64 port by Mike Corrigan and Dave Engebretsen
- * {mikejc|engebret}@us.ibm.com
- *
- * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
- *
- * SMP scalability work:
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * Module name: htab.c
- *
- * Description:
- * PowerPC Hashed Page Table functions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#undef DEBUG
-#undef DEBUG_LOW
-
-#define pr_fmt(fmt) "hash-mmu: " fmt
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/sched/mm.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include <linux/export.h>
-#include <linux/ctype.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/signal.h>
-#include <linux/memblock.h>
-#include <linux/context_tracking.h>
-#include <linux/libfdt.h>
-#include <linux/pkeys.h>
-
-#include <asm/debugfs.h>
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <linux/uaccess.h>
-#include <asm/machdep.h>
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/eeh.h>
-#include <asm/tlb.h>
-#include <asm/cacheflush.h>
-#include <asm/cputable.h>
-#include <asm/sections.h>
-#include <asm/copro.h>
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-#include <asm/fadump.h>
-#include <asm/firmware.h>
-#include <asm/tm.h>
-#include <asm/trace.h>
-#include <asm/ps3.h>
-#include <asm/pte-walk.h>
-#include <asm/asm-prototypes.h>
-
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-#ifdef DEBUG_LOW
-#define DBG_LOW(fmt...) udbg_printf(fmt)
-#else
-#define DBG_LOW(fmt...)
-#endif
-
-#define KB (1024)
-#define MB (1024*KB)
-#define GB (1024L*MB)
-
-/*
- * Note: pte --> Linux PTE
- * HPTE --> PowerPC Hashed Page Table Entry
- *
- * Execution context:
- * htab_initialize is called with the MMU off (of course), but
- * the kernel has been copied down to zero so it can directly
- * reference global data. At this point it is very difficult
- * to print debug info.
- *
- */
-
-static unsigned long _SDR1;
-struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-EXPORT_SYMBOL_GPL(mmu_psize_defs);
-
-u8 hpte_page_sizes[1 << LP_BITS];
-EXPORT_SYMBOL_GPL(hpte_page_sizes);
-
-struct hash_pte *htab_address;
-unsigned long htab_size_bytes;
-unsigned long htab_hash_mask;
-EXPORT_SYMBOL_GPL(htab_hash_mask);
-int mmu_linear_psize = MMU_PAGE_4K;
-EXPORT_SYMBOL_GPL(mmu_linear_psize);
-int mmu_virtual_psize = MMU_PAGE_4K;
-int mmu_vmalloc_psize = MMU_PAGE_4K;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif
-int mmu_io_psize = MMU_PAGE_4K;
-int mmu_kernel_ssize = MMU_SEGSIZE_256M;
-EXPORT_SYMBOL_GPL(mmu_kernel_ssize);
-int mmu_highuser_ssize = MMU_SEGSIZE_256M;
-u16 mmu_slb_size = 64;
-EXPORT_SYMBOL_GPL(mmu_slb_size);
-#ifdef CONFIG_PPC_64K_PAGES
-int mmu_ci_restrictions;
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-struct mmu_hash_ops mmu_hash_ops;
-EXPORT_SYMBOL(mmu_hash_ops);
-
-/* There are definitions of page sizes arrays to be used when none
- * is provided by the firmware.
- */
-
-/*
- * Fallback (4k pages only)
- */
-static struct mmu_psize_def mmu_psize_defaults[] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .sllp = 0,
- .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
- .avpnm = 0,
- .tlbiel = 0,
- },
-};
-
-/* POWER4, GPUL, POWER5
- *
- * Support for 16Mb large pages
- */
-static struct mmu_psize_def mmu_psize_defaults_gp[] = {
- [MMU_PAGE_4K] = {
- .shift = 12,
- .sllp = 0,
- .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1},
- .avpnm = 0,
- .tlbiel = 1,
- },
- [MMU_PAGE_16M] = {
- .shift = 24,
- .sllp = SLB_VSID_L,
- .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0,
- [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 },
- .avpnm = 0x1UL,
- .tlbiel = 0,
- },
-};
-
-/*
- * 'R' and 'C' update notes:
- * - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
- * create writeable HPTEs without C set, because the hcall H_PROTECT
- * that we use in that case will not update C
- * - The above is however not a problem, because we also don't do that
- * fancy "no flush" variant of eviction and we use H_REMOVE which will
- * do the right thing and thus we don't have the race I described earlier
- *
- * - Under bare metal, we do have the race, so we need R and C set
- * - We make sure R is always set and never lost
- * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
- */
-unsigned long htab_convert_pte_flags(unsigned long pteflags)
-{
- unsigned long rflags = 0;
-
- /* _PAGE_EXEC -> NOEXEC */
- if ((pteflags & _PAGE_EXEC) == 0)
- rflags |= HPTE_R_N;
- /*
- * PPP bits:
- * Linux uses slb key 0 for kernel and 1 for user.
- * kernel RW areas are mapped with PPP=0b000
- * User area is mapped with PPP=0b010 for read/write
- * or PPP=0b011 for read-only (including writeable but clean pages).
- */
- if (pteflags & _PAGE_PRIVILEGED) {
- /*
- * Kernel read only mapped with ppp bits 0b110
- */
- if (!(pteflags & _PAGE_WRITE)) {
- if (mmu_has_feature(MMU_FTR_KERNEL_RO))
- rflags |= (HPTE_R_PP0 | 0x2);
- else
- rflags |= 0x3;
- }
- } else {
- if (pteflags & _PAGE_RWX)
- rflags |= 0x2;
- if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
- rflags |= 0x1;
- }
- /*
- * We can't allow hardware to update hpte bits. Hence always
- * set 'R' bit and set 'C' if it is a write fault
- */
- rflags |= HPTE_R_R;
-
- if (pteflags & _PAGE_DIRTY)
- rflags |= HPTE_R_C;
- /*
- * Add in WIG bits
- */
-
- if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
- rflags |= HPTE_R_I;
- else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
- rflags |= (HPTE_R_I | HPTE_R_G);
- else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
- rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
- else
- /*
- * Add memory coherence if cache inhibited is not set
- */
- rflags |= HPTE_R_M;
-
- rflags |= pte_to_hpte_pkey_bits(pteflags);
- return rflags;
-}
-
-int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
- unsigned long pstart, unsigned long prot,
- int psize, int ssize)
-{
- unsigned long vaddr, paddr;
- unsigned int step, shift;
- int ret = 0;
-
- shift = mmu_psize_defs[psize].shift;
- step = 1 << shift;
-
- prot = htab_convert_pte_flags(prot);
-
- DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
- vstart, vend, pstart, prot, psize, ssize);
-
- for (vaddr = vstart, paddr = pstart; vaddr < vend;
- vaddr += step, paddr += step) {
- unsigned long hash, hpteg;
- unsigned long vsid = get_kernel_vsid(vaddr, ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
- unsigned long tprot = prot;
-
- /*
- * If we hit a bad address return error.
- */
- if (!vsid)
- return -1;
- /* Make kernel text executable */
- if (overlaps_kernel_text(vaddr, vaddr + step))
- tprot &= ~HPTE_R_N;
-
- /* Make kvm guest trampolines executable */
- if (overlaps_kvm_tmp(vaddr, vaddr + step))
- tprot &= ~HPTE_R_N;
-
- /*
- * If relocatable, check if it overlaps interrupt vectors that
- * are copied down to real 0. For relocatable kernel
- * (e.g. kdump case) we copy interrupt vectors down to real
- * address 0. Mark that region as executable. This is
- * because on p8 system with relocation on exception feature
- * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence
- * in order to execute the interrupt handlers in virtual
- * mode the vector region need to be marked as executable.
- */
- if ((PHYSICAL_START > MEMORY_START) &&
- overlaps_interrupt_vector_text(vaddr, vaddr + step))
- tprot &= ~HPTE_R_N;
-
- hash = hpt_hash(vpn, shift, ssize);
- hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
- BUG_ON(!mmu_hash_ops.hpte_insert);
- ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
- HPTE_V_BOLTED, psize, psize,
- ssize);
-
- if (ret < 0)
- break;
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled() &&
- (paddr >> PAGE_SHIFT) < linear_map_hash_count)
- linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
-#endif /* CONFIG_DEBUG_PAGEALLOC */
- }
- return ret < 0 ? ret : 0;
-}
-
-int htab_remove_mapping(unsigned long vstart, unsigned long vend,
- int psize, int ssize)
-{
- unsigned long vaddr;
- unsigned int step, shift;
- int rc;
- int ret = 0;
-
- shift = mmu_psize_defs[psize].shift;
- step = 1 << shift;
-
- if (!mmu_hash_ops.hpte_removebolted)
- return -ENODEV;
-
- for (vaddr = vstart; vaddr < vend; vaddr += step) {
- rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
- if (rc == -ENOENT) {
- ret = -ENOENT;
- continue;
- }
- if (rc < 0)
- return rc;
- }
-
- return ret;
-}
-
-static bool disable_1tb_segments = false;
-
-static int __init parse_disable_1tb_segments(char *p)
-{
- disable_1tb_segments = true;
- return 0;
-}
-early_param("disable_1tb_segments", parse_disable_1tb_segments);
-
-static int __init htab_dt_scan_seg_sizes(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- const __be32 *prop;
- int size = 0;
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size);
- if (prop == NULL)
- return 0;
- for (; size >= 4; size -= 4, ++prop) {
- if (be32_to_cpu(prop[0]) == 40) {
- DBG("1T segment support detected\n");
-
- if (disable_1tb_segments) {
- DBG("1T segments disabled by command line\n");
- break;
- }
-
- cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT;
- return 1;
- }
- }
- cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
- return 0;
-}
-
-static int __init get_idx_from_shift(unsigned int shift)
-{
- int idx = -1;
-
- switch (shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x14:
- idx = MMU_PAGE_1M;
- break;
- case 0x18:
- idx = MMU_PAGE_16M;
- break;
- case 0x22:
- idx = MMU_PAGE_16G;
- break;
- }
- return idx;
-}
-
-static int __init htab_dt_scan_page_sizes(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- const __be32 *prop;
- int size = 0;
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size);
- if (!prop)
- return 0;
-
- pr_info("Page sizes from device-tree:\n");
- size /= 4;
- cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE);
- while(size > 0) {
- unsigned int base_shift = be32_to_cpu(prop[0]);
- unsigned int slbenc = be32_to_cpu(prop[1]);
- unsigned int lpnum = be32_to_cpu(prop[2]);
- struct mmu_psize_def *def;
- int idx, base_idx;
-
- size -= 3; prop += 3;
- base_idx = get_idx_from_shift(base_shift);
- if (base_idx < 0) {
- /* skip the pte encoding also */
- prop += lpnum * 2; size -= lpnum * 2;
- continue;
- }
- def = &mmu_psize_defs[base_idx];
- if (base_idx == MMU_PAGE_16M)
- cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE;
-
- def->shift = base_shift;
- if (base_shift <= 23)
- def->avpnm = 0;
- else
- def->avpnm = (1 << (base_shift - 23)) - 1;
- def->sllp = slbenc;
- /*
- * We don't know for sure what's up with tlbiel, so
- * for now we only set it for 4K and 64K pages
- */
- if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
- def->tlbiel = 1;
- else
- def->tlbiel = 0;
-
- while (size > 0 && lpnum) {
- unsigned int shift = be32_to_cpu(prop[0]);
- int penc = be32_to_cpu(prop[1]);
-
- prop += 2; size -= 2;
- lpnum--;
-
- idx = get_idx_from_shift(shift);
- if (idx < 0)
- continue;
-
- if (penc == -1)
- pr_err("Invalid penc for base_shift=%d "
- "shift=%d\n", base_shift, shift);
-
- def->penc[idx] = penc;
- pr_info("base_shift=%d: shift=%d, sllp=0x%04lx,"
- " avpnm=0x%08lx, tlbiel=%d, penc=%d\n",
- base_shift, shift, def->sllp,
- def->avpnm, def->tlbiel, def->penc[idx]);
- }
- }
-
- return 1;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Scan for 16G memory blocks that have been set aside for huge pages
- * and reserve those blocks for 16G huge pages.
- */
-static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
- const char *uname, int depth,
- void *data) {
- const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- const __be64 *addr_prop;
- const __be32 *page_count_prop;
- unsigned int expected_pages;
- long unsigned int phys_addr;
- long unsigned int block_size;
-
- /* We are scanning "memory" nodes only */
- if (type == NULL || strcmp(type, "memory") != 0)
- return 0;
-
- /* This property is the log base 2 of the number of virtual pages that
- * will represent this memory block. */
- page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
- if (page_count_prop == NULL)
- return 0;
- expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
- addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
- if (addr_prop == NULL)
- return 0;
- phys_addr = be64_to_cpu(addr_prop[0]);
- block_size = be64_to_cpu(addr_prop[1]);
- if (block_size != (16 * GB))
- return 0;
- printk(KERN_INFO "Huge page(16GB) memory: "
- "addr = 0x%lX size = 0x%lX pages = %d\n",
- phys_addr, block_size, expected_pages);
- if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
- memblock_reserve(phys_addr, block_size * expected_pages);
- pseries_add_gpage(phys_addr, block_size, expected_pages);
- }
- return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
-static void mmu_psize_set_default_penc(void)
-{
- int bpsize, apsize;
- for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
- for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
- mmu_psize_defs[bpsize].penc[apsize] = -1;
-}
-
-#ifdef CONFIG_PPC_64K_PAGES
-
-static bool might_have_hea(void)
-{
- /*
- * The HEA ethernet adapter requires awareness of the
- * GX bus. Without that awareness we can easily assume
- * we will never see an HEA ethernet device.
- */
-#ifdef CONFIG_IBMEBUS
- return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
- firmware_has_feature(FW_FEATURE_SPLPAR);
-#else
- return false;
-#endif
-}
-
-#endif /* #ifdef CONFIG_PPC_64K_PAGES */
-
-static void __init htab_scan_page_sizes(void)
-{
- int rc;
-
- /* se the invalid penc to -1 */
- mmu_psize_set_default_penc();
-
- /* Default to 4K pages only */
- memcpy(mmu_psize_defs, mmu_psize_defaults,
- sizeof(mmu_psize_defaults));
-
- /*
- * Try to find the available page sizes in the device-tree
- */
- rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
- if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
- /*
- * Nothing in the device-tree, but the CPU supports 16M pages,
- * so let's fallback on a known size list for 16M capable CPUs.
- */
- memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
- sizeof(mmu_psize_defaults_gp));
- }
-
-#ifdef CONFIG_HUGETLB_PAGE
- if (!hugetlb_disabled) {
- /* Reserve 16G huge page memory sections for huge pages */
- of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
- }
-#endif /* CONFIG_HUGETLB_PAGE */
-}
-
-/*
- * Fill in the hpte_page_sizes[] array.
- * We go through the mmu_psize_defs[] array looking for all the
- * supported base/actual page size combinations. Each combination
- * has a unique pagesize encoding (penc) value in the low bits of
- * the LP field of the HPTE. For actual page sizes less than 1MB,
- * some of the upper LP bits are used for RPN bits, meaning that
- * we need to fill in several entries in hpte_page_sizes[].
- *
- * In diagrammatic form, with r = RPN bits and z = page size bits:
- * PTE LP actual page size
- * rrrr rrrz >=8KB
- * rrrr rrzz >=16KB
- * rrrr rzzz >=32KB
- * rrrr zzzz >=64KB
- * ...
- *
- * The zzzz bits are implementation-specific but are chosen so that
- * no encoding for a larger page size uses the same value in its
- * low-order N bits as the encoding for the 2^(12+N) byte page size
- * (if it exists).
- */
-static void init_hpte_page_sizes(void)
-{
- long int ap, bp;
- long int shift, penc;
-
- for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
- if (!mmu_psize_defs[bp].shift)
- continue; /* not a supported page size */
- for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
- penc = mmu_psize_defs[bp].penc[ap];
- if (penc == -1 || !mmu_psize_defs[ap].shift)
- continue;
- shift = mmu_psize_defs[ap].shift - LP_SHIFT;
- if (shift <= 0)
- continue; /* should never happen */
- /*
- * For page sizes less than 1MB, this loop
- * replicates the entry for all possible values
- * of the rrrr bits.
- */
- while (penc < (1 << LP_BITS)) {
- hpte_page_sizes[penc] = (ap << 4) | bp;
- penc += 1 << shift;
- }
- }
- }
-}
-
-static void __init htab_init_page_sizes(void)
-{
- init_hpte_page_sizes();
-
- if (!debug_pagealloc_enabled()) {
- /*
- * Pick a size for the linear mapping. Currently, we only
- * support 16M, 1M and 4K which is the default
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- mmu_linear_psize = MMU_PAGE_16M;
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- mmu_linear_psize = MMU_PAGE_1M;
- }
-
-#ifdef CONFIG_PPC_64K_PAGES
- /*
- * Pick a size for the ordinary pages. Default is 4K, we support
- * 64K for user mappings and vmalloc if supported by the processor.
- * We only use 64k for ioremap if the processor
- * (and firmware) support cache-inhibited large pages.
- * If not, we use 4k and set mmu_ci_restrictions so that
- * hash_page knows to switch processes that use cache-inhibited
- * mappings to 4k pages.
- */
- if (mmu_psize_defs[MMU_PAGE_64K].shift) {
- mmu_virtual_psize = MMU_PAGE_64K;
- mmu_vmalloc_psize = MMU_PAGE_64K;
- if (mmu_linear_psize == MMU_PAGE_4K)
- mmu_linear_psize = MMU_PAGE_64K;
- if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) {
- /*
- * When running on pSeries using 64k pages for ioremap
- * would stop us accessing the HEA ethernet. So if we
- * have the chance of ever seeing one, stay at 4k.
- */
- if (!might_have_hea())
- mmu_io_psize = MMU_PAGE_64K;
- } else
- mmu_ci_restrictions = 1;
- }
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* We try to use 16M pages for vmemmap if that is supported
- * and we have at least 1G of RAM at boot
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift &&
- memblock_phys_mem_size() >= 0x40000000)
- mmu_vmemmap_psize = MMU_PAGE_16M;
- else if (mmu_psize_defs[MMU_PAGE_64K].shift)
- mmu_vmemmap_psize = MMU_PAGE_64K;
- else
- mmu_vmemmap_psize = MMU_PAGE_4K;
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
- printk(KERN_DEBUG "Page orders: linear mapping = %d, "
- "virtual = %d, io = %d"
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- ", vmemmap = %d"
-#endif
- "\n",
- mmu_psize_defs[mmu_linear_psize].shift,
- mmu_psize_defs[mmu_virtual_psize].shift,
- mmu_psize_defs[mmu_io_psize].shift
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- ,mmu_psize_defs[mmu_vmemmap_psize].shift
-#endif
- );
-}
-
-static int __init htab_dt_scan_pftsize(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- const __be32 *prop;
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
- if (prop != NULL) {
- /* pft_size[0] is the NUMA CEC cookie */
- ppc64_pft_size = be32_to_cpu(prop[1]);
- return 1;
- }
- return 0;
-}
-
-unsigned htab_shift_for_mem_size(unsigned long mem_size)
-{
- unsigned memshift = __ilog2(mem_size);
- unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
- unsigned pteg_shift;
-
- /* round mem_size up to next power of 2 */
- if ((1UL << memshift) < mem_size)
- memshift += 1;
-
- /* aim for 2 pages / pteg */
- pteg_shift = memshift - (pshift + 1);
-
- /*
- * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab
- * size permitted by the architecture.
- */
- return max(pteg_shift + 7, 18U);
-}
-
-static unsigned long __init htab_get_table_size(void)
-{
- /* If hash size isn't already provided by the platform, we try to
- * retrieve it from the device-tree. If it's not there neither, we
- * calculate it now based on the total RAM size
- */
- if (ppc64_pft_size == 0)
- of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
- if (ppc64_pft_size)
- return 1UL << ppc64_pft_size;
-
- return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
-{
- unsigned target_hpt_shift;
-
- if (!mmu_hash_ops.resize_hpt)
- return 0;
-
- target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
-
- /*
- * To avoid lots of HPT resizes if memory size is fluctuating
- * across a boundary, we deliberately have some hysterisis
- * here: we immediately increase the HPT size if the target
- * shift exceeds the current shift, but we won't attempt to
- * reduce unless the target shift is at least 2 below the
- * current shift
- */
- if (target_hpt_shift > ppc64_pft_size ||
- target_hpt_shift < ppc64_pft_size - 1)
- return mmu_hash_ops.resize_hpt(target_hpt_shift);
-
- return 0;
-}
-
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
- int rc;
-
- if (end >= H_VMALLOC_START) {
- pr_warn("Outside the supported range\n");
- return -1;
- }
-
- rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
- mmu_kernel_ssize);
-
- if (rc < 0) {
- int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
- mmu_kernel_ssize);
- BUG_ON(rc2 && (rc2 != -ENOENT));
- }
- return rc;
-}
-
-int hash__remove_section_mapping(unsigned long start, unsigned long end)
-{
- int rc = htab_remove_mapping(start, end, mmu_linear_psize,
- mmu_kernel_ssize);
- WARN_ON(rc < 0);
- return rc;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-static void __init hash_init_partition_table(phys_addr_t hash_table,
- unsigned long htab_size)
-{
- mmu_partition_table_init();
-
- /*
- * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
- * For now, UPRT is 0 and we have no segment table.
- */
- htab_size = __ilog2(htab_size) - 18;
- mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
- pr_info("Partition table %p\n", partition_tb);
-}
-
-static void __init htab_initialize(void)
-{
- unsigned long table;
- unsigned long pteg_count;
- unsigned long prot;
- unsigned long base = 0, size = 0;
- struct memblock_region *reg;
-
- DBG(" -> htab_initialize()\n");
-
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
- mmu_kernel_ssize = MMU_SEGSIZE_1T;
- mmu_highuser_ssize = MMU_SEGSIZE_1T;
- printk(KERN_INFO "Using 1TB segments\n");
- }
-
- /*
- * Calculate the required size of the htab. We want the number of
- * PTEGs to equal one half the number of real pages.
- */
- htab_size_bytes = htab_get_table_size();
- pteg_count = htab_size_bytes >> 7;
-
- htab_hash_mask = pteg_count - 1;
-
- if (firmware_has_feature(FW_FEATURE_LPAR) ||
- firmware_has_feature(FW_FEATURE_PS3_LV1)) {
- /* Using a hypervisor which owns the htab */
- htab_address = NULL;
- _SDR1 = 0;
- /*
- * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
- * to inform the hypervisor that we wish to use the HPT.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- register_process_table(0, 0, 0);
-#ifdef CONFIG_FA_DUMP
- /*
- * If firmware assisted dump is active firmware preserves
- * the contents of htab along with entire partition memory.
- * Clear the htab if firmware assisted dump is active so
- * that we dont end up using old mappings.
- */
- if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
- mmu_hash_ops.hpte_clear_all();
-#endif
- } else {
- unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE;
-
-#ifdef CONFIG_PPC_CELL
- /*
- * Cell may require the hash table down low when using the
- * Axon IOMMU in order to fit the dynamic region over it, see
- * comments in cell/iommu.c
- */
- if (fdt_subnode_offset(initial_boot_params, 0, "axon") > 0) {
- limit = 0x80000000;
- pr_info("Hash table forced below 2G for Axon IOMMU\n");
- }
-#endif /* CONFIG_PPC_CELL */
-
- table = memblock_phys_alloc_range(htab_size_bytes,
- htab_size_bytes,
- 0, limit);
- if (!table)
- panic("ERROR: Failed to allocate %pa bytes below %pa\n",
- &htab_size_bytes, &limit);
-
- DBG("Hash table allocated at %lx, size: %lx\n", table,
- htab_size_bytes);
-
- htab_address = __va(table);
-
- /* htab absolute addr + encoded htabsize */
- _SDR1 = table + __ilog2(htab_size_bytes) - 18;
-
- /* Initialize the HPT with no entries */
- memset((void *)table, 0, htab_size_bytes);
-
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
- /* Set SDR1 */
- mtspr(SPRN_SDR1, _SDR1);
- else
- hash_init_partition_table(table, htab_size_bytes);
- }
-
- prot = pgprot_val(PAGE_KERNEL);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (debug_pagealloc_enabled()) {
- linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = memblock_alloc_try_nid(
- linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
- ppc64_rma_size, NUMA_NO_NODE);
- if (!linear_map_hash_slots)
- panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
- __func__, linear_map_hash_count, &ppc64_rma_size);
- }
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
- /* create bolted the linear mapping in the hash table */
- for_each_memblock(memory, reg) {
- base = (unsigned long)__va(reg->base);
- size = reg->size;
-
- DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
- base, size, prot);
-
- if ((base + size) >= H_VMALLOC_START) {
- pr_warn("Outside the supported range\n");
- continue;
- }
-
- BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
- prot, mmu_linear_psize, mmu_kernel_ssize));
- }
- memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-
- /*
- * If we have a memory_limit and we've allocated TCEs then we need to
- * explicitly map the TCE area at the top of RAM. We also cope with the
- * case that the TCEs start below memory_limit.
- * tce_alloc_start/end are 16MB aligned so the mapping should work
- * for either 4K or 16MB pages.
- */
- if (tce_alloc_start) {
- tce_alloc_start = (unsigned long)__va(tce_alloc_start);
- tce_alloc_end = (unsigned long)__va(tce_alloc_end);
-
- if (base + size >= tce_alloc_start)
- tce_alloc_start = base + size + 1;
-
- BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
- __pa(tce_alloc_start), prot,
- mmu_linear_psize, mmu_kernel_ssize));
- }
-
-
- DBG(" <- htab_initialize()\n");
-}
-#undef KB
-#undef MB
-
-void __init hash__early_init_devtree(void)
-{
- /* Initialize segment sizes */
- of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
-
- /* Initialize page sizes */
- htab_scan_page_sizes();
-}
-
-struct hash_mm_context init_hash_mm_context;
-void __init hash__early_init_mmu(void)
-{
-#ifndef CONFIG_PPC_64K_PAGES
- /*
- * We have code in __hash_page_4K() and elsewhere, which assumes it can
- * do the following:
- * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
- *
- * Where the slot number is between 0-15, and values of 8-15 indicate
- * the secondary bucket. For that code to work H_PAGE_F_SECOND and
- * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
- * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
- * with a BUILD_BUG_ON().
- */
- BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
-#endif /* CONFIG_PPC_64K_PAGES */
-
- htab_init_page_sizes();
-
- /*
- * initialize page table size
- */
- __pte_frag_nr = H_PTE_FRAG_NR;
- __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
- __pmd_frag_nr = H_PMD_FRAG_NR;
- __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
-
- __pte_index_size = H_PTE_INDEX_SIZE;
- __pmd_index_size = H_PMD_INDEX_SIZE;
- __pud_index_size = H_PUD_INDEX_SIZE;
- __pgd_index_size = H_PGD_INDEX_SIZE;
- __pud_cache_index = H_PUD_CACHE_INDEX;
- __pte_table_size = H_PTE_TABLE_SIZE;
- __pmd_table_size = H_PMD_TABLE_SIZE;
- __pud_table_size = H_PUD_TABLE_SIZE;
- __pgd_table_size = H_PGD_TABLE_SIZE;
- /*
- * 4k use hugepd format, so for hash set then to
- * zero
- */
- __pmd_val_bits = HASH_PMD_VAL_BITS;
- __pud_val_bits = HASH_PUD_VAL_BITS;
- __pgd_val_bits = HASH_PGD_VAL_BITS;
-
- __kernel_virt_start = H_KERN_VIRT_START;
- __vmalloc_start = H_VMALLOC_START;
- __vmalloc_end = H_VMALLOC_END;
- __kernel_io_start = H_KERN_IO_START;
- __kernel_io_end = H_KERN_IO_END;
- vmemmap = (struct page *)H_VMEMMAP_START;
- ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PCI
- pci_io_base = ISA_IO_BASE;
-#endif
-
- /* Select appropriate backend */
- if (firmware_has_feature(FW_FEATURE_PS3_LV1))
- ps3_early_mm_init();
- else if (firmware_has_feature(FW_FEATURE_LPAR))
- hpte_init_pseries();
- else if (IS_ENABLED(CONFIG_PPC_NATIVE))
- hpte_init_native();
-
- if (!mmu_hash_ops.hpte_insert)
- panic("hash__early_init_mmu: No MMU hash ops defined!\n");
-
- /* Initialize the MMU Hash table and create the linear mapping
- * of memory. Has to be done before SLB initialization as this is
- * currently where the page size encoding is obtained.
- */
- htab_initialize();
-
- init_mm.context.hash_context = &init_hash_mm_context;
- init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
-
- pr_info("Initializing hash mmu with SLB\n");
- /* Initialize SLB management */
- slb_initialize();
-
- if (cpu_has_feature(CPU_FTR_ARCH_206)
- && cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_all();
-}
-
-#ifdef CONFIG_SMP
-void hash__early_init_mmu_secondary(void)
-{
- /* Initialize hash table for that CPU */
- if (!firmware_has_feature(FW_FEATURE_LPAR)) {
-
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
- mtspr(SPRN_SDR1, _SDR1);
- else
- mtspr(SPRN_PTCR,
- __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
- }
- /* Initialize SLB */
- slb_initialize();
-
- if (cpu_has_feature(CPU_FTR_ARCH_206)
- && cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_all();
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
-{
- struct page *page;
-
- if (!pfn_valid(pte_pfn(pte)))
- return pp;
-
- page = pte_page(pte);
-
- /* page is dirty */
- if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
- if (trap == 0x400) {
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
- } else
- pp |= HPTE_R_N;
- }
- return pp;
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-static unsigned int get_paca_psize(unsigned long addr)
-{
- unsigned char *psizes;
- unsigned long index, mask_index;
-
- if (addr < SLICE_LOW_TOP) {
- psizes = get_paca()->mm_ctx_low_slices_psize;
- index = GET_LOW_SLICE_INDEX(addr);
- } else {
- psizes = get_paca()->mm_ctx_high_slices_psize;
- index = GET_HIGH_SLICE_INDEX(addr);
- }
- mask_index = index & 0x1;
- return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
-}
-
-#else
-unsigned int get_paca_psize(unsigned long addr)
-{
- return get_paca()->mm_ctx_user_psize;
-}
-#endif
-
-/*
- * Demote a segment to using 4k pages.
- * For now this makes the whole process use 4k pages.
- */
-#ifdef CONFIG_PPC_64K_PAGES
-void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
-{
- if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
- return;
- slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
- copro_flush_all_slbs(mm);
- if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
-
- copy_mm_to_paca(mm);
- slb_flush_and_restore_bolted();
- }
-}
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-/*
- * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
- * Userspace sets the subpage permissions using the subpage_prot system call.
- *
- * Result is 0: full permissions, _PAGE_RW: read-only,
- * _PAGE_RWX: no access.
- */
-static int subpage_protection(struct mm_struct *mm, unsigned long ea)
-{
- struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
- u32 spp = 0;
- u32 **sbpm, *sbpp;
-
- if (!spt)
- return 0;
-
- if (ea >= spt->maxaddr)
- return 0;
- if (ea < 0x100000000UL) {
- /* addresses below 4GB use spt->low_prot */
- sbpm = spt->low_prot;
- } else {
- sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
- if (!sbpm)
- return 0;
- }
- sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
- if (!sbpp)
- return 0;
- spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
-
- /* extract 2-bit bitfield for this 4k subpage */
- spp >>= 30 - 2 * ((ea >> 12) & 0xf);
-
- /*
- * 0 -> full premission
- * 1 -> Read only
- * 2 -> no access.
- * We return the flag that need to be cleared.
- */
- spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
- return spp;
-}
-
-#else /* CONFIG_PPC_SUBPAGE_PROT */
-static inline int subpage_protection(struct mm_struct *mm, unsigned long ea)
-{
- return 0;
-}
-#endif
-
-void hash_failure_debug(unsigned long ea, unsigned long access,
- unsigned long vsid, unsigned long trap,
- int ssize, int psize, int lpsize, unsigned long pte)
-{
- if (!printk_ratelimit())
- return;
- pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n",
- ea, access, current->comm);
- pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n",
- trap, vsid, ssize, psize, lpsize, pte);
-}
-
-static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
- int psize, bool user_region)
-{
- if (user_region) {
- if (psize != get_paca_psize(ea)) {
- copy_mm_to_paca(mm);
- slb_flush_and_restore_bolted();
- }
- } else if (get_paca()->vmalloc_sllp !=
- mmu_psize_defs[mmu_vmalloc_psize].sllp) {
- get_paca()->vmalloc_sllp =
- mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_vmalloc_update();
- }
-}
-
-/* Result code is:
- * 0 - handled
- * 1 - normal page fault
- * -1 - critical hash insertion error
- * -2 - access not permitted by subpage protection mechanism
- */
-int hash_page_mm(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap,
- unsigned long flags)
-{
- bool is_thp;
- enum ctx_state prev_state = exception_enter();
- pgd_t *pgdir;
- unsigned long vsid;
- pte_t *ptep;
- unsigned hugeshift;
- int rc, user_region = 0;
- int psize, ssize;
-
- DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
- ea, access, trap);
- trace_hash_fault(ea, access, trap);
-
- /* Get region & vsid */
- switch (get_region_id(ea)) {
- case USER_REGION_ID:
- user_region = 1;
- if (! mm) {
- DBG_LOW(" user region with no mm !\n");
- rc = 1;
- goto bail;
- }
- psize = get_slice_psize(mm, ea);
- ssize = user_segment_size(ea);
- vsid = get_user_vsid(&mm->context, ea, ssize);
- break;
- case VMALLOC_REGION_ID:
- vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
- psize = mmu_vmalloc_psize;
- ssize = mmu_kernel_ssize;
- break;
-
- case IO_REGION_ID:
- vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
- psize = mmu_io_psize;
- ssize = mmu_kernel_ssize;
- break;
- default:
- /* Not a valid range
- * Send the problem up to do_page_fault
- */
- rc = 1;
- goto bail;
- }
- DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
-
- /* Bad address. */
- if (!vsid) {
- DBG_LOW("Bad address!\n");
- rc = 1;
- goto bail;
- }
- /* Get pgdir */
- pgdir = mm->pgd;
- if (pgdir == NULL) {
- rc = 1;
- goto bail;
- }
-
- /* Check CPU locality */
- if (user_region && mm_is_thread_local(mm))
- flags |= HPTE_LOCAL_UPDATE;
-
-#ifndef CONFIG_PPC_64K_PAGES
- /* If we use 4K pages and our psize is not 4K, then we might
- * be hitting a special driver mapping, and need to align the
- * address before we fetch the PTE.
- *
- * It could also be a hugepage mapping, in which case this is
- * not necessary, but it's not harmful, either.
- */
- if (psize != MMU_PAGE_4K)
- ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
-#endif /* CONFIG_PPC_64K_PAGES */
-
- /* Get PTE and page size from page tables */
- ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
- if (ptep == NULL || !pte_present(*ptep)) {
- DBG_LOW(" no PTE !\n");
- rc = 1;
- goto bail;
- }
-
- /* Add _PAGE_PRESENT to the required access perm */
- access |= _PAGE_PRESENT;
-
- /* Pre-check access permissions (will be re-checked atomically
- * in __hash_page_XX but this pre-check is a fast path
- */
- if (!check_pte_access(access, pte_val(*ptep))) {
- DBG_LOW(" no access !\n");
- rc = 1;
- goto bail;
- }
-
- if (hugeshift) {
- if (is_thp)
- rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
- trap, flags, ssize, psize);
-#ifdef CONFIG_HUGETLB_PAGE
- else
- rc = __hash_page_huge(ea, access, vsid, ptep, trap,
- flags, ssize, hugeshift, psize);
-#else
- else {
- /*
- * if we have hugeshift, and is not transhuge with
- * hugetlb disabled, something is really wrong.
- */
- rc = 1;
- WARN_ON(1);
- }
-#endif
- if (current->mm == mm)
- check_paca_psize(ea, mm, psize, user_region);
-
- goto bail;
- }
-
-#ifndef CONFIG_PPC_64K_PAGES
- DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
-#else
- DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
- pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
- /* Do actual hashing */
-#ifdef CONFIG_PPC_64K_PAGES
- /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
- if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
- demote_segment_4k(mm, ea);
- psize = MMU_PAGE_4K;
- }
-
- /* If this PTE is non-cacheable and we have restrictions on
- * using non cacheable large pages, then we switch to 4k
- */
- if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
- if (user_region) {
- demote_segment_4k(mm, ea);
- psize = MMU_PAGE_4K;
- } else if (ea < VMALLOC_END) {
- /*
- * some driver did a non-cacheable mapping
- * in vmalloc space, so switch vmalloc
- * to 4k pages
- */
- printk(KERN_ALERT "Reducing vmalloc segment "
- "to 4kB pages because of "
- "non-cacheable mapping\n");
- psize = mmu_vmalloc_psize = MMU_PAGE_4K;
- copro_flush_all_slbs(mm);
- }
- }
-
-#endif /* CONFIG_PPC_64K_PAGES */
-
- if (current->mm == mm)
- check_paca_psize(ea, mm, psize, user_region);
-
-#ifdef CONFIG_PPC_64K_PAGES
- if (psize == MMU_PAGE_64K)
- rc = __hash_page_64K(ea, access, vsid, ptep, trap,
- flags, ssize);
- else
-#endif /* CONFIG_PPC_64K_PAGES */
- {
- int spp = subpage_protection(mm, ea);
- if (access & spp)
- rc = -2;
- else
- rc = __hash_page_4K(ea, access, vsid, ptep, trap,
- flags, ssize, spp);
- }
-
- /* Dump some info in case of hash insertion failure, they should
- * never happen so it is really useful to know if/when they do
- */
- if (rc == -1)
- hash_failure_debug(ea, access, vsid, trap, ssize, psize,
- psize, pte_val(*ptep));
-#ifndef CONFIG_PPC_64K_PAGES
- DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
-#else
- DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
- pte_val(*(ptep + PTRS_PER_PTE)));
-#endif
- DBG_LOW(" -> rc=%d\n", rc);
-
-bail:
- exception_exit(prev_state);
- return rc;
-}
-EXPORT_SYMBOL_GPL(hash_page_mm);
-
-int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
- unsigned long dsisr)
-{
- unsigned long flags = 0;
- struct mm_struct *mm = current->mm;
-
- if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
- (get_region_id(ea) == IO_REGION_ID))
- mm = &init_mm;
-
- if (dsisr & DSISR_NOHPTE)
- flags |= HPTE_NOHPTE_UPDATE;
-
- return hash_page_mm(mm, ea, access, trap, flags);
-}
-EXPORT_SYMBOL_GPL(hash_page);
-
-int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
- unsigned long dsisr)
-{
- unsigned long access = _PAGE_PRESENT | _PAGE_READ;
- unsigned long flags = 0;
- struct mm_struct *mm = current->mm;
- unsigned int region_id = get_region_id(ea);
-
- if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
- mm = &init_mm;
-
- if (dsisr & DSISR_NOHPTE)
- flags |= HPTE_NOHPTE_UPDATE;
-
- if (dsisr & DSISR_ISSTORE)
- access |= _PAGE_WRITE;
- /*
- * We set _PAGE_PRIVILEGED only when
- * kernel mode access kernel space.
- *
- * _PAGE_PRIVILEGED is NOT set
- * 1) when kernel mode access user space
- * 2) user space access kernel space.
- */
- access |= _PAGE_PRIVILEGED;
- if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
- access &= ~_PAGE_PRIVILEGED;
-
- if (trap == 0x400)
- access |= _PAGE_EXEC;
-
- return hash_page_mm(mm, ea, access, trap, flags);
-}
-
-#ifdef CONFIG_PPC_MM_SLICES
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
- int psize = get_slice_psize(mm, ea);
-
- /* We only prefault standard pages for now */
- if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
- return false;
-
- /*
- * Don't prefault if subpage protection is enabled for the EA.
- */
- if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
- return false;
-
- return true;
-}
-#else
-static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
-{
- return true;
-}
-#endif
-
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- bool is_exec, unsigned long trap)
-{
- int hugepage_shift;
- unsigned long vsid;
- pgd_t *pgdir;
- pte_t *ptep;
- unsigned long flags;
- int rc, ssize, update_flags = 0;
- unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
-
- BUG_ON(get_region_id(ea) != USER_REGION_ID);
-
- if (!should_hash_preload(mm, ea))
- return;
-
- DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
- " trap=%lx\n", mm, mm->pgd, ea, access, trap);
-
- /* Get Linux PTE if available */
- pgdir = mm->pgd;
- if (pgdir == NULL)
- return;
-
- /* Get VSID */
- ssize = user_segment_size(ea);
- vsid = get_user_vsid(&mm->context, ea, ssize);
- if (!vsid)
- return;
- /*
- * Hash doesn't like irqs. Walking linux page table with irq disabled
- * saves us from holding multiple locks.
- */
- local_irq_save(flags);
-
- /*
- * THP pages use update_mmu_cache_pmd. We don't do
- * hash preload there. Hence can ignore THP here
- */
- ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
- if (!ptep)
- goto out_exit;
-
- WARN_ON(hugepage_shift);
-#ifdef CONFIG_PPC_64K_PAGES
- /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
- * a 64K kernel), then we don't preload, hash_page() will take
- * care of it once we actually try to access the page.
- * That way we don't have to duplicate all of the logic for segment
- * page size demotion here
- */
- if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
- goto out_exit;
-#endif /* CONFIG_PPC_64K_PAGES */
-
- /* Is that local to this CPU ? */
- if (mm_is_thread_local(mm))
- update_flags |= HPTE_LOCAL_UPDATE;
-
- /* Hash it in */
-#ifdef CONFIG_PPC_64K_PAGES
- if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
- rc = __hash_page_64K(ea, access, vsid, ptep, trap,
- update_flags, ssize);
- else
-#endif /* CONFIG_PPC_64K_PAGES */
- rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
- ssize, subpage_protection(mm, ea));
-
- /* Dump some info in case of hash insertion failure, they should
- * never happen so it is really useful to know if/when they do
- */
- if (rc == -1)
- hash_failure_debug(ea, access, vsid, trap, ssize,
- mm_ctx_user_psize(&mm->context),
- mm_ctx_user_psize(&mm->context),
- pte_val(*ptep));
-out_exit:
- local_irq_restore(flags);
-}
-
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
- pte_t *ptep;
- u16 pkey = 0;
- unsigned long flags;
-
- if (!mm || !mm->pgd)
- return 0;
-
- local_irq_save(flags);
- ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
- if (ptep)
- pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
- local_irq_restore(flags);
-
- return pkey;
-}
-#endif /* CONFIG_PPC_MEM_KEYS */
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-static inline void tm_flush_hash_page(int local)
-{
- /*
- * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a
- * page back to a block device w/PIO could pick up transactional data
- * (bad!) so we force an abort here. Before the sync the page will be
- * made read-only, which will flush_hash_page. BIG ISSUE here: if the
- * kernel uses a page from userspace without unmapping it first, it may
- * see the speculated version.
- */
- if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
- MSR_TM_ACTIVE(current->thread.regs->msr)) {
- tm_enable();
- tm_abort(TM_CAUSE_TLBI);
- }
-}
-#else
-static inline void tm_flush_hash_page(int local)
-{
-}
-#endif
-
-/*
- * Return the global hash slot, corresponding to the given PTE, which contains
- * the HPTE.
- */
-unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
- int ssize, real_pte_t rpte, unsigned int subpg_index)
-{
- unsigned long hash, gslot, hidx;
-
- hash = hpt_hash(vpn, shift, ssize);
- hidx = __rpte_to_hidx(rpte, subpg_index);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- gslot += hidx & _PTEIDX_GROUP_IX;
- return gslot;
-}
-
-/* WARNING: This is called from hash_low_64.S, if you change this prototype,
- * do not forget to update the assembly call site !
- */
-void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
- unsigned long flags)
-{
- unsigned long index, shift, gslot;
- int local = flags & HPTE_LOCAL_UPDATE;
-
- DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
- pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
- gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
- DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
- /*
- * We use same base page size and actual psize, because we don't
- * use these functions for hugepage
- */
- mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
- ssize, local);
- } pte_iterate_hashed_end();
-
- tm_flush_hash_page(local);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
- pmd_t *pmdp, unsigned int psize, int ssize,
- unsigned long flags)
-{
- int i, max_hpte_count, valid;
- unsigned long s_addr;
- unsigned char *hpte_slot_array;
- unsigned long hidx, shift, vpn, hash, slot;
- int local = flags & HPTE_LOCAL_UPDATE;
-
- s_addr = addr & HPAGE_PMD_MASK;
- hpte_slot_array = get_hpte_slot_array(pmdp);
- /*
- * IF we try to do a HUGE PTE update after a withdraw is done.
- * we will find the below NULL. This happens when we do
- * split_huge_page_pmd
- */
- if (!hpte_slot_array)
- return;
-
- if (mmu_hash_ops.hugepage_invalidate) {
- mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
- psize, ssize, local);
- goto tm_abort;
- }
- /*
- * No bluk hpte removal support, invalidate each entry
- */
- shift = mmu_psize_defs[psize].shift;
- max_hpte_count = HPAGE_PMD_SIZE >> shift;
- for (i = 0; i < max_hpte_count; i++) {
- /*
- * 8 bits per each hpte entries
- * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
- */
- valid = hpte_valid(hpte_slot_array, i);
- if (!valid)
- continue;
- hidx = hpte_hash_index(hpte_slot_array, i);
-
- /* get the vpn */
- addr = s_addr + (i * (1ul << shift));
- vpn = hpt_vpn(addr, vsid, ssize);
- hash = hpt_hash(vpn, shift, ssize);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
-
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
- MMU_PAGE_16M, ssize, local);
- }
-tm_abort:
- tm_flush_hash_page(local);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void flush_hash_range(unsigned long number, int local)
-{
- if (mmu_hash_ops.flush_hash_range)
- mmu_hash_ops.flush_hash_range(number, local);
- else {
- int i;
- struct ppc64_tlb_batch *batch =
- this_cpu_ptr(&ppc64_tlb_batch);
-
- for (i = 0; i < number; i++)
- flush_hash_page(batch->vpn[i], batch->pte[i],
- batch->psize, batch->ssize, local);
- }
-}
-
-/*
- * low_hash_fault is called when we the low level hash code failed
- * to instert a PTE due to an hypervisor error
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
-{
- enum ctx_state prev_state = exception_enter();
-
- if (user_mode(regs)) {
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- if (rc == -2)
- _exception(SIGSEGV, regs, SEGV_ACCERR, address);
- else
-#endif
- _exception(SIGBUS, regs, BUS_ADRERR, address);
- } else
- bad_page_fault(regs, address, SIGBUS);
-
- exception_exit(prev_state);
-}
-
-long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
- unsigned long pa, unsigned long rflags,
- unsigned long vflags, int psize, int ssize)
-{
- unsigned long hpte_group;
- long slot;
-
-repeat:
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- /* Insert into the hash table, primary slot */
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, vflags,
- psize, psize, ssize);
-
- /* Primary is full, try the secondary */
- if (unlikely(slot == -1)) {
- hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags,
- vflags | HPTE_V_SECONDARY,
- psize, psize, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = (hash & htab_hash_mask) *
- HPTES_PER_GROUP;
-
- mmu_hash_ops.hpte_remove(hpte_group);
- goto repeat;
- }
- }
-
- return slot;
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
-{
- unsigned long hash;
- unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
- long ret;
-
- hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-
- /* Don't create HPTE entries for bad address */
- if (!vsid)
- return;
-
- ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
- HPTE_V_BOLTED,
- mmu_linear_psize, mmu_kernel_ssize);
-
- BUG_ON (ret < 0);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(linear_map_hash_slots[lmi] & 0x80);
- linear_map_hash_slots[lmi] = ret | 0x80;
- spin_unlock(&linear_map_hash_lock);
-}
-
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
-{
- unsigned long hash, hidx, slot;
- unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
- unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-
- hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
- spin_lock(&linear_map_hash_lock);
- BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
- hidx = linear_map_hash_slots[lmi] & 0x7f;
- linear_map_hash_slots[lmi] = 0;
- spin_unlock(&linear_map_hash_lock);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
- mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
- mmu_linear_psize,
- mmu_kernel_ssize, 0);
-}
-
-void __kernel_map_pages(struct page *page, int numpages, int enable)
-{
- unsigned long flags, vaddr, lmi;
- int i;
-
- local_irq_save(flags);
- for (i = 0; i < numpages; i++, page++) {
- vaddr = (unsigned long)page_address(page);
- lmi = __pa(vaddr) >> PAGE_SHIFT;
- if (lmi >= linear_map_hash_count)
- continue;
- if (enable)
- kernel_map_linear_page(vaddr, lmi);
- else
- kernel_unmap_linear_page(vaddr, lmi);
- }
- local_irq_restore(flags);
-}
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /*
- * On virtualized systems the first entry is our RMA region aka VRMA,
- * non-virtualized 64-bit hash MMU systems don't have a limitation
- * on real mode access.
- *
- * For guests on platforms before POWER9, we clamp the it limit to 1G
- * to avoid some funky things such as RTAS bugs etc...
- */
- if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
- ppc64_rma_size = first_memblock_size;
- if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
- ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
-
- /* Finally limit subsequent allocations */
- memblock_set_current_limit(ppc64_rma_size);
- } else {
- ppc64_rma_size = ULONG_MAX;
- }
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-static int hpt_order_get(void *data, u64 *val)
-{
- *val = ppc64_pft_size;
- return 0;
-}
-
-static int hpt_order_set(void *data, u64 val)
-{
- if (!mmu_hash_ops.resize_hpt)
- return -ENODEV;
-
- return mmu_hash_ops.resize_hpt(val);
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
-
-static int __init hash64_debugfs(void)
-{
- if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
- NULL, &fops_hpt_order)) {
- pr_err("lpar: unable to create hpt_order debugsfs file\n");
- }
-
- return 0;
-}
-machine_device_initcall(pseries, hash64_debugfs);
-#endif /* CONFIG_DEBUG_FS */
+++ /dev/null
-/*
- * Copyright IBM Corporation, 2013
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-/*
- * PPC64 THP Support for hash based MMUs
- */
-#include <linux/mm.h>
-#include <asm/machdep.h>
-
-int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
- pmd_t *pmdp, unsigned long trap, unsigned long flags,
- int ssize, unsigned int psize)
-{
- unsigned int index, valid;
- unsigned char *hpte_slot_array;
- unsigned long rflags, pa, hidx;
- unsigned long old_pmd, new_pmd;
- int ret, lpsize = MMU_PAGE_16M;
- unsigned long vpn, hash, shift, slot;
-
- /*
- * atomically mark the linux large page PMD busy and dirty
- */
- do {
- pmd_t pmd = READ_ONCE(*pmdp);
-
- old_pmd = pmd_val(pmd);
- /* If PMD busy, retry the access */
- if (unlikely(old_pmd & H_PAGE_BUSY))
- return 0;
- /* If PMD permissions don't match, take page fault */
- if (unlikely(!check_pte_access(access, old_pmd)))
- return 1;
- /*
- * Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access
- */
- new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_WRITE)
- new_pmd |= _PAGE_DIRTY;
- } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
-
- /*
- * Make sure this is thp or devmap entry
- */
- if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
- return 0;
-
- rflags = htab_convert_pte_flags(new_pmd);
-
-#if 0
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
-
- /*
- * No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case
- */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
- }
-#endif
- /*
- * Find the slot index details for this ea, using base page size.
- */
- shift = mmu_psize_defs[psize].shift;
- index = (ea & ~HPAGE_PMD_MASK) >> shift;
- BUG_ON(index >= PTE_FRAG_SIZE);
-
- vpn = hpt_vpn(ea, vsid, ssize);
- hpte_slot_array = get_hpte_slot_array(pmdp);
- if (psize == MMU_PAGE_4K) {
- /*
- * invalidate the old hpte entry if we have that mapped via 64K
- * base page size. This is because demote_segment won't flush
- * hash page table entries.
- */
- if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) {
- flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
- ssize, flags);
- /*
- * With THP, we also clear the slot information with
- * respect to all the 64K hash pte mapping the 16MB
- * page. They are all invalid now. This make sure we
- * don't find the slot valid when we fault with 4k
- * base page size.
- *
- */
- memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
- }
- }
-
- valid = hpte_valid(hpte_slot_array, index);
- if (valid) {
- /* update the hpte bits */
- hash = hpt_hash(vpn, shift, ssize);
- hidx = hpte_hash_index(hpte_slot_array, index);
- if (hidx & _PTEIDX_SECONDARY)
- hash = ~hash;
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot += hidx & _PTEIDX_GROUP_IX;
-
- ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
- psize, lpsize, ssize, flags);
- /*
- * We failed to update, try to insert a new entry.
- */
- if (ret == -1) {
- /*
- * large pte is marked busy, so we can be sure
- * nobody is looking at hpte_slot_array. hence we can
- * safely update this here.
- */
- valid = 0;
- hpte_slot_array[index] = 0;
- }
- }
-
- if (!valid) {
- unsigned long hpte_group;
-
- hash = hpt_hash(vpn, shift, ssize);
- /* insert new entry */
- pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
- new_pmd |= H_PAGE_HASHPTE;
-
-repeat:
- hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-
- /* Insert into the hash table, primary slot */
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa, rflags, 0,
- psize, lpsize, ssize);
- /*
- * Primary is full, try the secondary
- */
- if (unlikely(slot == -1)) {
- hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
- slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
- rflags,
- HPTE_V_SECONDARY,
- psize, lpsize, ssize);
- if (slot == -1) {
- if (mftb() & 0x1)
- hpte_group = (hash & htab_hash_mask) *
- HPTES_PER_GROUP;
-
- mmu_hash_ops.hpte_remove(hpte_group);
- goto repeat;
- }
- }
- /*
- * Hypervisor failure. Restore old pmd and return -1
- * similar to __hash_page_*
- */
- if (unlikely(slot == -2)) {
- *pmdp = __pmd(old_pmd);
- hash_failure_debug(ea, access, vsid, trap, ssize,
- psize, lpsize, old_pmd);
- return -1;
- }
- /*
- * large pte is marked busy, so we can be sure
- * nobody is looking at hpte_slot_array. hence we can
- * safely update this here.
- */
- mark_hpte_slot_valid(hpte_slot_array, index, slot);
- }
- /*
- * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with
- * base page size 4k.
- */
- if (psize == MMU_PAGE_4K)
- new_pmd |= H_PAGE_COMBO;
- /*
- * The hpte valid is stored in the pgtable whose address is in the
- * second half of the PMD. Order this against clearing of the busy bit in
- * huge pmd.
- */
- smp_wmb();
- *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY);
- return 0;
-}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/*
- * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
- *
- * Copyright (C) 2003 David Gibson, IBM Corporation.
- *
- * Based on the IA-32 version:
- * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
- */
-
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/machdep.h>
-
-extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
- unsigned long pa, unsigned long rlags,
- unsigned long vflags, int psize, int ssize);
-
-int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
- pte_t *ptep, unsigned long trap, unsigned long flags,
- int ssize, unsigned int shift, unsigned int mmu_psize)
-{
- real_pte_t rpte;
- unsigned long vpn;
- unsigned long old_pte, new_pte;
- unsigned long rflags, pa;
- long slot, offset;
-
- BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
-
- /* Search the Linux page table for a match with va */
- vpn = hpt_vpn(ea, vsid, ssize);
-
- /* At this point, we have a pte (old_pte) which can be used to build
- * or update an HPTE. There are 2 cases:
- *
- * 1. There is a valid (present) pte with no associated HPTE (this is
- * the most common case)
- * 2. There is a valid (present) pte with an associated HPTE. The
- * current values of the pp bits in the HPTE prevent access
- * because we are doing software DIRTY bit management and the
- * page is currently not DIRTY.
- */
-
-
- do {
- old_pte = pte_val(*ptep);
- /* If PTE busy, retry the access */
- if (unlikely(old_pte & H_PAGE_BUSY))
- return 0;
- /* If PTE permissions don't match, take page fault */
- if (unlikely(!check_pte_access(access, old_pte)))
- return 1;
-
- /* Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access */
- new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
- if (access & _PAGE_WRITE)
- new_pte |= _PAGE_DIRTY;
- } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
- /* Make sure this is a hugetlb entry */
- if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
- return 0;
-
- rflags = htab_convert_pte_flags(new_pte);
- if (unlikely(mmu_psize == MMU_PAGE_16G))
- offset = PTRS_PER_PUD;
- else
- offset = PTRS_PER_PMD;
- rpte = __real_pte(__pte(old_pte), ptep, offset);
-
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- /* No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case */
- rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
-
- /* Check if pte already has an hpte (case 2) */
- if (unlikely(old_pte & H_PAGE_HASHPTE)) {
- /* There MIGHT be an HPTE for this pte */
- unsigned long gslot;
-
- gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
- if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
- mmu_psize, ssize, flags) == -1)
- old_pte &= ~_PAGE_HPTEFLAGS;
- }
-
- if (likely(!(old_pte & H_PAGE_HASHPTE))) {
- unsigned long hash = hpt_hash(vpn, shift, ssize);
-
- pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
- /* clear HPTE slot informations in new PTE */
- new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-
- slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
- mmu_psize, ssize);
-
- /*
- * Hypervisor failure. Restore old pte and return -1
- * similar to __hash_page_*
- */
- if (unlikely(slot == -2)) {
- *ptep = __pte(old_pte);
- hash_failure_debug(ea, access, vsid, trap, ssize,
- mmu_psize, mmu_psize, old_pte);
- return -1;
- }
-
- new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
- }
-
- /*
- * No need to use ldarx/stdcx here
- */
- *ptep = __pte(new_pte & ~H_PAGE_BUSY);
- return 0;
-}
-
-pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- unsigned long pte_val;
- /*
- * Clear the _PAGE_PRESENT so that no hardware parallel update is
- * possible. Also keep the pte_present true so that we don't take
- * wrong fault.
- */
- pte_val = pte_update(vma->vm_mm, addr, ptep,
- _PAGE_PRESENT, _PAGE_INVALID, 1);
-
- return __pte(pte_val);
-}
-
-void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t old_pte, pte_t pte)
-{
-
- if (radix_enabled())
- return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
- old_pte, pte);
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
-}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/security.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/machdep.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-
-void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
- int psize;
- struct hstate *hstate = hstate_file(vma->vm_file);
-
- psize = hstate_get_psize(hstate);
- radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
-}
-
-void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
- int psize;
- struct hstate *hstate = hstate_file(vma->vm_file);
-
- psize = hstate_get_psize(hstate);
- radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, psize);
-}
-
-void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
-{
- int psize;
- struct hstate *hstate = hstate_file(vma->vm_file);
-
- psize = hstate_get_psize(hstate);
- radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
-}
-
-/*
- * A vairant of hugetlb_get_unmapped_area doing topdown search
- * FIXME!! should we do as x86 does or non hugetlb area does ?
- * ie, use topdown or not based on mmap_is_legacy check ?
- */
-unsigned long
-radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- struct hstate *h = hstate_file(file);
- int fixed = (flags & MAP_FIXED);
- unsigned long high_limit;
- struct vm_unmapped_area_info info;
-
- high_limit = DEFAULT_MAP_WINDOW;
- if (addr >= high_limit || (fixed && (addr + len > high_limit)))
- high_limit = TASK_SIZE;
-
- if (len & ~huge_page_mask(h))
- return -EINVAL;
- if (len > high_limit)
- return -ENOMEM;
-
- if (fixed) {
- if (addr > high_limit - len)
- return -ENOMEM;
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (high_limit - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
- /*
- * We are always doing an topdown search here. Slice code
- * does that too.
- */
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
-
- return vm_unmapped_area(&info);
-}
-
-void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t old_pte, pte_t pte)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value.
- */
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
- (atomic_read(&mm->context.copros) > 0))
- radix__flush_hugetlb_page(vma, addr);
-
- set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
-}
+++ /dev/null
-/*
- * MMU context allocation for 64-bit kernels.
- *
- * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/pkeys.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/export.h>
-#include <linux/gfp.h>
-#include <linux/slab.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-
-static DEFINE_IDA(mmu_context_ida);
-
-static int alloc_context_id(int min_id, int max_id)
-{
- return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL);
-}
-
-void hash__reserve_context_id(int id)
-{
- int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL);
-
- WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
-}
-
-int hash__alloc_context_id(void)
-{
- unsigned long max;
-
- if (mmu_has_feature(MMU_FTR_68_BIT_VA))
- max = MAX_USER_CONTEXT;
- else
- max = MAX_USER_CONTEXT_65BIT_VA;
-
- return alloc_context_id(MIN_USER_CONTEXT, max);
-}
-EXPORT_SYMBOL_GPL(hash__alloc_context_id);
-
-void slb_setup_new_exec(void);
-
-static int hash__init_new_context(struct mm_struct *mm)
-{
- int index;
-
- index = hash__alloc_context_id();
- if (index < 0)
- return index;
-
- mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
- GFP_KERNEL);
- if (!mm->context.hash_context) {
- ida_free(&mmu_context_ida, index);
- return -ENOMEM;
- }
-
- /*
- * The old code would re-promote on fork, we don't do that when using
- * slices as it could cause problem promoting slices that have been
- * forced down to 4K.
- *
- * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
- * explicitly against context.id == 0. This ensures that we properly
- * initialize context slice details for newly allocated mm's (which will
- * have id == 0) and don't alter context slice inherited via fork (which
- * will have id != 0).
- *
- * We should not be calling init_new_context() on init_mm. Hence a
- * check against 0 is OK.
- */
- if (mm->context.id == 0) {
- memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
- slice_init_new_context_exec(mm);
- } else {
- /* This is fork. Copy hash_context details from current->mm */
- memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
-#ifdef CONFIG_PPC_SUBPAGE_PROT
- /* inherit subpage prot detalis if we have one. */
- if (current->mm->context.hash_context->spt) {
- mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
- GFP_KERNEL);
- if (!mm->context.hash_context->spt) {
- ida_free(&mmu_context_ida, index);
- kfree(mm->context.hash_context);
- return -ENOMEM;
- }
- }
-#endif
-
- }
-
- pkey_mm_init(mm);
- return index;
-}
-
-void hash__setup_new_exec(void)
-{
- slice_setup_new_exec();
-
- slb_setup_new_exec();
-}
-
-static int radix__init_new_context(struct mm_struct *mm)
-{
- unsigned long rts_field;
- int index, max_id;
-
- max_id = (1 << mmu_pid_bits) - 1;
- index = alloc_context_id(mmu_base_pid, max_id);
- if (index < 0)
- return index;
-
- /*
- * set the process table entry,
- */
- rts_field = radix__get_tree_size();
- process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
-
- /*
- * Order the above store with subsequent update of the PID
- * register (at which point HW can start loading/caching
- * the entry) and the corresponding load by the MMU from
- * the L2 cache.
- */
- asm volatile("ptesync;isync" : : : "memory");
-
- mm->context.npu_context = NULL;
- mm->context.hash_context = NULL;
-
- return index;
-}
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
- int index;
-
- if (radix_enabled())
- index = radix__init_new_context(mm);
- else
- index = hash__init_new_context(mm);
-
- if (index < 0)
- return index;
-
- mm->context.id = index;
-
- mm->context.pte_frag = NULL;
- mm->context.pmd_frag = NULL;
-#ifdef CONFIG_SPAPR_TCE_IOMMU
- mm_iommu_init(mm);
-#endif
- atomic_set(&mm->context.active_cpus, 0);
- atomic_set(&mm->context.copros, 0);
-
- return 0;
-}
-
-void __destroy_context(int context_id)
-{
- ida_free(&mmu_context_ida, context_id);
-}
-EXPORT_SYMBOL_GPL(__destroy_context);
-
-static void destroy_contexts(mm_context_t *ctx)
-{
- int index, context_id;
-
- for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) {
- context_id = ctx->extended_id[index];
- if (context_id)
- ida_free(&mmu_context_ida, context_id);
- }
- kfree(ctx->hash_context);
-}
-
-static void pmd_frag_destroy(void *pmd_frag)
-{
- int count;
- struct page *page;
-
- page = virt_to_page(pmd_frag);
- /* drop all the pending references */
- count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
- /* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
- }
-}
-
-static void destroy_pagetable_cache(struct mm_struct *mm)
-{
- void *frag;
-
- frag = mm->context.pte_frag;
- if (frag)
- pte_frag_destroy(frag);
-
- frag = mm->context.pmd_frag;
- if (frag)
- pmd_frag_destroy(frag);
- return;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
-#ifdef CONFIG_SPAPR_TCE_IOMMU
- WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
-#endif
- if (radix_enabled())
- WARN_ON(process_tb[mm->context.id].prtb0 != 0);
- else
- subpage_prot_free(mm);
- destroy_contexts(&mm->context);
- mm->context.id = MMU_NO_CONTEXT;
-}
-
-void arch_exit_mmap(struct mm_struct *mm)
-{
- destroy_pagetable_cache(mm);
-
- if (radix_enabled()) {
- /*
- * Radix doesn't have a valid bit in the process table
- * entries. However we know that at least P9 implementation
- * will avoid caching an entry with an invalid RTS field,
- * and 0 is invalid. So this will do.
- *
- * This runs before the "fullmm" tlb flush in exit_mmap,
- * which does a RIC=2 tlbie to clear the process table
- * entry. See the "fullmm" comments in tlb-radix.c.
- *
- * No barrier required here after the store because
- * this process will do the invalidate, which starts with
- * ptesync.
- */
- process_tb[mm->context.id].prtb0 = 0;
- }
-}
-
-#ifdef CONFIG_PPC_RADIX_MMU
-void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
-{
- mtspr(SPRN_PID, next->context.id);
- isync();
-}
-#endif
+++ /dev/null
-/*
- * IOMMU helpers in MMU context.
- *
- * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched/signal.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/swap.h>
-#include <linux/sizes.h>
-#include <asm/mmu_context.h>
-#include <asm/pte-walk.h>
-#include <linux/mm_inline.h>
-
-static DEFINE_MUTEX(mem_list_mutex);
-
-#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
-#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
-
-struct mm_iommu_table_group_mem_t {
- struct list_head next;
- struct rcu_head rcu;
- unsigned long used;
- atomic64_t mapped;
- unsigned int pageshift;
- u64 ua; /* userspace address */
- u64 entries; /* number of entries in hpas/hpages[] */
- /*
- * in mm_iommu_get we temporarily use this to store
- * struct page address.
- *
- * We need to convert ua to hpa in real mode. Make it
- * simpler by storing physical address.
- */
- union {
- struct page **hpages; /* vmalloc'ed */
- phys_addr_t *hpas;
- };
-#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
- u64 dev_hpa; /* Device memory base address */
-};
-
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
- unsigned long npages, bool incr)
-{
- long ret = 0, locked, lock_limit;
-
- if (!npages)
- return 0;
-
- down_write(&mm->mmap_sem);
-
- if (incr) {
- locked = mm->locked_vm + npages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- ret = -ENOMEM;
- else
- mm->locked_vm += npages;
- } else {
- if (WARN_ON_ONCE(npages > mm->locked_vm))
- npages = mm->locked_vm;
- mm->locked_vm -= npages;
- }
-
- pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
- current ? current->pid : 0,
- incr ? '+' : '-',
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK));
- up_write(&mm->mmap_sem);
-
- return ret;
-}
-
-bool mm_iommu_preregistered(struct mm_struct *mm)
-{
- return !list_empty(&mm->context.iommu_group_mem_list);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
-
-static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
- unsigned long entries, unsigned long dev_hpa,
- struct mm_iommu_table_group_mem_t **pmem)
-{
- struct mm_iommu_table_group_mem_t *mem;
- long i, ret, locked_entries = 0;
- unsigned int pageshift;
-
- mutex_lock(&mem_list_mutex);
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
- next) {
- /* Overlap? */
- if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
- (ua < (mem->ua +
- (mem->entries << PAGE_SHIFT)))) {
- ret = -EINVAL;
- goto unlock_exit;
- }
-
- }
-
- if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
- ret = mm_iommu_adjust_locked_vm(mm, entries, true);
- if (ret)
- goto unlock_exit;
-
- locked_entries = entries;
- }
-
- mem = kzalloc(sizeof(*mem), GFP_KERNEL);
- if (!mem) {
- ret = -ENOMEM;
- goto unlock_exit;
- }
-
- if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
- mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
- mem->dev_hpa = dev_hpa;
- goto good_exit;
- }
- mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
-
- /*
- * For a starting point for a maximum page size calculation
- * we use @ua and @entries natural alignment to allow IOMMU pages
- * smaller than huge pages but still bigger than PAGE_SIZE.
- */
- mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
- mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
- if (!mem->hpas) {
- kfree(mem);
- ret = -ENOMEM;
- goto unlock_exit;
- }
-
- down_read(&mm->mmap_sem);
- ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
- up_read(&mm->mmap_sem);
- if (ret != entries) {
- /* free the reference taken */
- for (i = 0; i < ret; i++)
- put_page(mem->hpages[i]);
-
- vfree(mem->hpas);
- kfree(mem);
- ret = -EFAULT;
- goto unlock_exit;
- }
-
- pageshift = PAGE_SHIFT;
- for (i = 0; i < entries; ++i) {
- struct page *page = mem->hpages[i];
-
- /*
- * Allow to use larger than 64k IOMMU pages. Only do that
- * if we are backed by hugetlb.
- */
- if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
- struct page *head = compound_head(page);
-
- pageshift = compound_order(head) + PAGE_SHIFT;
- }
- mem->pageshift = min(mem->pageshift, pageshift);
- /*
- * We don't need struct page reference any more, switch
- * to physical address.
- */
- mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
- }
-
-good_exit:
- ret = 0;
- atomic64_set(&mem->mapped, 1);
- mem->used = 1;
- mem->ua = ua;
- mem->entries = entries;
- *pmem = mem;
-
- list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
-
-unlock_exit:
- if (locked_entries && ret)
- mm_iommu_adjust_locked_vm(mm, locked_entries, false);
-
- mutex_unlock(&mem_list_mutex);
-
- return ret;
-}
-
-long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
- struct mm_iommu_table_group_mem_t **pmem)
-{
- return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
- pmem);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_new);
-
-long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
- unsigned long entries, unsigned long dev_hpa,
- struct mm_iommu_table_group_mem_t **pmem)
-{
- return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_newdev);
-
-static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
-{
- long i;
- struct page *page = NULL;
-
- if (!mem->hpas)
- return;
-
- for (i = 0; i < mem->entries; ++i) {
- if (!mem->hpas[i])
- continue;
-
- page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
- if (!page)
- continue;
-
- if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
- SetPageDirty(page);
-
- put_page(page);
- mem->hpas[i] = 0;
- }
-}
-
-static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
-{
-
- mm_iommu_unpin(mem);
- vfree(mem->hpas);
- kfree(mem);
-}
-
-static void mm_iommu_free(struct rcu_head *head)
-{
- struct mm_iommu_table_group_mem_t *mem = container_of(head,
- struct mm_iommu_table_group_mem_t, rcu);
-
- mm_iommu_do_free(mem);
-}
-
-static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
-{
- list_del_rcu(&mem->next);
- call_rcu(&mem->rcu, mm_iommu_free);
-}
-
-long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
-{
- long ret = 0;
- unsigned long entries, dev_hpa;
-
- mutex_lock(&mem_list_mutex);
-
- if (mem->used == 0) {
- ret = -ENOENT;
- goto unlock_exit;
- }
-
- --mem->used;
- /* There are still users, exit */
- if (mem->used)
- goto unlock_exit;
-
- /* Are there still mappings? */
- if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
- ++mem->used;
- ret = -EBUSY;
- goto unlock_exit;
- }
-
- /* @mapped became 0 so now mappings are disabled, release the region */
- entries = mem->entries;
- dev_hpa = mem->dev_hpa;
- mm_iommu_release(mem);
-
- if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
- mm_iommu_adjust_locked_vm(mm, entries, false);
-
-unlock_exit:
- mutex_unlock(&mem_list_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_put);
-
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
- unsigned long ua, unsigned long size)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
- if ((mem->ua <= ua) &&
- (ua + size <= mem->ua +
- (mem->entries << PAGE_SHIFT))) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_lookup);
-
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
- unsigned long ua, unsigned long size)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
- next) {
- if ((mem->ua <= ua) &&
- (ua + size <= mem->ua +
- (mem->entries << PAGE_SHIFT))) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-
-struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
- unsigned long ua, unsigned long entries)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- mutex_lock(&mem_list_mutex);
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
- if ((mem->ua == ua) && (mem->entries == entries)) {
- ret = mem;
- ++mem->used;
- break;
- }
- }
-
- mutex_unlock(&mem_list_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_get);
-
-long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
- const long entry = (ua - mem->ua) >> PAGE_SHIFT;
- u64 *va;
-
- if (entry >= mem->entries)
- return -EFAULT;
-
- if (pageshift > mem->pageshift)
- return -EFAULT;
-
- if (!mem->hpas) {
- *hpa = mem->dev_hpa + (ua - mem->ua);
- return 0;
- }
-
- va = &mem->hpas[entry];
- *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
-
-long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
- const long entry = (ua - mem->ua) >> PAGE_SHIFT;
- unsigned long *pa;
-
- if (entry >= mem->entries)
- return -EFAULT;
-
- if (pageshift > mem->pageshift)
- return -EFAULT;
-
- if (!mem->hpas) {
- *hpa = mem->dev_hpa + (ua - mem->ua);
- return 0;
- }
-
- pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
- if (!pa)
- return -EFAULT;
-
- *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
- return 0;
-}
-
-extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
-{
- struct mm_iommu_table_group_mem_t *mem;
- long entry;
- void *va;
- unsigned long *pa;
-
- mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
- if (!mem)
- return;
-
- if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
- return;
-
- entry = (ua - mem->ua) >> PAGE_SHIFT;
- va = &mem->hpas[entry];
-
- pa = (void *) vmalloc_to_phys(va);
- if (!pa)
- return;
-
- *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
-}
-
-bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
- unsigned int pageshift, unsigned long *size)
-{
- struct mm_iommu_table_group_mem_t *mem;
- unsigned long end;
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
- if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
- continue;
-
- end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
- if ((mem->dev_hpa <= hpa) && (hpa < end)) {
- /*
- * Since the IOMMU page size might be bigger than
- * PAGE_SIZE, the amount of preregistered memory
- * starting from @hpa might be smaller than 1<<pageshift
- * and the caller needs to distinguish this situation.
- */
- *size = min(1UL << pageshift, end - hpa);
- return true;
- }
- }
-
- return false;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
-
-long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
-{
- if (atomic64_inc_not_zero(&mem->mapped))
- return 0;
-
- /* Last mm_iommu_put() has been called, no more mappings allowed() */
- return -ENXIO;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
-
-void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
-{
- atomic64_add_unless(&mem->mapped, -1, 1);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
-
-void mm_iommu_init(struct mm_struct *mm)
-{
- INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
-}
/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
-#include "vphn.h"
+#include "book3s64/vphn.h"
struct topology_update_data {
struct topology_update_data *next;
+++ /dev/null
-/*
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/sched.h>
-#include <linux/mm_types.h>
-#include <linux/memblock.h>
-#include <misc/cxl-base.h>
-
-#include <asm/pgalloc.h>
-#include <asm/tlb.h>
-#include <asm/trace.h>
-#include <asm/powernv.h>
-
-#include <mm/mmu_decl.h>
-#include <trace/events/thp.h>
-
-unsigned long __pmd_frag_nr;
-EXPORT_SYMBOL(__pmd_frag_nr);
-unsigned long __pmd_frag_size_shift;
-EXPORT_SYMBOL(__pmd_frag_size_shift);
-
-int (*register_process_table)(unsigned long base, unsigned long page_size,
- unsigned long tbl_size);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * This is called when relaxing access to a hugepage. It's also called in the page
- * fault path when we don't hit any of the major fault cases, ie, a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us, we additionally deal with missing execute
- * permission here on some processors
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp, pmd_t entry, int dirty)
-{
- int changed;
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
-#endif
- changed = !pmd_same(*(pmdp), entry);
- if (changed) {
- /*
- * We can use MMU_PAGE_2M here, because only radix
- * path look at the psize.
- */
- __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
- pmd_pte(entry), address, MMU_PAGE_2M);
- }
- return changed;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-/*
- * set a new huge pmd. We should not be called for updating
- * an existing pmd entry. That should go via pmd_hugepage_update.
- */
-void set_pmd_at(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd)
-{
-#ifdef CONFIG_DEBUG_VM
- /*
- * Make sure hardware valid bit is not set. We don't do
- * tlb flush for this update.
- */
-
- WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
- assert_spin_locked(pmd_lockptr(mm, pmdp));
- WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd)));
-#endif
- trace_hugepage_set_pmd(addr, pmd_val(pmd));
- return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
-}
-
-static void do_nothing(void *unused)
-{
-
-}
-/*
- * Serialize against find_current_mm_pte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
- */
-void serialize_against_pte_lookup(struct mm_struct *mm)
-{
- smp_mb();
- smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
-}
-
-/*
- * We use this to invalidate a pmdp entry before switching from a
- * hugepte to regular pmd entry.
- */
-pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- unsigned long old_pmd;
-
- old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- /*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- */
- serialize_against_pte_lookup(vma->vm_mm);
- return __pmd(old_pmd);
-}
-
-static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
-{
- return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
-}
-
-pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
-{
- unsigned long pmdv;
-
- pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
- return pmd_set_protbits(__pmd(pmdv), pgprot);
-}
-
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
-{
- return pfn_pmd(page_to_pfn(page), pgprot);
-}
-
-pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
- unsigned long pmdv;
-
- pmdv = pmd_val(pmd);
- pmdv &= _HPAGE_CHG_MASK;
- return pmd_set_protbits(__pmd(pmdv), newprot);
-}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
- if (radix_enabled())
- prefetch((void *)addr);
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-/* For use by kexec */
-void mmu_cleanup_all(void)
-{
- if (radix_enabled())
- radix__mmu_cleanup_all();
- else if (mmu_hash_ops.hpte_clear_all)
- mmu_hash_ops.hpte_clear_all();
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
- if (radix_enabled())
- return radix__create_section_mapping(start, end, nid);
-
- return hash__create_section_mapping(start, end, nid);
-}
-
-int __meminit remove_section_mapping(unsigned long start, unsigned long end)
-{
- if (radix_enabled())
- return radix__remove_section_mapping(start, end);
-
- return hash__remove_section_mapping(start, end);
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-void __init mmu_partition_table_init(void)
-{
- unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
- unsigned long ptcr;
-
- BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
- /* Initialize the Partition Table with no entries */
- partition_tb = memblock_alloc(patb_size, patb_size);
- if (!partition_tb)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, patb_size, patb_size);
-
- /*
- * update partition table control register,
- * 64 K size.
- */
- ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
- mtspr(SPRN_PTCR, ptcr);
- powernv_set_nmmu_ptcr(ptcr);
-}
-
-void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
- unsigned long dw1)
-{
- unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
-
- partition_tb[lpid].patb0 = cpu_to_be64(dw0);
- partition_tb[lpid].patb1 = cpu_to_be64(dw1);
-
- /*
- * Global flush of TLBs and partition table caches for this lpid.
- * The type of flush (hash or radix) depends on what the previous
- * use of this partition ID was, not the new use.
- */
- asm volatile("ptesync" : : : "memory");
- if (old & PATB_HR) {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
- } else {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
- }
- /* do we need fixup here ?*/
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
-
-static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
-{
- void *pmd_frag, *ret;
-
- if (PMD_FRAG_NR == 1)
- return NULL;
-
- spin_lock(&mm->page_table_lock);
- ret = mm->context.pmd_frag;
- if (ret) {
- pmd_frag = ret + PMD_FRAG_SIZE;
- /*
- * If we have taken up all the fragments mark PTE page NULL
- */
- if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
- pmd_frag = NULL;
- mm->context.pmd_frag = pmd_frag;
- }
- spin_unlock(&mm->page_table_lock);
- return (pmd_t *)ret;
-}
-
-static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
-{
- void *ret = NULL;
- struct page *page;
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
-
- if (mm == &init_mm)
- gfp &= ~__GFP_ACCOUNT;
- page = alloc_page(gfp);
- if (!page)
- return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
- return NULL;
- }
-
- atomic_set(&page->pt_frag_refcount, 1);
-
- ret = page_address(page);
- /*
- * if we support only one fragment just return the
- * allocated page.
- */
- if (PMD_FRAG_NR == 1)
- return ret;
-
- spin_lock(&mm->page_table_lock);
- /*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
- * count.
- */
- if (likely(!mm->context.pmd_frag)) {
- atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
- mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
- }
- spin_unlock(&mm->page_table_lock);
-
- return (pmd_t *)ret;
-}
-
-pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
-{
- pmd_t *pmd;
-
- pmd = get_pmd_from_cache(mm);
- if (pmd)
- return pmd;
-
- return __alloc_for_pmdcache(mm);
-}
-
-void pmd_fragment_free(unsigned long *pmd)
-{
- struct page *page = virt_to_page(pmd);
-
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
- }
-}
-
-static inline void pgtable_free(void *table, int index)
-{
- switch (index) {
- case PTE_INDEX:
- pte_fragment_free(table, 0);
- break;
- case PMD_INDEX:
- pmd_fragment_free(table);
- break;
- case PUD_INDEX:
- kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
- break;
-#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
- /* 16M hugepd directory at pud level */
- case HTLB_16M_INDEX:
- BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
- kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
- break;
- /* 16G hugepd directory at the pgd level */
- case HTLB_16G_INDEX:
- BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
- kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
- break;
-#endif
- /* We don't free pgd table via RCU callback */
- default:
- BUG();
- }
-}
-
-#ifdef CONFIG_SMP
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
-{
- unsigned long pgf = (unsigned long)table;
-
- BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
- pgf |= index;
- tlb_remove_table(tlb, (void *)pgf);
-}
-
-void __tlb_remove_table(void *_table)
-{
- void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
- unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
- return pgtable_free(table, index);
-}
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
-{
- return pgtable_free(table, index);
-}
-#endif
-
-#ifdef CONFIG_PROC_FS
-atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
-
-void arch_report_meminfo(struct seq_file *m)
-{
- /*
- * Hash maps the memory with one size mmu_linear_psize.
- * So don't bother to print these on hash
- */
- if (!radix_enabled())
- return;
- seq_printf(m, "DirectMap4k: %8lu kB\n",
- atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
- seq_printf(m, "DirectMap64k: %8lu kB\n",
- atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
- seq_printf(m, "DirectMap2M: %8lu kB\n",
- atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
- seq_printf(m, "DirectMap1G: %8lu kB\n",
- atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
-}
-#endif /* CONFIG_PROC_FS */
-
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep)
-{
- unsigned long pte_val;
-
- /*
- * Clear the _PAGE_PRESENT so that no hardware parallel update is
- * possible. Also keep the pte_present true so that we don't take
- * wrong fault.
- */
- pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
-
- return __pte(pte_val);
-
-}
-
-void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t old_pte, pte_t pte)
-{
- if (radix_enabled())
- return radix__ptep_modify_prot_commit(vma, addr,
- ptep, old_pte, pte);
- set_pte_at(vma->vm_mm, addr, ptep, pte);
-}
-
-/*
- * For hash translation mode, we use the deposited table to store hash slot
- * information and they are stored at PTRS_PER_PMD offset from related pmd
- * location. Hence a pmd move requires deposit and withdraw.
- *
- * For radix translation with split pmd ptl, we store the deposited table in the
- * pmd page. Hence if we have different pmd page we need to withdraw during pmd
- * move.
- *
- * With hash we use deposited table always irrespective of anon or not.
- * With radix we use deposited table only for anonymous mapping.
- */
-int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
- struct spinlock *old_pmd_ptl,
- struct vm_area_struct *vma)
-{
- if (radix_enabled())
- return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
-
- return true;
-}
+++ /dev/null
-/*
- * Copyright 2005, Paul Mackerras, IBM Corporation.
- * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/sched.h>
-#include <linux/mm_types.h>
-#include <linux/mm.h>
-
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/sections.h>
-#include <asm/mmu.h>
-#include <asm/tlb.h>
-
-#include <mm/mmu_decl.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/thp.h>
-
-#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE))
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-/*
- * vmemmap is the starting address of the virtual address space where
- * struct pages are allocated for all possible PFNs present on the system
- * including holes and bad memory (hence sparse). These virtual struct
- * pages are stored in sequence in this virtual address space irrespective
- * of the fact whether the corresponding PFN is valid or not. This achieves
- * constant relationship between address of struct page and its PFN.
- *
- * During boot or memory hotplug operation when a new memory section is
- * added, physical memory allocation (including hash table bolting) will
- * be performed for the set of struct pages which are part of the memory
- * section. This saves memory by not allocating struct pages for PFNs
- * which are not valid.
- *
- * ----------------------------------------------
- * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
- * ----------------------------------------------
- *
- * f000000000000000 c000000000000000
- * vmemmap +--------------+ +--------------+
- * + | page struct | +--------------> | page struct |
- * | +--------------+ +--------------+
- * | | page struct | +--------------> | page struct |
- * | +--------------+ | +--------------+
- * | | page struct | + +------> | page struct |
- * | +--------------+ | +--------------+
- * | | page struct | | +--> | page struct |
- * | +--------------+ | | +--------------+
- * | | page struct | | |
- * | +--------------+ | |
- * | | page struct | | |
- * | +--------------+ | |
- * | | page struct | | |
- * | +--------------+ | |
- * | | page struct | | |
- * | +--------------+ | |
- * | | page struct | +-------+ |
- * | +--------------+ |
- * | | page struct | +-----------+
- * | +--------------+
- * | | page struct | No mapping
- * | +--------------+
- * | | page struct | No mapping
- * v +--------------+
- *
- * -----------------------------------------
- * | RELATION BETWEEN STRUCT PAGES AND PFNS|
- * -----------------------------------------
- *
- * vmemmap +--------------+ +---------------+
- * + | page struct | +-------------> | PFN |
- * | +--------------+ +---------------+
- * | | page struct | +-------------> | PFN |
- * | +--------------+ +---------------+
- * | | page struct | +-------------> | PFN |
- * | +--------------+ +---------------+
- * | | page struct | +-------------> | PFN |
- * | +--------------+ +---------------+
- * | | |
- * | +--------------+
- * | | |
- * | +--------------+
- * | | |
- * | +--------------+ +---------------+
- * | | page struct | +-------------> | PFN |
- * | +--------------+ +---------------+
- * | | |
- * | +--------------+
- * | | |
- * | +--------------+ +---------------+
- * | | page struct | +-------------> | PFN |
- * | +--------------+ +---------------+
- * | | page struct | +-------------> | PFN |
- * v +--------------+ +---------------+
- */
-/*
- * On hash-based CPUs, the vmemmap is bolted in the hash table.
- *
- */
-int __meminit hash__vmemmap_create_mapping(unsigned long start,
- unsigned long page_size,
- unsigned long phys)
-{
- int rc;
-
- if ((start + page_size) >= H_VMEMMAP_END) {
- pr_warn("Outside the supported range\n");
- return -1;
- }
-
- rc = htab_bolt_mapping(start, start + page_size, phys,
- pgprot_val(PAGE_KERNEL),
- mmu_vmemmap_psize, mmu_kernel_ssize);
- if (rc < 0) {
- int rc2 = htab_remove_mapping(start, start + page_size,
- mmu_vmemmap_psize,
- mmu_kernel_ssize);
- BUG_ON(rc2 && (rc2 != -ENOENT));
- }
- return rc;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-void hash__vmemmap_remove_mapping(unsigned long start,
- unsigned long page_size)
-{
- int rc = htab_remove_mapping(start, start + page_size,
- mmu_vmemmap_psize,
- mmu_kernel_ssize);
- BUG_ON((rc < 0) && (rc != -ENOENT));
- WARN_ON(rc == -ENOENT);
-}
-#endif
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
-/*
- * map_kernel_page currently only called by __ioremap
- * map_kernel_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
- */
-int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
-{
- pgd_t *pgdp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
- if (slab_is_available()) {
- pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
- if (!pudp)
- return -ENOMEM;
- pmdp = pmd_alloc(&init_mm, pudp, ea);
- if (!pmdp)
- return -ENOMEM;
- ptep = pte_alloc_kernel(pmdp, ea);
- if (!ptep)
- return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
- } else {
- /*
- * If the mm subsystem is not fully up, we cannot create a
- * linux page table entry for this mapping. Simply bolt an
- * entry in the hardware page table.
- *
- */
- if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
- mmu_io_psize, mmu_kernel_ssize)) {
- printk(KERN_ERR "Failed to do bolted mapping IO "
- "memory at %016lx !\n", pa);
- return -ENOMEM;
- }
- }
-
- smp_wmb();
- return 0;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, unsigned long clr,
- unsigned long set)
-{
- __be64 old_be, tmp;
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(pmd_lockptr(mm, pmdp));
-#endif
-
- __asm__ __volatile__(
- "1: ldarx %0,0,%3\n\
- and. %1,%0,%6\n\
- bne- 1b \n\
- andc %1,%0,%4 \n\
- or %1,%1,%7\n\
- stdcx. %1,0,%3 \n\
- bne- 1b"
- : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
- : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
- "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
- : "cc" );
-
- old = be64_to_cpu(old_be);
-
- trace_hugepage_update(addr, old, clr, set);
- if (old & H_PAGE_HASHPTE)
- hpte_do_hugepage_flush(mm, addr, pmdp, old);
- return old;
-}
-
-pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(pmd_trans_huge(*pmdp));
- VM_BUG_ON(pmd_devmap(*pmdp));
-
- pmd = *pmdp;
- pmd_clear(pmdp);
- /*
- * Wait for all pending hash_page to finish. This is needed
- * in case of subpage collapse. When we collapse normal pages
- * to hugepage, we first clear the pmd, then invalidate all
- * the PTE entries. The assumption here is that any low level
- * page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
- * hash_page with local ptep pointer value. Such a hash page
- * can result in adding new HPTE entries for normal subpages.
- * That means we could be modifying the page content as we
- * copy them to a huge page. So wait for parallel hash_page
- * to finish before invalidating HPTE entries. We can do this
- * by sending an IPI to all the cpus and executing a dummy
- * function there.
- */
- serialize_against_pte_lookup(vma->vm_mm);
- /*
- * Now invalidate the hpte entries in the range
- * covered by pmd. This make sure we take a
- * fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
- * hence wait for collapse to complete. Without this
- * the __collapse_huge_page_copy can result in copying
- * the old content.
- */
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
- return pmd;
-}
-
-/*
- * We want to put the pgtable in pmd and use pgtable for tracking
- * the base page size hptes
- */
-void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
-{
- pgtable_t *pgtable_slot;
-
- assert_spin_locked(pmd_lockptr(mm, pmdp));
- /*
- * we store the pgtable in the second half of PMD
- */
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- *pgtable_slot = pgtable;
- /*
- * expose the deposited pgtable to other cpus.
- * before we set the hugepage PTE at pmd level
- * hash fault code looks at the deposted pgtable
- * to store hash index values.
- */
- smp_wmb();
-}
-
-pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
- pgtable_t pgtable;
- pgtable_t *pgtable_slot;
-
- assert_spin_locked(pmd_lockptr(mm, pmdp));
-
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- pgtable = *pgtable_slot;
- /*
- * Once we withdraw, mark the entry NULL.
- */
- *pgtable_slot = NULL;
- /*
- * We store HPTE information in the deposited PTE fragment.
- * zero out the content on withdraw.
- */
- memset(pgtable, 0, PTE_FRAG_SIZE);
- return pgtable;
-}
-
-/*
- * A linux hugepage PMD was changed and the corresponding hash table entries
- * neesd to be flushed.
- */
-void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, unsigned long old_pmd)
-{
- int ssize;
- unsigned int psize;
- unsigned long vsid;
- unsigned long flags = 0;
-
- /* get the base page size,vsid and segment size */
-#ifdef CONFIG_DEBUG_VM
- psize = get_slice_psize(mm, addr);
- BUG_ON(psize == MMU_PAGE_16M);
-#endif
- if (old_pmd & H_PAGE_COMBO)
- psize = MMU_PAGE_4K;
- else
- psize = MMU_PAGE_64K;
-
- if (!is_kernel_addr(addr)) {
- ssize = user_segment_size(addr);
- vsid = get_user_vsid(&mm->context, addr, ssize);
- WARN_ON(vsid == 0);
- } else {
- vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
- ssize = mmu_kernel_ssize;
- }
-
- if (mm_is_thread_local(mm))
- flags |= HPTE_LOCAL_UPDATE;
-
- return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
-}
-
-pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t old_pmd;
- pgtable_t pgtable;
- unsigned long old;
- pgtable_t *pgtable_slot;
-
- old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
- old_pmd = __pmd(old);
- /*
- * We have pmd == none and we are holding page_table_lock.
- * So we can safely go and clear the pgtable hash
- * index info.
- */
- pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
- pgtable = *pgtable_slot;
- /*
- * Let's zero out old valid and hash index details
- * hash fault look at them.
- */
- memset(pgtable, 0, PTE_FRAG_SIZE);
- /*
- * Serialize against find_current_mm_pte variants which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_curren_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
- return old_pmd;
-}
-
-int hash__has_transparent_hugepage(void)
-{
-
- if (!mmu_has_feature(MMU_FTR_16M_PAGE))
- return 0;
- /*
- * We support THP only if PMD_SIZE is 16MB.
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
- return 0;
- /*
- * We need to make sure that we support 16MB hugepage in a segement
- * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
- * of 64K.
- */
- /*
- * If we have 64K HPTE, we will be using that by default
- */
- if (mmu_psize_defs[MMU_PAGE_64K].shift &&
- (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
- return 0;
- /*
- * Ok we only have 4K HPTE
- */
- if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
- return 0;
-
- return 1;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-#ifdef CONFIG_STRICT_KERNEL_RWX
-static bool hash__change_memory_range(unsigned long start, unsigned long end,
- unsigned long newpp)
-{
- unsigned long idx;
- unsigned int step, shift;
-
- shift = mmu_psize_defs[mmu_linear_psize].shift;
- step = 1 << shift;
-
- start = ALIGN_DOWN(start, step);
- end = ALIGN(end, step); // aligns up
-
- if (start >= end)
- return false;
-
- pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
- start, end, newpp, step);
-
- for (idx = start; idx < end; idx += step)
- /* Not sure if we can do much with the return value */
- mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
- mmu_kernel_ssize);
-
- return true;
-}
-
-void hash__mark_rodata_ro(void)
-{
- unsigned long start, end;
-
- start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
-
- WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
-}
-
-void hash__mark_initmem_nx(void)
-{
- unsigned long start, end, pp;
-
- start = (unsigned long)__init_begin;
- end = (unsigned long)__init_end;
-
- pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
-
- WARN_ON(!hash__change_memory_range(start, end, pp));
-}
-#endif
+++ /dev/null
-/*
- * Page table handling routines for radix page table.
- *
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) "radix-mmu: " fmt
-
-#include <linux/kernel.h>
-#include <linux/sched/mm.h>
-#include <linux/memblock.h>
-#include <linux/of_fdt.h>
-#include <linux/mm.h>
-#include <linux/string_helpers.h>
-#include <linux/stop_machine.h>
-
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-#include <asm/dma.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-#include <asm/firmware.h>
-#include <asm/powernv.h>
-#include <asm/sections.h>
-#include <asm/trace.h>
-#include <asm/uaccess.h>
-
-#include <trace/events/thp.h>
-
-unsigned int mmu_pid_bits;
-unsigned int mmu_base_pid;
-
-static int native_register_process_table(unsigned long base, unsigned long pg_sz,
- unsigned long table_size)
-{
- unsigned long patb0, patb1;
-
- patb0 = be64_to_cpu(partition_tb[0].patb0);
- patb1 = base | table_size | PATB_GR;
-
- mmu_partition_table_set_entry(0, patb0, patb1);
-
- return 0;
-}
-
-static __ref void *early_alloc_pgtable(unsigned long size, int nid,
- unsigned long region_start, unsigned long region_end)
-{
- phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
- phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
- void *ptr;
-
- if (region_start)
- min_addr = region_start;
- if (region_end)
- max_addr = region_end;
-
- ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
-
- if (!ptr)
- panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
- __func__, size, size, nid, &min_addr, &max_addr);
-
- return ptr;
-}
-
-static int early_map_kernel_page(unsigned long ea, unsigned long pa,
- pgprot_t flags,
- unsigned int map_page_size,
- int nid,
- unsigned long region_start, unsigned long region_end)
-{
- unsigned long pfn = pa >> PAGE_SHIFT;
- pgd_t *pgdp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- pgdp = pgd_offset_k(ea);
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
- region_start, region_end);
- pgd_populate(&init_mm, pgdp, pudp);
- }
- pudp = pud_offset(pgdp, ea);
- if (map_page_size == PUD_SIZE) {
- ptep = (pte_t *)pudp;
- goto set_the_pte;
- }
- if (pud_none(*pudp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
- region_start, region_end);
- pud_populate(&init_mm, pudp, pmdp);
- }
- pmdp = pmd_offset(pudp, ea);
- if (map_page_size == PMD_SIZE) {
- ptep = pmdp_ptep(pmdp);
- goto set_the_pte;
- }
- if (!pmd_present(*pmdp)) {
- ptep = early_alloc_pgtable(PAGE_SIZE, nid,
- region_start, region_end);
- pmd_populate_kernel(&init_mm, pmdp, ptep);
- }
- ptep = pte_offset_kernel(pmdp, ea);
-
-set_the_pte:
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
- smp_wmb();
- return 0;
-}
-
-/*
- * nid, region_start, and region_end are hints to try to place the page
- * table memory in the same node or region.
- */
-static int __map_kernel_page(unsigned long ea, unsigned long pa,
- pgprot_t flags,
- unsigned int map_page_size,
- int nid,
- unsigned long region_start, unsigned long region_end)
-{
- unsigned long pfn = pa >> PAGE_SHIFT;
- pgd_t *pgdp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
- /*
- * Make sure task size is correct as per the max adddr
- */
- BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
-
-#ifdef CONFIG_PPC_64K_PAGES
- BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
-#endif
-
- if (unlikely(!slab_is_available()))
- return early_map_kernel_page(ea, pa, flags, map_page_size,
- nid, region_start, region_end);
-
- /*
- * Should make page table allocation functions be able to take a
- * node, so we can place kernel page tables on the right nodes after
- * boot.
- */
- pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
- if (!pudp)
- return -ENOMEM;
- if (map_page_size == PUD_SIZE) {
- ptep = (pte_t *)pudp;
- goto set_the_pte;
- }
- pmdp = pmd_alloc(&init_mm, pudp, ea);
- if (!pmdp)
- return -ENOMEM;
- if (map_page_size == PMD_SIZE) {
- ptep = pmdp_ptep(pmdp);
- goto set_the_pte;
- }
- ptep = pte_alloc_kernel(pmdp, ea);
- if (!ptep)
- return -ENOMEM;
-
-set_the_pte:
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
- smp_wmb();
- return 0;
-}
-
-int radix__map_kernel_page(unsigned long ea, unsigned long pa,
- pgprot_t flags,
- unsigned int map_page_size)
-{
- return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
-}
-
-#ifdef CONFIG_STRICT_KERNEL_RWX
-void radix__change_memory_range(unsigned long start, unsigned long end,
- unsigned long clear)
-{
- unsigned long idx;
- pgd_t *pgdp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- start = ALIGN_DOWN(start, PAGE_SIZE);
- end = PAGE_ALIGN(end); // aligns up
-
- pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
- start, end, clear);
-
- for (idx = start; idx < end; idx += PAGE_SIZE) {
- pgdp = pgd_offset_k(idx);
- pudp = pud_alloc(&init_mm, pgdp, idx);
- if (!pudp)
- continue;
- if (pud_huge(*pudp)) {
- ptep = (pte_t *)pudp;
- goto update_the_pte;
- }
- pmdp = pmd_alloc(&init_mm, pudp, idx);
- if (!pmdp)
- continue;
- if (pmd_huge(*pmdp)) {
- ptep = pmdp_ptep(pmdp);
- goto update_the_pte;
- }
- ptep = pte_alloc_kernel(pmdp, idx);
- if (!ptep)
- continue;
-update_the_pte:
- radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
- }
-
- radix__flush_tlb_kernel_range(start, end);
-}
-
-void radix__mark_rodata_ro(void)
-{
- unsigned long start, end;
-
- start = (unsigned long)_stext;
- end = (unsigned long)__init_begin;
-
- radix__change_memory_range(start, end, _PAGE_WRITE);
-}
-
-void radix__mark_initmem_nx(void)
-{
- unsigned long start = (unsigned long)__init_begin;
- unsigned long end = (unsigned long)__init_end;
-
- radix__change_memory_range(start, end, _PAGE_EXEC);
-}
-#endif /* CONFIG_STRICT_KERNEL_RWX */
-
-static inline void __meminit
-print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
-{
- char buf[10];
-
- if (end <= start)
- return;
-
- string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
-
- pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
- exec ? " (exec)" : "");
-}
-
-static unsigned long next_boundary(unsigned long addr, unsigned long end)
-{
-#ifdef CONFIG_STRICT_KERNEL_RWX
- if (addr < __pa_symbol(__init_begin))
- return __pa_symbol(__init_begin);
-#endif
- return end;
-}
-
-static int __meminit create_physical_mapping(unsigned long start,
- unsigned long end,
- int nid)
-{
- unsigned long vaddr, addr, mapping_size = 0;
- bool prev_exec, exec = false;
- pgprot_t prot;
- int psize;
-
- start = _ALIGN_UP(start, PAGE_SIZE);
- for (addr = start; addr < end; addr += mapping_size) {
- unsigned long gap, previous_size;
- int rc;
-
- gap = next_boundary(addr, end) - addr;
- previous_size = mapping_size;
- prev_exec = exec;
-
- if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
- mmu_psize_defs[MMU_PAGE_1G].shift) {
- mapping_size = PUD_SIZE;
- psize = MMU_PAGE_1G;
- } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
- mmu_psize_defs[MMU_PAGE_2M].shift) {
- mapping_size = PMD_SIZE;
- psize = MMU_PAGE_2M;
- } else {
- mapping_size = PAGE_SIZE;
- psize = mmu_virtual_psize;
- }
-
- vaddr = (unsigned long)__va(addr);
-
- if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
- overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
- prot = PAGE_KERNEL_X;
- exec = true;
- } else {
- prot = PAGE_KERNEL;
- exec = false;
- }
-
- if (mapping_size != previous_size || exec != prev_exec) {
- print_mapping(start, addr, previous_size, prev_exec);
- start = addr;
- }
-
- rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
- if (rc)
- return rc;
-
- update_page_count(psize, 1);
- }
-
- print_mapping(start, addr, mapping_size, exec);
- return 0;
-}
-
-void __init radix_init_pgtable(void)
-{
- unsigned long rts_field;
- struct memblock_region *reg;
-
- /* We don't support slb for radix */
- mmu_slb_size = 0;
- /*
- * Create the linear mapping, using standard page size for now
- */
- for_each_memblock(memory, reg) {
- /*
- * The memblock allocator is up at this point, so the
- * page tables will be allocated within the range. No
- * need or a node (which we don't have yet).
- */
-
- if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
- pr_warn("Outside the supported range\n");
- continue;
- }
-
- WARN_ON(create_physical_mapping(reg->base,
- reg->base + reg->size,
- -1));
- }
-
- /* Find out how many PID bits are supported */
- if (cpu_has_feature(CPU_FTR_HVMODE)) {
- if (!mmu_pid_bits)
- mmu_pid_bits = 20;
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- /*
- * When KVM is possible, we only use the top half of the
- * PID space to avoid collisions between host and guest PIDs
- * which can cause problems due to prefetch when exiting the
- * guest with AIL=3
- */
- mmu_base_pid = 1 << (mmu_pid_bits - 1);
-#else
- mmu_base_pid = 1;
-#endif
- } else {
- /* The guest uses the bottom half of the PID space */
- if (!mmu_pid_bits)
- mmu_pid_bits = 19;
- mmu_base_pid = 1;
- }
-
- /*
- * Allocate Partition table and process table for the
- * host.
- */
- BUG_ON(PRTB_SIZE_SHIFT > 36);
- process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
- /*
- * Fill in the process table.
- */
- rts_field = radix__get_tree_size();
- process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
- /*
- * Fill in the partition table. We are suppose to use effective address
- * of process table here. But our linear mapping also enable us to use
- * physical address here.
- */
- register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
- pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
- asm volatile("ptesync" : : : "memory");
- asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
- trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
-
- /*
- * The init_mm context is given the first available (non-zero) PID,
- * which is the "guard PID" and contains no page table. PIDR should
- * never be set to zero because that duplicates the kernel address
- * space at the 0x0... offset (quadrant 0)!
- *
- * An arbitrary PID that may later be allocated by the PID allocator
- * for userspace processes must not be used either, because that
- * would cause stale user mappings for that PID on CPUs outside of
- * the TLB invalidation scheme (because it won't be in mm_cpumask).
- *
- * So permanently carve out one PID for the purpose of a guard PID.
- */
- init_mm.context.id = mmu_base_pid;
- mmu_base_pid++;
-}
-
-static void __init radix_init_partition_table(void)
-{
- unsigned long rts_field, dw0;
-
- mmu_partition_table_init();
- rts_field = radix__get_tree_size();
- dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
- mmu_partition_table_set_entry(0, dw0, 0);
-
- pr_info("Initializing Radix MMU\n");
- pr_info("Partition table %p\n", partition_tb);
-}
-
-void __init radix_init_native(void)
-{
- register_process_table = native_register_process_table;
-}
-
-static int __init get_idx_from_shift(unsigned int shift)
-{
- int idx = -1;
-
- switch (shift) {
- case 0xc:
- idx = MMU_PAGE_4K;
- break;
- case 0x10:
- idx = MMU_PAGE_64K;
- break;
- case 0x15:
- idx = MMU_PAGE_2M;
- break;
- case 0x1e:
- idx = MMU_PAGE_1G;
- break;
- }
- return idx;
-}
-
-static int __init radix_dt_scan_page_sizes(unsigned long node,
- const char *uname, int depth,
- void *data)
-{
- int size = 0;
- int shift, idx;
- unsigned int ap;
- const __be32 *prop;
- const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
-
- /* We are scanning "cpu" nodes only */
- if (type == NULL || strcmp(type, "cpu") != 0)
- return 0;
-
- /* Find MMU PID size */
- prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
- if (prop && size == 4)
- mmu_pid_bits = be32_to_cpup(prop);
-
- /* Grab page size encodings */
- prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
- if (!prop)
- return 0;
-
- pr_info("Page sizes from device-tree:\n");
- for (; size >= 4; size -= 4, ++prop) {
-
- struct mmu_psize_def *def;
-
- /* top 3 bit is AP encoding */
- shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
- ap = be32_to_cpu(prop[0]) >> 29;
- pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
-
- idx = get_idx_from_shift(shift);
- if (idx < 0)
- continue;
-
- def = &mmu_psize_defs[idx];
- def->shift = shift;
- def->ap = ap;
- }
-
- /* needed ? */
- cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
- return 1;
-}
-
-void __init radix__early_init_devtree(void)
-{
- int rc;
-
- /*
- * Try to find the available page sizes in the device-tree
- */
- rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
- if (rc != 0) /* Found */
- goto found;
- /*
- * let's assume we have page 4k and 64k support
- */
- mmu_psize_defs[MMU_PAGE_4K].shift = 12;
- mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
-
- mmu_psize_defs[MMU_PAGE_64K].shift = 16;
- mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
-found:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
- return;
-}
-
-static void radix_init_amor(void)
-{
- /*
- * In HV mode, we init AMOR (Authority Mask Override Register) so that
- * the hypervisor and guest can setup IAMR (Instruction Authority Mask
- * Register), enable key 0 and set it to 1.
- *
- * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
- */
- mtspr(SPRN_AMOR, (3ul << 62));
-}
-
-#ifdef CONFIG_PPC_KUEP
-void setup_kuep(bool disabled)
-{
- if (disabled || !early_radix_enabled())
- return;
-
- if (smp_processor_id() == boot_cpuid)
- pr_info("Activating Kernel Userspace Execution Prevention\n");
-
- /*
- * Radix always uses key0 of the IAMR to determine if an access is
- * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
- * fetch.
- */
- mtspr(SPRN_IAMR, (1ul << 62));
-}
-#endif
-
-#ifdef CONFIG_PPC_KUAP
-void setup_kuap(bool disabled)
-{
- if (disabled || !early_radix_enabled())
- return;
-
- if (smp_processor_id() == boot_cpuid) {
- pr_info("Activating Kernel Userspace Access Prevention\n");
- cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
- }
-
- /* Make sure userspace can't change the AMR */
- mtspr(SPRN_UAMOR, 0);
- mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
- isync();
-}
-#endif
-
-void __init radix__early_init_mmu(void)
-{
- unsigned long lpcr;
-
-#ifdef CONFIG_PPC_64K_PAGES
- /* PAGE_SIZE mappings */
- mmu_virtual_psize = MMU_PAGE_64K;
-#else
- mmu_virtual_psize = MMU_PAGE_4K;
-#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* vmemmap mapping */
- mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
- /*
- * initialize page table size
- */
- __pte_index_size = RADIX_PTE_INDEX_SIZE;
- __pmd_index_size = RADIX_PMD_INDEX_SIZE;
- __pud_index_size = RADIX_PUD_INDEX_SIZE;
- __pgd_index_size = RADIX_PGD_INDEX_SIZE;
- __pud_cache_index = RADIX_PUD_INDEX_SIZE;
- __pte_table_size = RADIX_PTE_TABLE_SIZE;
- __pmd_table_size = RADIX_PMD_TABLE_SIZE;
- __pud_table_size = RADIX_PUD_TABLE_SIZE;
- __pgd_table_size = RADIX_PGD_TABLE_SIZE;
-
- __pmd_val_bits = RADIX_PMD_VAL_BITS;
- __pud_val_bits = RADIX_PUD_VAL_BITS;
- __pgd_val_bits = RADIX_PGD_VAL_BITS;
-
- __kernel_virt_start = RADIX_KERN_VIRT_START;
- __vmalloc_start = RADIX_VMALLOC_START;
- __vmalloc_end = RADIX_VMALLOC_END;
- __kernel_io_start = RADIX_KERN_IO_START;
- __kernel_io_end = RADIX_KERN_IO_END;
- vmemmap = (struct page *)RADIX_VMEMMAP_START;
- ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PCI
- pci_io_base = ISA_IO_BASE;
-#endif
- __pte_frag_nr = RADIX_PTE_FRAG_NR;
- __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
- __pmd_frag_nr = RADIX_PMD_FRAG_NR;
- __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
-
- if (!firmware_has_feature(FW_FEATURE_LPAR)) {
- radix_init_native();
- lpcr = mfspr(SPRN_LPCR);
- mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
- radix_init_partition_table();
- radix_init_amor();
- } else {
- radix_init_pseries();
- }
-
- memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-
- radix_init_pgtable();
- /* Switch to the guard PID before turning on MMU */
- radix__switch_mmu_context(NULL, &init_mm);
- if (cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_all();
-}
-
-void radix__early_init_mmu_secondary(void)
-{
- unsigned long lpcr;
- /*
- * update partition table control register and UPRT
- */
- if (!firmware_has_feature(FW_FEATURE_LPAR)) {
- lpcr = mfspr(SPRN_LPCR);
- mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
-
- mtspr(SPRN_PTCR,
- __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
- radix_init_amor();
- }
-
- radix__switch_mmu_context(NULL, &init_mm);
- if (cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_all();
-}
-
-void radix__mmu_cleanup_all(void)
-{
- unsigned long lpcr;
-
- if (!firmware_has_feature(FW_FEATURE_LPAR)) {
- lpcr = mfspr(SPRN_LPCR);
- mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
- mtspr(SPRN_PTCR, 0);
- powernv_set_nmmu_ptcr(0);
- radix__flush_tlb_all();
- }
-}
-
-void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /*
- * Radix mode is not limited by RMA / VRMA addressing.
- */
- ppc64_rma_size = ULONG_MAX;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
-{
- pte_t *pte;
- int i;
-
- for (i = 0; i < PTRS_PER_PTE; i++) {
- pte = pte_start + i;
- if (!pte_none(*pte))
- return;
- }
-
- pte_free_kernel(&init_mm, pte_start);
- pmd_clear(pmd);
-}
-
-static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
-{
- pmd_t *pmd;
- int i;
-
- for (i = 0; i < PTRS_PER_PMD; i++) {
- pmd = pmd_start + i;
- if (!pmd_none(*pmd))
- return;
- }
-
- pmd_free(&init_mm, pmd_start);
- pud_clear(pud);
-}
-
-struct change_mapping_params {
- pte_t *pte;
- unsigned long start;
- unsigned long end;
- unsigned long aligned_start;
- unsigned long aligned_end;
-};
-
-static int __meminit stop_machine_change_mapping(void *data)
-{
- struct change_mapping_params *params =
- (struct change_mapping_params *)data;
-
- if (!data)
- return -1;
-
- spin_unlock(&init_mm.page_table_lock);
- pte_clear(&init_mm, params->aligned_start, params->pte);
- create_physical_mapping(params->aligned_start, params->start, -1);
- create_physical_mapping(params->end, params->aligned_end, -1);
- spin_lock(&init_mm.page_table_lock);
- return 0;
-}
-
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
- unsigned long end)
-{
- unsigned long next;
- pte_t *pte;
-
- pte = pte_start + pte_index(addr);
- for (; addr < end; addr = next, pte++) {
- next = (addr + PAGE_SIZE) & PAGE_MASK;
- if (next > end)
- next = end;
-
- if (!pte_present(*pte))
- continue;
-
- if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
- /*
- * The vmemmap_free() and remove_section_mapping()
- * codepaths call us with aligned addresses.
- */
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
- }
-
- pte_clear(&init_mm, addr, pte);
- }
-}
-
-/*
- * clear the pte and potentially split the mapping helper
- */
-static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
- unsigned long size, pte_t *pte)
-{
- unsigned long mask = ~(size - 1);
- unsigned long aligned_start = addr & mask;
- unsigned long aligned_end = addr + size;
- struct change_mapping_params params;
- bool split_region = false;
-
- if ((end - addr) < size) {
- /*
- * We're going to clear the PTE, but not flushed
- * the mapping, time to remap and flush. The
- * effects if visible outside the processor or
- * if we are running in code close to the
- * mapping we cleared, we are in trouble.
- */
- if (overlaps_kernel_text(aligned_start, addr) ||
- overlaps_kernel_text(end, aligned_end)) {
- /*
- * Hack, just return, don't pte_clear
- */
- WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
- "text, not splitting\n", addr, end);
- return;
- }
- split_region = true;
- }
-
- if (split_region) {
- params.pte = pte;
- params.start = addr;
- params.end = end;
- params.aligned_start = addr & ~(size - 1);
- params.aligned_end = min_t(unsigned long, aligned_end,
- (unsigned long)__va(memblock_end_of_DRAM()));
- stop_machine(stop_machine_change_mapping, ¶ms, NULL);
- return;
- }
-
- pte_clear(&init_mm, addr, pte);
-}
-
-static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
- unsigned long end)
-{
- unsigned long next;
- pte_t *pte_base;
- pmd_t *pmd;
-
- pmd = pmd_start + pmd_index(addr);
- for (; addr < end; addr = next, pmd++) {
- next = pmd_addr_end(addr, end);
-
- if (!pmd_present(*pmd))
- continue;
-
- if (pmd_huge(*pmd)) {
- split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
- continue;
- }
-
- pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next);
- free_pte_table(pte_base, pmd);
- }
-}
-
-static void remove_pud_table(pud_t *pud_start, unsigned long addr,
- unsigned long end)
-{
- unsigned long next;
- pmd_t *pmd_base;
- pud_t *pud;
-
- pud = pud_start + pud_index(addr);
- for (; addr < end; addr = next, pud++) {
- next = pud_addr_end(addr, end);
-
- if (!pud_present(*pud))
- continue;
-
- if (pud_huge(*pud)) {
- split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
- continue;
- }
-
- pmd_base = (pmd_t *)pud_page_vaddr(*pud);
- remove_pmd_table(pmd_base, addr, next);
- free_pmd_table(pmd_base, pud);
- }
-}
-
-static void __meminit remove_pagetable(unsigned long start, unsigned long end)
-{
- unsigned long addr, next;
- pud_t *pud_base;
- pgd_t *pgd;
-
- spin_lock(&init_mm.page_table_lock);
-
- for (addr = start; addr < end; addr = next) {
- next = pgd_addr_end(addr, end);
-
- pgd = pgd_offset_k(addr);
- if (!pgd_present(*pgd))
- continue;
-
- if (pgd_huge(*pgd)) {
- split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
- continue;
- }
-
- pud_base = (pud_t *)pgd_page_vaddr(*pgd);
- remove_pud_table(pud_base, addr, next);
- }
-
- spin_unlock(&init_mm.page_table_lock);
- radix__flush_tlb_kernel_range(start, end);
-}
-
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
-{
- if (end >= RADIX_VMALLOC_START) {
- pr_warn("Outside the supported range\n");
- return -1;
- }
-
- return create_physical_mapping(start, end, nid);
-}
-
-int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
-{
- remove_pagetable(start, end);
- return 0;
-}
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
- pgprot_t flags, unsigned int map_page_size,
- int nid)
-{
- return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
-}
-
-int __meminit radix__vmemmap_create_mapping(unsigned long start,
- unsigned long page_size,
- unsigned long phys)
-{
- /* Create a PTE encoding */
- unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
- int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
- int ret;
-
- if ((start + page_size) >= RADIX_VMEMMAP_END) {
- pr_warn("Outside the supported range\n");
- return -1;
- }
-
- ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
- BUG_ON(ret);
-
- return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
-{
- remove_pagetable(start, start + page_size);
-}
-#endif
-#endif
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, unsigned long clr,
- unsigned long set)
-{
- unsigned long old;
-
-#ifdef CONFIG_DEBUG_VM
- WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(pmd_lockptr(mm, pmdp));
-#endif
-
- old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
- trace_hugepage_update(addr, old, clr, set);
-
- return old;
-}
-
-pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-
-{
- pmd_t pmd;
-
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
- VM_BUG_ON(pmd_devmap(*pmdp));
- /*
- * khugepaged calls this for normal pmd
- */
- pmd = *pmdp;
- pmd_clear(pmdp);
-
- /*FIXME!! Verify whether we need this kick below */
- serialize_against_pte_lookup(vma->vm_mm);
-
- radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
-
- return pmd;
-}
-
-/*
- * For us pgtable_t is pte_t *. Inorder to save the deposisted
- * page table, we consider the allocated page table as a list
- * head. On withdraw we need to make sure we zero out the used
- * list_head memory area.
- */
-void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
-{
- struct list_head *lh = (struct list_head *) pgtable;
-
- assert_spin_locked(pmd_lockptr(mm, pmdp));
-
- /* FIFO */
- if (!pmd_huge_pte(mm, pmdp))
- INIT_LIST_HEAD(lh);
- else
- list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
- pmd_huge_pte(mm, pmdp) = pgtable;
-}
-
-pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
- pte_t *ptep;
- pgtable_t pgtable;
- struct list_head *lh;
-
- assert_spin_locked(pmd_lockptr(mm, pmdp));
-
- /* FIFO */
- pgtable = pmd_huge_pte(mm, pmdp);
- lh = (struct list_head *) pgtable;
- if (list_empty(lh))
- pmd_huge_pte(mm, pmdp) = NULL;
- else {
- pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
- list_del(lh);
- }
- ptep = (pte_t *) pgtable;
- *ptep = __pte(0);
- ptep++;
- *ptep = __pte(0);
- return pgtable;
-}
-
-
-pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t old_pmd;
- unsigned long old;
-
- old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
- old_pmd = __pmd(old);
- /*
- * Serialize against find_current_mm_pte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
- return old_pmd;
-}
-
-int radix__has_transparent_hugepage(void)
-{
- /* For radix 2M at PMD level means thp */
- if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
- return 1;
- return 0;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
- pte_t entry, unsigned long address, int psize)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
- _PAGE_RW | _PAGE_EXEC);
-
- unsigned long change = pte_val(entry) ^ pte_val(*ptep);
- /*
- * To avoid NMMU hang while relaxing access, we need mark
- * the pte invalid in between.
- */
- if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
- unsigned long old_pte, new_pte;
-
- old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
- /*
- * new value of pte
- */
- new_pte = old_pte | set;
- radix__flush_tlb_page_psize(mm, address, psize);
- __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
- } else {
- __radix_pte_update(ptep, 0, set);
- /*
- * Book3S does not require a TLB flush when relaxing access
- * restrictions when the address space is not attached to a
- * NMMU, because the core MMU will reload the pte after taking
- * an access fault, which is defined by the architectue.
- */
- }
- /* See ptesync comment in radix__set_pte_at */
-}
-
-void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t old_pte, pte_t pte)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * To avoid NMMU hang while relaxing access we need to flush the tlb before
- * we set the new value. We need to do this only for radix, because hash
- * translation does flush when updating the linux pte.
- */
- if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
- (atomic_read(&mm->context.copros) > 0))
- radix__flush_tlb_page(vma, addr);
-
- set_pte_at(mm, addr, ptep, pte);
-}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * PowerPC Memory Protection Keys management
- *
- * Copyright 2017, Ram Pai, IBM Corporation.
- */
-
-#include <asm/mman.h>
-#include <asm/mmu_context.h>
-#include <asm/mmu.h>
-#include <asm/setup.h>
-#include <linux/pkeys.h>
-#include <linux/of_device.h>
-
-DEFINE_STATIC_KEY_TRUE(pkey_disabled);
-int pkeys_total; /* Total pkeys as per device tree */
-u32 initial_allocation_mask; /* Bits set for the initially allocated keys */
-u32 reserved_allocation_mask; /* Bits set for reserved keys */
-static bool pkey_execute_disable_supported;
-static bool pkeys_devtree_defined; /* property exported by device tree */
-static u64 pkey_amr_mask; /* Bits in AMR not to be touched */
-static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
-static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
-static int execute_only_key = 2;
-
-#define AMR_BITS_PER_PKEY 2
-#define AMR_RD_BIT 0x1UL
-#define AMR_WR_BIT 0x2UL
-#define IAMR_EX_BIT 0x1UL
-#define PKEY_REG_BITS (sizeof(u64)*8)
-#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
-
-static void scan_pkey_feature(void)
-{
- u32 vals[2];
- struct device_node *cpu;
-
- cpu = of_find_node_by_type(NULL, "cpu");
- if (!cpu)
- return;
-
- if (of_property_read_u32_array(cpu,
- "ibm,processor-storage-keys", vals, 2))
- return;
-
- /*
- * Since any pkey can be used for data or execute, we will just treat
- * all keys as equal and track them as one entity.
- */
- pkeys_total = vals[0];
- pkeys_devtree_defined = true;
-}
-
-static inline bool pkey_mmu_enabled(void)
-{
- if (firmware_has_feature(FW_FEATURE_LPAR))
- return pkeys_total;
- else
- return cpu_has_feature(CPU_FTR_PKEY);
-}
-
-static int pkey_initialize(void)
-{
- int os_reserved, i;
-
- /*
- * We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
- * generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
- * Ensure that the bits a distinct.
- */
- BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
- (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
- /*
- * pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
- * in the vmaflag. Make sure that is really the case.
- */
- BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
- __builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
- != (sizeof(u64) * BITS_PER_BYTE));
-
- /* scan the device tree for pkey feature */
- scan_pkey_feature();
-
- /*
- * Let's assume 32 pkeys on P8 bare metal, if its not defined by device
- * tree. We make this exception since skiboot forgot to expose this
- * property on power8.
- */
- if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
- cpu_has_feature(CPU_FTRS_POWER8))
- pkeys_total = 32;
-
- /*
- * Adjust the upper limit, based on the number of bits supported by
- * arch-neutral code.
- */
- pkeys_total = min_t(int, pkeys_total,
- ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
-
- if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
- static_branch_enable(&pkey_disabled);
- else
- static_branch_disable(&pkey_disabled);
-
- if (static_branch_likely(&pkey_disabled))
- return 0;
-
- /*
- * The device tree cannot be relied to indicate support for
- * execute_disable support. Instead we use a PVR check.
- */
- if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
- pkey_execute_disable_supported = false;
- else
- pkey_execute_disable_supported = true;
-
-#ifdef CONFIG_PPC_4K_PAGES
- /*
- * The OS can manage only 8 pkeys due to its inability to represent them
- * in the Linux 4K PTE.
- */
- os_reserved = pkeys_total - 8;
-#else
- os_reserved = 0;
-#endif
- /* Bits are in LE format. */
- reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
-
- /* register mask is in BE format */
- pkey_amr_mask = ~0x0ul;
- pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
-
- pkey_iamr_mask = ~0x0ul;
- pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
- pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
-
- pkey_uamor_mask = ~0x0ul;
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
-
- /* mark the rest of the keys as reserved and hence unavailable */
- for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
- reserved_allocation_mask |= (0x1 << i);
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
- }
- initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
-
- if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
- /*
- * Insufficient number of keys to support
- * execute only key. Mark it unavailable.
- * Any AMR, UAMOR, IAMR bit set for
- * this key is irrelevant since this key
- * can never be allocated.
- */
- execute_only_key = -1;
- }
-
- return 0;
-}
-
-arch_initcall(pkey_initialize);
-
-void pkey_mm_init(struct mm_struct *mm)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
- mm_pkey_allocation_map(mm) = initial_allocation_mask;
- mm->context.execute_only_pkey = execute_only_key;
-}
-
-static inline u64 read_amr(void)
-{
- return mfspr(SPRN_AMR);
-}
-
-static inline void write_amr(u64 value)
-{
- mtspr(SPRN_AMR, value);
-}
-
-static inline u64 read_iamr(void)
-{
- if (!likely(pkey_execute_disable_supported))
- return 0x0UL;
-
- return mfspr(SPRN_IAMR);
-}
-
-static inline void write_iamr(u64 value)
-{
- if (!likely(pkey_execute_disable_supported))
- return;
-
- mtspr(SPRN_IAMR, value);
-}
-
-static inline u64 read_uamor(void)
-{
- return mfspr(SPRN_UAMOR);
-}
-
-static inline void write_uamor(u64 value)
-{
- mtspr(SPRN_UAMOR, value);
-}
-
-static bool is_pkey_enabled(int pkey)
-{
- u64 uamor = read_uamor();
- u64 pkey_bits = 0x3ul << pkeyshift(pkey);
- u64 uamor_pkey_bits = (uamor & pkey_bits);
-
- /*
- * Both the bits in UAMOR corresponding to the key should be set or
- * reset.
- */
- WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
- return !!(uamor_pkey_bits);
-}
-
-static inline void init_amr(int pkey, u8 init_bits)
-{
- u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
- u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
-
- write_amr(old_amr | new_amr_bits);
-}
-
-static inline void init_iamr(int pkey, u8 init_bits)
-{
- u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
- u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
-
- write_iamr(old_iamr | new_iamr_bits);
-}
-
-/*
- * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that
- * specified in @init_val.
- */
-int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val)
-{
- u64 new_amr_bits = 0x0ul;
- u64 new_iamr_bits = 0x0ul;
-
- if (!is_pkey_enabled(pkey))
- return -EINVAL;
-
- if (init_val & PKEY_DISABLE_EXECUTE) {
- if (!pkey_execute_disable_supported)
- return -EINVAL;
- new_iamr_bits |= IAMR_EX_BIT;
- }
- init_iamr(pkey, new_iamr_bits);
-
- /* Set the bits we need in AMR: */
- if (init_val & PKEY_DISABLE_ACCESS)
- new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
- else if (init_val & PKEY_DISABLE_WRITE)
- new_amr_bits |= AMR_WR_BIT;
-
- init_amr(pkey, new_amr_bits);
- return 0;
-}
-
-void thread_pkey_regs_save(struct thread_struct *thread)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- /*
- * TODO: Skip saving registers if @thread hasn't used any keys yet.
- */
- thread->amr = read_amr();
- thread->iamr = read_iamr();
- thread->uamor = read_uamor();
-}
-
-void thread_pkey_regs_restore(struct thread_struct *new_thread,
- struct thread_struct *old_thread)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- if (old_thread->amr != new_thread->amr)
- write_amr(new_thread->amr);
- if (old_thread->iamr != new_thread->iamr)
- write_iamr(new_thread->iamr);
- if (old_thread->uamor != new_thread->uamor)
- write_uamor(new_thread->uamor);
-}
-
-void thread_pkey_regs_init(struct thread_struct *thread)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- thread->amr = pkey_amr_mask;
- thread->iamr = pkey_iamr_mask;
- thread->uamor = pkey_uamor_mask;
-
- write_uamor(pkey_uamor_mask);
- write_amr(pkey_amr_mask);
- write_iamr(pkey_iamr_mask);
-}
-
-static inline bool pkey_allows_readwrite(int pkey)
-{
- int pkey_shift = pkeyshift(pkey);
-
- if (!is_pkey_enabled(pkey))
- return true;
-
- return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
-}
-
-int __execute_only_pkey(struct mm_struct *mm)
-{
- return mm->context.execute_only_pkey;
-}
-
-static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
-{
- /* Do this check first since the vm_flags should be hot */
- if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
- return false;
-
- return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
-}
-
-/*
- * This should only be called for *plain* mprotect calls.
- */
-int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
- int pkey)
-{
- /*
- * If the currently associated pkey is execute-only, but the requested
- * protection is not execute-only, move it back to the default pkey.
- */
- if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
- return 0;
-
- /*
- * The requested protection is execute-only. Hence let's use an
- * execute-only pkey.
- */
- if (prot == PROT_EXEC) {
- pkey = execute_only_pkey(vma->vm_mm);
- if (pkey > 0)
- return pkey;
- }
-
- /* Nothing to override. */
- return vma_pkey(vma);
-}
-
-static bool pkey_access_permitted(int pkey, bool write, bool execute)
-{
- int pkey_shift;
- u64 amr;
-
- if (!is_pkey_enabled(pkey))
- return true;
-
- pkey_shift = pkeyshift(pkey);
- if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
- return true;
-
- amr = read_amr(); /* Delay reading amr until absolutely needed */
- return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
- (write && !(amr & (AMR_WR_BIT << pkey_shift))));
-}
-
-bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
-{
- if (static_branch_likely(&pkey_disabled))
- return true;
-
- return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
-}
-
-/*
- * We only want to enforce protection keys on the current thread because we
- * effectively have no access to AMR/IAMR for other threads or any way to tell
- * which AMR/IAMR in a threaded process we could use.
- *
- * So do not enforce things if the VMA is not from the current mm, or if we are
- * in a kernel thread.
- */
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
-{
- if (!current->mm)
- return true;
-
- /* if it is not our ->mm, it has to be foreign */
- if (current->mm != vma->vm_mm)
- return true;
-
- return false;
-}
-
-bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
- bool execute, bool foreign)
-{
- if (static_branch_likely(&pkey_disabled))
- return true;
- /*
- * Do not enforce our key-permissions on a foreign vma.
- */
- if (foreign || vma_is_foreign(vma))
- return true;
-
- return pkey_access_permitted(vma_pkey(vma), write, execute);
-}
-
-void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
-{
- if (static_branch_likely(&pkey_disabled))
- return;
-
- /* Duplicate the oldmm pkey state in mm: */
- mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
- mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
-}
+++ /dev/null
-/*
- * PowerPC64 SLB support.
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code written by:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/asm-prototypes.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/ppc-opcode.h>
-#include <asm/cputable.h>
-#include <asm/cacheflush.h>
-#include <asm/smp.h>
-#include <linux/compiler.h>
-#include <linux/context_tracking.h>
-#include <linux/mm_types.h>
-
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-
-enum slb_index {
- LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
- KSTACK_INDEX = 1, /* Kernel stack map */
-};
-
-static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
-
-#define slb_esid_mask(ssize) \
- (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
-
-static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
- enum slb_index index)
-{
- return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
-}
-
-static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
- unsigned long flags)
-{
- return (vsid << slb_vsid_shift(ssize)) | flags |
- ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
-}
-
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
- unsigned long flags)
-{
- return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
-}
-
-static void assert_slb_presence(bool present, unsigned long ea)
-{
-#ifdef CONFIG_DEBUG_VM
- unsigned long tmp;
-
- WARN_ON_ONCE(mfmsr() & MSR_EE);
-
- if (!cpu_has_feature(CPU_FTR_ARCH_206))
- return;
-
- /*
- * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
- * ignores all other bits from 0-27, so just clear them all.
- */
- ea &= ~((1UL << 28) - 1);
- asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
-
- WARN_ON(present == (tmp == 0));
-#endif
-}
-
-static inline void slb_shadow_update(unsigned long ea, int ssize,
- unsigned long flags,
- enum slb_index index)
-{
- struct slb_shadow *p = get_slb_shadow();
-
- /*
- * Clear the ESID first so the entry is not valid while we are
- * updating it. No write barriers are needed here, provided
- * we only update the current CPU's SLB shadow buffer.
- */
- WRITE_ONCE(p->save_area[index].esid, 0);
- WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
- WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
-}
-
-static inline void slb_shadow_clear(enum slb_index index)
-{
- WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
-}
-
-static inline void create_shadowed_slbe(unsigned long ea, int ssize,
- unsigned long flags,
- enum slb_index index)
-{
- /*
- * Updating the shadow buffer before writing the SLB ensures
- * we don't get a stale entry here if we get preempted by PHYP
- * between these two statements.
- */
- slb_shadow_update(ea, ssize, flags, index);
-
- assert_slb_presence(false, ea);
- asm volatile("slbmte %0,%1" :
- : "r" (mk_vsid_data(ea, ssize, flags)),
- "r" (mk_esid_data(ea, ssize, index))
- : "memory" );
-}
-
-/*
- * Insert bolted entries into SLB (which may not be empty, so don't clear
- * slb_cache_ptr).
- */
-void __slb_restore_bolted_realmode(void)
-{
- struct slb_shadow *p = get_slb_shadow();
- enum slb_index index;
-
- /* No isync needed because realmode. */
- for (index = 0; index < SLB_NUM_BOLTED; index++) {
- asm volatile("slbmte %0,%1" :
- : "r" (be64_to_cpu(p->save_area[index].vsid)),
- "r" (be64_to_cpu(p->save_area[index].esid)));
- }
-
- assert_slb_presence(true, local_paca->kstack);
-}
-
-/*
- * Insert the bolted entries into an empty SLB.
- */
-void slb_restore_bolted_realmode(void)
-{
- __slb_restore_bolted_realmode();
- get_paca()->slb_cache_ptr = 0;
-
- get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
- get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-}
-
-/*
- * This flushes all SLB entries including 0, so it must be realmode.
- */
-void slb_flush_all_realmode(void)
-{
- asm volatile("slbmte %0,%0; slbia" : : "r" (0));
-}
-
-/*
- * This flushes non-bolted entries, it can be run in virtual mode. Must
- * be called with interrupts disabled.
- */
-void slb_flush_and_restore_bolted(void)
-{
- struct slb_shadow *p = get_slb_shadow();
-
- BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
-
- WARN_ON(!irqs_disabled());
-
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
- asm volatile("isync\n"
- "slbia\n"
- "slbmte %0, %1\n"
- "isync\n"
- :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
- "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
- : "memory");
- assert_slb_presence(true, get_paca()->kstack);
-
- get_paca()->slb_cache_ptr = 0;
-
- get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
- get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-}
-
-void slb_save_contents(struct slb_entry *slb_ptr)
-{
- int i;
- unsigned long e, v;
-
- /* Save slb_cache_ptr value. */
- get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
-
- if (!slb_ptr)
- return;
-
- for (i = 0; i < mmu_slb_size; i++) {
- asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
- asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
- slb_ptr->esid = e;
- slb_ptr->vsid = v;
- slb_ptr++;
- }
-}
-
-void slb_dump_contents(struct slb_entry *slb_ptr)
-{
- int i, n;
- unsigned long e, v;
- unsigned long llp;
-
- if (!slb_ptr)
- return;
-
- pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
- pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
-
- for (i = 0; i < mmu_slb_size; i++) {
- e = slb_ptr->esid;
- v = slb_ptr->vsid;
- slb_ptr++;
-
- if (!e && !v)
- continue;
-
- pr_err("%02d %016lx %016lx\n", i, e, v);
-
- if (!(e & SLB_ESID_V)) {
- pr_err("\n");
- continue;
- }
- llp = v & SLB_VSID_LLP;
- if (v & SLB_VSID_B_1T) {
- pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
- GET_ESID_1T(e),
- (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
- } else {
- pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
- GET_ESID(e),
- (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
- }
- }
- pr_err("----------------------------------\n");
-
- /* Dump slb cache entires as well. */
- pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
- pr_err("Valid SLB cache entries:\n");
- n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
- for (i = 0; i < n; i++)
- pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
- pr_err("Rest of SLB cache entries:\n");
- for (i = n; i < SLB_CACHE_ENTRIES; i++)
- pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
-}
-
-void slb_vmalloc_update(void)
-{
- /*
- * vmalloc is not bolted, so just have to flush non-bolted.
- */
- slb_flush_and_restore_bolted();
-}
-
-static bool preload_hit(struct thread_info *ti, unsigned long esid)
-{
- unsigned char i;
-
- for (i = 0; i < ti->slb_preload_nr; i++) {
- unsigned char idx;
-
- idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
- if (esid == ti->slb_preload_esid[idx])
- return true;
- }
- return false;
-}
-
-static bool preload_add(struct thread_info *ti, unsigned long ea)
-{
- unsigned char idx;
- unsigned long esid;
-
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
- /* EAs are stored >> 28 so 256MB segments don't need clearing */
- if (ea & ESID_MASK_1T)
- ea &= ESID_MASK_1T;
- }
-
- esid = ea >> SID_SHIFT;
-
- if (preload_hit(ti, esid))
- return false;
-
- idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
- ti->slb_preload_esid[idx] = esid;
- if (ti->slb_preload_nr == SLB_PRELOAD_NR)
- ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
- else
- ti->slb_preload_nr++;
-
- return true;
-}
-
-static void preload_age(struct thread_info *ti)
-{
- if (!ti->slb_preload_nr)
- return;
- ti->slb_preload_nr--;
- ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
-}
-
-void slb_setup_new_exec(void)
-{
- struct thread_info *ti = current_thread_info();
- struct mm_struct *mm = current->mm;
- unsigned long exec = 0x10000000;
-
- WARN_ON(irqs_disabled());
-
- /*
- * preload cache can only be used to determine whether a SLB
- * entry exists if it does not start to overflow.
- */
- if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
- return;
-
- hard_irq_disable();
-
- /*
- * We have no good place to clear the slb preload cache on exec,
- * flush_thread is about the earliest arch hook but that happens
- * after we switch to the mm and have aleady preloaded the SLBEs.
- *
- * For the most part that's probably okay to use entries from the
- * previous exec, they will age out if unused. It may turn out to
- * be an advantage to clear the cache before switching to it,
- * however.
- */
-
- /*
- * preload some userspace segments into the SLB.
- * Almost all 32 and 64bit PowerPC executables are linked at
- * 0x10000000 so it makes sense to preload this segment.
- */
- if (!is_kernel_addr(exec)) {
- if (preload_add(ti, exec))
- slb_allocate_user(mm, exec);
- }
-
- /* Libraries and mmaps. */
- if (!is_kernel_addr(mm->mmap_base)) {
- if (preload_add(ti, mm->mmap_base))
- slb_allocate_user(mm, mm->mmap_base);
- }
-
- /* see switch_slb */
- asm volatile("isync" : : : "memory");
-
- local_irq_enable();
-}
-
-void preload_new_slb_context(unsigned long start, unsigned long sp)
-{
- struct thread_info *ti = current_thread_info();
- struct mm_struct *mm = current->mm;
- unsigned long heap = mm->start_brk;
-
- WARN_ON(irqs_disabled());
-
- /* see above */
- if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
- return;
-
- hard_irq_disable();
-
- /* Userspace entry address. */
- if (!is_kernel_addr(start)) {
- if (preload_add(ti, start))
- slb_allocate_user(mm, start);
- }
-
- /* Top of stack, grows down. */
- if (!is_kernel_addr(sp)) {
- if (preload_add(ti, sp))
- slb_allocate_user(mm, sp);
- }
-
- /* Bottom of heap, grows up. */
- if (heap && !is_kernel_addr(heap)) {
- if (preload_add(ti, heap))
- slb_allocate_user(mm, heap);
- }
-
- /* see switch_slb */
- asm volatile("isync" : : : "memory");
-
- local_irq_enable();
-}
-
-
-/* Flush all user entries from the segment table of the current processor. */
-void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
-{
- struct thread_info *ti = task_thread_info(tsk);
- unsigned char i;
-
- /*
- * We need interrupts hard-disabled here, not just soft-disabled,
- * so that a PMU interrupt can't occur, which might try to access
- * user memory (to get a stack trace) and possible cause an SLB miss
- * which would update the slb_cache/slb_cache_ptr fields in the PACA.
- */
- hard_irq_disable();
- asm volatile("isync" : : : "memory");
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- /*
- * SLBIA IH=3 invalidates all Class=1 SLBEs and their
- * associated lookaside structures, which matches what
- * switch_slb wants. So ARCH_300 does not use the slb
- * cache.
- */
- asm volatile(PPC_SLBIA(3));
- } else {
- unsigned long offset = get_paca()->slb_cache_ptr;
-
- if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
- offset <= SLB_CACHE_ENTRIES) {
- unsigned long slbie_data = 0;
-
- for (i = 0; i < offset; i++) {
- unsigned long ea;
-
- ea = (unsigned long)
- get_paca()->slb_cache[i] << SID_SHIFT;
- /*
- * Could assert_slb_presence(true) here, but
- * hypervisor or machine check could have come
- * in and removed the entry at this point.
- */
-
- slbie_data = ea;
- slbie_data |= user_segment_size(slbie_data)
- << SLBIE_SSIZE_SHIFT;
- slbie_data |= SLBIE_C; /* user slbs have C=1 */
- asm volatile("slbie %0" : : "r" (slbie_data));
- }
-
- /* Workaround POWER5 < DD2.1 issue */
- if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
- asm volatile("slbie %0" : : "r" (slbie_data));
-
- } else {
- struct slb_shadow *p = get_slb_shadow();
- unsigned long ksp_esid_data =
- be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
- unsigned long ksp_vsid_data =
- be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
-
- asm volatile(PPC_SLBIA(1) "\n"
- "slbmte %0,%1\n"
- "isync"
- :: "r"(ksp_vsid_data),
- "r"(ksp_esid_data));
-
- get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
- }
-
- get_paca()->slb_cache_ptr = 0;
- }
- get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-
- copy_mm_to_paca(mm);
-
- /*
- * We gradually age out SLBs after a number of context switches to
- * reduce reload overhead of unused entries (like we do with FP/VEC
- * reload). Each time we wrap 256 switches, take an entry out of the
- * SLB preload cache.
- */
- tsk->thread.load_slb++;
- if (!tsk->thread.load_slb) {
- unsigned long pc = KSTK_EIP(tsk);
-
- preload_age(ti);
- preload_add(ti, pc);
- }
-
- for (i = 0; i < ti->slb_preload_nr; i++) {
- unsigned char idx;
- unsigned long ea;
-
- idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
- ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
-
- slb_allocate_user(mm, ea);
- }
-
- /*
- * Synchronize slbmte preloads with possible subsequent user memory
- * address accesses by the kernel (user mode won't happen until
- * rfid, which is safe).
- */
- asm volatile("isync" : : : "memory");
-}
-
-void slb_set_size(u16 size)
-{
- mmu_slb_size = size;
-}
-
-void slb_initialize(void)
-{
- unsigned long linear_llp, vmalloc_llp, io_llp;
- unsigned long lflags;
- static int slb_encoding_inited;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- unsigned long vmemmap_llp;
-#endif
-
- /* Prepare our SLB miss handler based on our page size */
- linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
- io_llp = mmu_psize_defs[mmu_io_psize].sllp;
- vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
- get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
- if (!slb_encoding_inited) {
- slb_encoding_inited = 1;
- pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
- pr_devel("SLB: io LLP = %04lx\n", io_llp);
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
-#endif
- }
-
- get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
- get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
- get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
-
- lflags = SLB_VSID_KERNEL | linear_llp;
-
- /* Invalidate the entire SLB (even entry 0) & all the ERATS */
- asm volatile("isync":::"memory");
- asm volatile("slbmte %0,%0"::"r" (0) : "memory");
- asm volatile("isync; slbia; isync":::"memory");
- create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
-
- /* For the boot cpu, we're running on the stack in init_thread_union,
- * which is in the first segment of the linear mapping, and also
- * get_paca()->kstack hasn't been initialized yet.
- * For secondary cpus, we need to bolt the kernel stack entry now.
- */
- slb_shadow_clear(KSTACK_INDEX);
- if (raw_smp_processor_id() != boot_cpuid &&
- (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
- create_shadowed_slbe(get_paca()->kstack,
- mmu_kernel_ssize, lflags, KSTACK_INDEX);
-
- asm volatile("isync":::"memory");
-}
-
-static void slb_cache_update(unsigned long esid_data)
-{
- int slb_cache_index;
-
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- return; /* ISAv3.0B and later does not use slb_cache */
-
- /*
- * Now update slb cache entries
- */
- slb_cache_index = local_paca->slb_cache_ptr;
- if (slb_cache_index < SLB_CACHE_ENTRIES) {
- /*
- * We have space in slb cache for optimized switch_slb().
- * Top 36 bits from esid_data as per ISA
- */
- local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
- local_paca->slb_cache_ptr++;
- } else {
- /*
- * Our cache is full and the current cache content strictly
- * doesn't indicate the active SLB conents. Bump the ptr
- * so that switch_slb() will ignore the cache.
- */
- local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
- }
-}
-
-static enum slb_index alloc_slb_index(bool kernel)
-{
- enum slb_index index;
-
- /*
- * The allocation bitmaps can become out of synch with the SLB
- * when the _switch code does slbie when bolting a new stack
- * segment and it must not be anywhere else in the SLB. This leaves
- * a kernel allocated entry that is unused in the SLB. With very
- * large systems or small segment sizes, the bitmaps could slowly
- * fill with these entries. They will eventually be cleared out
- * by the round robin allocator in that case, so it's probably not
- * worth accounting for.
- */
-
- /*
- * SLBs beyond 32 entries are allocated with stab_rr only
- * POWER7/8/9 have 32 SLB entries, this could be expanded if a
- * future CPU has more.
- */
- if (local_paca->slb_used_bitmap != U32_MAX) {
- index = ffz(local_paca->slb_used_bitmap);
- local_paca->slb_used_bitmap |= 1U << index;
- if (kernel)
- local_paca->slb_kern_bitmap |= 1U << index;
- } else {
- /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
- index = local_paca->stab_rr;
- if (index < (mmu_slb_size - 1))
- index++;
- else
- index = SLB_NUM_BOLTED;
- local_paca->stab_rr = index;
- if (index < 32) {
- if (kernel)
- local_paca->slb_kern_bitmap |= 1U << index;
- else
- local_paca->slb_kern_bitmap &= ~(1U << index);
- }
- }
- BUG_ON(index < SLB_NUM_BOLTED);
-
- return index;
-}
-
-static long slb_insert_entry(unsigned long ea, unsigned long context,
- unsigned long flags, int ssize, bool kernel)
-{
- unsigned long vsid;
- unsigned long vsid_data, esid_data;
- enum slb_index index;
-
- vsid = get_vsid(context, ea, ssize);
- if (!vsid)
- return -EFAULT;
-
- /*
- * There must not be a kernel SLB fault in alloc_slb_index or before
- * slbmte here or the allocation bitmaps could get out of whack with
- * the SLB.
- *
- * User SLB faults or preloads take this path which might get inlined
- * into the caller, so add compiler barriers here to ensure unsafe
- * memory accesses do not come between.
- */
- barrier();
-
- index = alloc_slb_index(kernel);
-
- vsid_data = __mk_vsid_data(vsid, ssize, flags);
- esid_data = mk_esid_data(ea, ssize, index);
-
- /*
- * No need for an isync before or after this slbmte. The exception
- * we enter with and the rfid we exit with are context synchronizing.
- * User preloads should add isync afterwards in case the kernel
- * accesses user memory before it returns to userspace with rfid.
- */
- assert_slb_presence(false, ea);
- asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
-
- barrier();
-
- if (!kernel)
- slb_cache_update(esid_data);
-
- return 0;
-}
-
-static long slb_allocate_kernel(unsigned long ea, unsigned long id)
-{
- unsigned long context;
- unsigned long flags;
- int ssize;
-
- if (id == LINEAR_MAP_REGION_ID) {
-
- /* We only support upto MAX_PHYSMEM_BITS */
- if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
- return -EFAULT;
-
- flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- } else if (id == VMEMMAP_REGION_ID) {
-
- if (ea >= H_VMEMMAP_END)
- return -EFAULT;
-
- flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
- } else if (id == VMALLOC_REGION_ID) {
-
- if (ea >= H_VMALLOC_END)
- return -EFAULT;
-
- flags = local_paca->vmalloc_sllp;
-
- } else if (id == IO_REGION_ID) {
-
- if (ea >= H_KERN_IO_END)
- return -EFAULT;
-
- flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
-
- } else {
- return -EFAULT;
- }
-
- ssize = MMU_SEGSIZE_1T;
- if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
- ssize = MMU_SEGSIZE_256M;
-
- context = get_kernel_context(ea);
-
- return slb_insert_entry(ea, context, flags, ssize, true);
-}
-
-static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
-{
- unsigned long context;
- unsigned long flags;
- int bpsize;
- int ssize;
-
- /*
- * consider this as bad access if we take a SLB miss
- * on an address above addr limit.
- */
- if (ea >= mm_ctx_slb_addr_limit(&mm->context))
- return -EFAULT;
-
- context = get_user_context(&mm->context, ea);
- if (!context)
- return -EFAULT;
-
- if (unlikely(ea >= H_PGTABLE_RANGE)) {
- WARN_ON(1);
- return -EFAULT;
- }
-
- ssize = user_segment_size(ea);
-
- bpsize = get_slice_psize(mm, ea);
- flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
-
- return slb_insert_entry(ea, context, flags, ssize, false);
-}
-
-long do_slb_fault(struct pt_regs *regs, unsigned long ea)
-{
- unsigned long id = get_region_id(ea);
-
- /* IRQs are not reconciled here, so can't check irqs_disabled */
- VM_WARN_ON(mfmsr() & MSR_EE);
-
- if (unlikely(!(regs->msr & MSR_RI)))
- return -EINVAL;
-
- /*
- * SLB kernel faults must be very careful not to touch anything
- * that is not bolted. E.g., PACA and global variables are okay,
- * mm->context stuff is not.
- *
- * SLB user faults can access all of kernel memory, but must be
- * careful not to touch things like IRQ state because it is not
- * "reconciled" here. The difficulty is that we must use
- * fast_exception_return to return from kernel SLB faults without
- * looking at possible non-bolted memory. We could test user vs
- * kernel faults in the interrupt handler asm and do a full fault,
- * reconcile, ret_from_except for user faults which would make them
- * first class kernel code. But for performance it's probably nicer
- * if they go via fast_exception_return too.
- */
- if (id >= LINEAR_MAP_REGION_ID) {
- long err;
-#ifdef CONFIG_DEBUG_VM
- /* Catch recursive kernel SLB faults. */
- BUG_ON(local_paca->in_kernel_slb_handler);
- local_paca->in_kernel_slb_handler = 1;
-#endif
- err = slb_allocate_kernel(ea, id);
-#ifdef CONFIG_DEBUG_VM
- local_paca->in_kernel_slb_handler = 0;
-#endif
- return err;
- } else {
- struct mm_struct *mm = current->mm;
- long err;
-
- if (unlikely(!mm))
- return -EFAULT;
-
- err = slb_allocate_user(mm, ea);
- if (!err)
- preload_add(current_thread_info(), ea);
-
- return err;
- }
-}
-
-void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
-{
- if (err == -EFAULT) {
- if (user_mode(regs))
- _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
- else
- bad_page_fault(regs, ea, SIGSEGV);
- } else if (err == -EINVAL) {
- unrecoverable_exception(regs);
- } else {
- BUG();
- }
-}
+++ /dev/null
-/*
- * Copyright 2007-2008 Paul Mackerras, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-
-/*
- * Free all pages allocated for subpage protection maps and pointers.
- * Also makes sure that the subpage_prot_table structure is
- * reinitialized for the next user.
- */
-void subpage_prot_free(struct mm_struct *mm)
-{
- struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
- unsigned long i, j, addr;
- u32 **p;
-
- if (!spt)
- return;
-
- for (i = 0; i < 4; ++i) {
- if (spt->low_prot[i]) {
- free_page((unsigned long)spt->low_prot[i]);
- spt->low_prot[i] = NULL;
- }
- }
- addr = 0;
- for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) {
- p = spt->protptrs[i];
- if (!p)
- continue;
- spt->protptrs[i] = NULL;
- for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
- ++j, addr += PAGE_SIZE)
- if (p[j])
- free_page((unsigned long)p[j]);
- free_page((unsigned long)p);
- }
- spt->maxaddr = 0;
- kfree(spt);
-}
-
-static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
- int npages)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
- return;
- pud = pud_offset(pgd, addr);
- if (pud_none(*pud))
- return;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- return;
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- arch_enter_lazy_mmu_mode();
- for (; npages > 0; --npages) {
- pte_update(mm, addr, pte, 0, 0, 0);
- addr += PAGE_SIZE;
- ++pte;
- }
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
-}
-
-/*
- * Clear the subpage protection map for an address range, allowing
- * all accesses that are allowed by the pte permissions.
- */
-static void subpage_prot_clear(unsigned long addr, unsigned long len)
-{
- struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt;
- u32 **spm, *spp;
- unsigned long i;
- size_t nw;
- unsigned long next, limit;
-
- down_write(&mm->mmap_sem);
-
- spt = mm_ctx_subpage_prot(&mm->context);
- if (!spt)
- goto err_out;
-
- limit = addr + len;
- if (limit > spt->maxaddr)
- limit = spt->maxaddr;
- for (; addr < limit; addr = next) {
- next = pmd_addr_end(addr, limit);
- if (addr < 0x100000000UL) {
- spm = spt->low_prot;
- } else {
- spm = spt->protptrs[addr >> SBP_L3_SHIFT];
- if (!spm)
- continue;
- }
- spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
- if (!spp)
- continue;
- spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
-
- i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
- nw = PTRS_PER_PTE - i;
- if (addr + (nw << PAGE_SHIFT) > next)
- nw = (next - addr) >> PAGE_SHIFT;
-
- memset(spp, 0, nw * sizeof(u32));
-
- /* now flush any existing HPTEs for the range */
- hpte_flush_range(mm, addr, nw);
- }
-
-err_out:
- up_write(&mm->mmap_sem);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
-{
- struct vm_area_struct *vma = walk->vma;
- split_huge_pmd(vma, pmd, addr);
- return 0;
-}
-
-static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
- unsigned long len)
-{
- struct vm_area_struct *vma;
- struct mm_walk subpage_proto_walk = {
- .mm = mm,
- .pmd_entry = subpage_walk_pmd_entry,
- };
-
- /*
- * We don't try too hard, we just mark all the vma in that range
- * VM_NOHUGEPAGE and split them.
- */
- vma = find_vma(mm, addr);
- /*
- * If the range is in unmapped range, just return
- */
- if (vma && ((addr + len) <= vma->vm_start))
- return;
-
- while (vma) {
- if (vma->vm_start >= (addr + len))
- break;
- vma->vm_flags |= VM_NOHUGEPAGE;
- walk_page_vma(vma, &subpage_proto_walk);
- vma = vma->vm_next;
- }
-}
-#else
-static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
- unsigned long len)
-{
- return;
-}
-#endif
-
-/*
- * Copy in a subpage protection map for an address range.
- * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
- * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
- * 2 or 3 to prevent all accesses.
- * Note that the normal page protections also apply; the subpage
- * protection mechanism is an additional constraint, so putting 0
- * in a 2-bit field won't allow writes to a page that is otherwise
- * write-protected.
- */
-SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
- unsigned long, len, u32 __user *, map)
-{
- struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt;
- u32 **spm, *spp;
- unsigned long i;
- size_t nw;
- unsigned long next, limit;
- int err;
-
- if (radix_enabled())
- return -ENOENT;
-
- /* Check parameters */
- if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
- addr >= mm->task_size || len >= mm->task_size ||
- addr + len > mm->task_size)
- return -EINVAL;
-
- if (is_hugepage_only_range(mm, addr, len))
- return -EINVAL;
-
- if (!map) {
- /* Clear out the protection map for the address range */
- subpage_prot_clear(addr, len);
- return 0;
- }
-
- if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
- return -EFAULT;
-
- down_write(&mm->mmap_sem);
-
- spt = mm_ctx_subpage_prot(&mm->context);
- if (!spt) {
- /*
- * Allocate subpage prot table if not already done.
- * Do this with mmap_sem held
- */
- spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
- if (!spt) {
- err = -ENOMEM;
- goto out;
- }
- mm->context.hash_context->spt = spt;
- }
-
- subpage_mark_vma_nohuge(mm, addr, len);
- for (limit = addr + len; addr < limit; addr = next) {
- next = pmd_addr_end(addr, limit);
- err = -ENOMEM;
- if (addr < 0x100000000UL) {
- spm = spt->low_prot;
- } else {
- spm = spt->protptrs[addr >> SBP_L3_SHIFT];
- if (!spm) {
- spm = (u32 **)get_zeroed_page(GFP_KERNEL);
- if (!spm)
- goto out;
- spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
- }
- }
- spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
- spp = *spm;
- if (!spp) {
- spp = (u32 *)get_zeroed_page(GFP_KERNEL);
- if (!spp)
- goto out;
- *spm = spp;
- }
- spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
-
- local_irq_disable();
- demote_segment_4k(mm, addr);
- local_irq_enable();
-
- i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
- nw = PTRS_PER_PTE - i;
- if (addr + (nw << PAGE_SHIFT) > next)
- nw = (next - addr) >> PAGE_SHIFT;
-
- up_write(&mm->mmap_sem);
- if (__copy_from_user(spp, map, nw * sizeof(u32)))
- return -EFAULT;
- map += nw;
- down_write(&mm->mmap_sem);
-
- /* now flush any existing HPTEs for the range */
- hpte_flush_range(mm, addr, nw);
- }
- if (limit > spt->maxaddr)
- spt->maxaddr = limit;
- err = 0;
- out:
- up_write(&mm->mmap_sem);
- return err;
-}
+++ /dev/null
-/*
- * TLB flush routines for radix kernels.
- *
- * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/memblock.h>
-#include <linux/mmu_context.h>
-#include <linux/sched/mm.h>
-
-#include <asm/ppc-opcode.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/trace.h>
-#include <asm/cputhreads.h>
-
-#define RIC_FLUSH_TLB 0
-#define RIC_FLUSH_PWC 1
-#define RIC_FLUSH_ALL 2
-
-/*
- * tlbiel instruction for radix, set invalidation
- * i.e., r=1 and is=01 or is=10 or is=11
- */
-static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
- unsigned int pid,
- unsigned int ric, unsigned int prs)
-{
- unsigned long rb;
- unsigned long rs;
-
- rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
- rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
- asm volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
- : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
- : "memory");
-}
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
-{
- unsigned int set;
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and the entire Page Walk Cache
- * and partition table entries. Then flush the remaining sets of the
- * TLB.
- */
- tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
- for (set = 1; set < num_sets; set++)
- tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
-
- /* Do the same for process scoped entries. */
- tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
- for (set = 1; set < num_sets; set++)
- tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
-
- asm volatile("ptesync": : :"memory");
-}
-
-void radix__tlbiel_all(unsigned int action)
-{
- unsigned int is;
-
- switch (action) {
- case TLB_INVAL_SCOPE_GLOBAL:
- is = 3;
- break;
- case TLB_INVAL_SCOPE_LPID:
- is = 2;
- break;
- default:
- BUG();
- }
-
- if (early_cpu_has_feature(CPU_FTR_ARCH_300))
- tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
- else
- WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
-
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void __tlbiel_pid(unsigned long pid, int set,
- unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(53); /* IS = 1 */
- rb |= set << PPC_BITLSHIFT(51);
- rs = ((unsigned long)pid) << PPC_BITLSHIFT(31);
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(53); /* IS = 1 */
- rs = pid << PPC_BITLSHIFT(31);
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbiel_lpid(unsigned long lpid, int set,
- unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(52); /* IS = 2 */
- rb |= set << PPC_BITLSHIFT(51);
- rs = 0; /* LPID comes from LPIDR */
- prs = 0; /* partition scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(52); /* IS = 2 */
- rs = lpid;
- prs = 0; /* partition scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
- unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(52); /* IS = 2 */
- rb |= set << PPC_BITLSHIFT(51);
- rs = 0; /* LPID comes from LPIDR */
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
-}
-
-
-static inline void __tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = va & ~(PPC_BITMASK(52, 63));
- rb |= ap << PPC_BITLSHIFT(58);
- rs = pid << PPC_BITLSHIFT(31);
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = va & ~(PPC_BITMASK(52, 63));
- rb |= ap << PPC_BITLSHIFT(58);
- rs = pid << PPC_BITLSHIFT(31);
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
- unsigned long ap, unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = va & ~(PPC_BITMASK(52, 63));
- rb |= ap << PPC_BITLSHIFT(58);
- rs = lpid;
- prs = 0; /* partition scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
-}
-
-static inline void fixup_tlbie(void)
-{
- unsigned long pid = 0;
- unsigned long va = ((1UL << 52) - 1);
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
- asm volatile("ptesync": : :"memory");
- __tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
- }
-}
-
-static inline void fixup_tlbie_lpid(unsigned long lpid)
-{
- unsigned long va = ((1UL << 52) - 1);
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
- asm volatile("ptesync": : :"memory");
- __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
- }
-}
-
-/*
- * We use 128 set in radix mode and 256 set in hpt mode.
- */
-static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
-{
- int set;
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
- */
- __tlbiel_pid(pid, 0, ric);
-
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
- return;
- }
-
- /* For the remaining sets, just flush the TLB */
- for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_pid(pid, set, RIC_FLUSH_TLB);
-
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
-{
- asm volatile("ptesync": : :"memory");
-
- /*
- * Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
- * in the asm statement.
- */
- switch (ric) {
- case RIC_FLUSH_TLB:
- __tlbie_pid(pid, RIC_FLUSH_TLB);
- break;
- case RIC_FLUSH_PWC:
- __tlbie_pid(pid, RIC_FLUSH_PWC);
- break;
- case RIC_FLUSH_ALL:
- default:
- __tlbie_pid(pid, RIC_FLUSH_ALL);
- }
- fixup_tlbie();
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
-{
- int set;
-
- VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
- */
- __tlbiel_lpid(lpid, 0, ric);
-
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
- return;
- }
-
- /* For the remaining sets, just flush the TLB */
- for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
-
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
-{
- asm volatile("ptesync": : :"memory");
-
- /*
- * Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
- * in the asm statement.
- */
- switch (ric) {
- case RIC_FLUSH_TLB:
- __tlbie_lpid(lpid, RIC_FLUSH_TLB);
- break;
- case RIC_FLUSH_PWC:
- __tlbie_lpid(lpid, RIC_FLUSH_PWC);
- break;
- case RIC_FLUSH_ALL:
- default:
- __tlbie_lpid(lpid, RIC_FLUSH_ALL);
- }
- fixup_tlbie_lpid(lpid);
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
-{
- int set;
-
- VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
- */
- __tlbiel_lpid_guest(lpid, 0, ric);
-
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
- return;
- }
-
- /* For the remaining sets, just flush the TLB */
- for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
-
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
-}
-
-
-static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long page_size,
- unsigned long psize)
-{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
- for (addr = start; addr < end; addr += page_size)
- __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-}
-
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long psize, unsigned long ric)
-{
- unsigned long ap = mmu_get_ap(psize);
-
- asm volatile("ptesync": : :"memory");
- __tlbiel_va(va, pid, ap, ric);
- asm volatile("ptesync": : :"memory");
-}
-
-static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long page_size,
- unsigned long psize, bool also_pwc)
-{
- asm volatile("ptesync": : :"memory");
- if (also_pwc)
- __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
- __tlbiel_va_range(start, end, pid, page_size, psize);
- asm volatile("ptesync": : :"memory");
-}
-
-static inline void __tlbie_va_range(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long page_size,
- unsigned long psize)
-{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
- for (addr = start; addr < end; addr += page_size)
- __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-}
-
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
- unsigned long psize, unsigned long ric)
-{
- unsigned long ap = mmu_get_ap(psize);
-
- asm volatile("ptesync": : :"memory");
- __tlbie_va(va, pid, ap, ric);
- fixup_tlbie();
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
- unsigned long psize, unsigned long ric)
-{
- unsigned long ap = mmu_get_ap(psize);
-
- asm volatile("ptesync": : :"memory");
- __tlbie_lpid_va(va, lpid, ap, ric);
- fixup_tlbie_lpid(lpid);
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-static inline void _tlbie_va_range(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long page_size,
- unsigned long psize, bool also_pwc)
-{
- asm volatile("ptesync": : :"memory");
- if (also_pwc)
- __tlbie_pid(pid, RIC_FLUSH_PWC);
- __tlbie_va_range(start, end, pid, page_size, psize);
- fixup_tlbie();
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-/*
- * Base TLB flushing operations:
- *
- * - flush_tlb_mm(mm) flushes the specified mm context TLB's
- * - flush_tlb_page(vma, vmaddr) flushes one page
- * - flush_tlb_range(vma, start, end) flushes a range of pages
- * - flush_tlb_kernel_range(start, end) flushes kernel pages
- *
- * - local_* variants of page and mm only apply to the current
- * processor
- */
-void radix__local_flush_tlb_mm(struct mm_struct *mm)
-{
- unsigned long pid;
-
- preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
- preempt_enable();
-}
-EXPORT_SYMBOL(radix__local_flush_tlb_mm);
-
-#ifndef CONFIG_SMP
-void radix__local_flush_all_mm(struct mm_struct *mm)
-{
- unsigned long pid;
-
- preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
- preempt_enable();
-}
-EXPORT_SYMBOL(radix__local_flush_all_mm);
-#endif /* CONFIG_SMP */
-
-void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
- int psize)
-{
- unsigned long pid;
-
- preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
- preempt_enable();
-}
-
-void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-#ifdef CONFIG_HUGETLB_PAGE
- /* need the return fix for nohash.c */
- if (is_vm_hugetlb_page(vma))
- return radix__local_flush_hugetlb_page(vma, vmaddr);
-#endif
- radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
-}
-EXPORT_SYMBOL(radix__local_flush_tlb_page);
-
-static bool mm_is_singlethreaded(struct mm_struct *mm)
-{
- if (atomic_read(&mm->context.copros) > 0)
- return false;
- if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
- return true;
- return false;
-}
-
-static bool mm_needs_flush_escalation(struct mm_struct *mm)
-{
- /*
- * P9 nest MMU has issues with the page walk cache
- * caching PTEs and not flushing them properly when
- * RIC = 0 for a PID/LPID invalidate
- */
- if (atomic_read(&mm->context.copros) > 0)
- return true;
- return false;
-}
-
-#ifdef CONFIG_SMP
-static void do_exit_flush_lazy_tlb(void *arg)
-{
- struct mm_struct *mm = arg;
- unsigned long pid = mm->context.id;
-
- if (current->mm == mm)
- return; /* Local CPU */
-
- if (current->active_mm == mm) {
- /*
- * Must be a kernel thread because sender is single-threaded.
- */
- BUG_ON(current->mm);
- mmgrab(&init_mm);
- switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
- mmdrop(mm);
- }
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
-}
-
-static void exit_flush_lazy_tlbs(struct mm_struct *mm)
-{
- /*
- * Would be nice if this was async so it could be run in
- * parallel with our local flush, but generic code does not
- * give a good API for it. Could extend the generic code or
- * make a special powerpc IPI for flushing TLBs.
- * For now it's not too performance critical.
- */
- smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
- (void *)mm, 1);
- mm_reset_thread_local(mm);
-}
-
-void radix__flush_tlb_mm(struct mm_struct *mm)
-{
- unsigned long pid;
-
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- preempt_disable();
- /*
- * Order loads of mm_cpumask vs previous stores to clear ptes before
- * the invalidate. See barrier in switch_mm_irqs_off
- */
- smp_mb();
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
-
- if (mm_needs_flush_escalation(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
- } else {
-local:
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
- }
- preempt_enable();
-}
-EXPORT_SYMBOL(radix__flush_tlb_mm);
-
-static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
-{
- unsigned long pid;
-
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- preempt_disable();
- smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- if (!fullmm) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
- }
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- } else {
-local:
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
- }
- preempt_enable();
-}
-void radix__flush_all_mm(struct mm_struct *mm)
-{
- __flush_all_mm(mm, false);
-}
-EXPORT_SYMBOL(radix__flush_all_mm);
-
-void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
-{
- tlb->need_flush_all = 1;
-}
-EXPORT_SYMBOL(radix__flush_tlb_pwc);
-
-void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
- int psize)
-{
- unsigned long pid;
-
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- preempt_disable();
- smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
- _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
- } else {
-local:
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
- }
- preempt_enable();
-}
-
-void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
-{
-#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma))
- return radix__flush_hugetlb_page(vma, vmaddr);
-#endif
- radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
-}
-EXPORT_SYMBOL(radix__flush_tlb_page);
-
-#else /* CONFIG_SMP */
-#define radix__flush_all_mm radix__local_flush_all_mm
-#endif /* CONFIG_SMP */
-
-void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
- _tlbie_pid(0, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
-
-#define TLB_FLUSH_ALL -1UL
-
-/*
- * Number of pages above which we invalidate the entire PID rather than
- * flush individual pages, for local and global flushes respectively.
- *
- * tlbie goes out to the interconnect and individual ops are more costly.
- * It also does not iterate over sets like the local tlbiel variant when
- * invalidating a full PID, so it has a far lower threshold to change from
- * individual page flushes to full-pid flushes.
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
-
-static inline void __radix__flush_tlb_range(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- bool flush_all_sizes)
-
-{
- unsigned long pid;
- unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
- unsigned long page_size = 1UL << page_shift;
- unsigned long nr_pages = (end - start) >> page_shift;
- bool local, full;
-
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- preempt_disable();
- smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- if (end != TLB_FLUSH_ALL) {
- exit_flush_lazy_tlbs(mm);
- goto is_local;
- }
- }
- local = false;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_single_page_flush_ceiling);
- } else {
-is_local:
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- }
-
- if (full) {
- if (local) {
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
- } else {
- if (mm_needs_flush_escalation(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
- }
- } else {
- bool hflush = flush_all_sizes;
- bool gflush = flush_all_sizes;
- unsigned long hstart, hend;
- unsigned long gstart, gend;
-
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
- hflush = true;
-
- if (hflush) {
- hstart = (start + PMD_SIZE - 1) & PMD_MASK;
- hend = end & PMD_MASK;
- if (hstart == hend)
- hflush = false;
- }
-
- if (gflush) {
- gstart = (start + PUD_SIZE - 1) & PUD_MASK;
- gend = end & PUD_MASK;
- if (gstart == gend)
- gflush = false;
- }
-
- asm volatile("ptesync": : :"memory");
- if (local) {
- __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
- if (hflush)
- __tlbiel_va_range(hstart, hend, pid,
- PMD_SIZE, MMU_PAGE_2M);
- if (gflush)
- __tlbiel_va_range(gstart, gend, pid,
- PUD_SIZE, MMU_PAGE_1G);
- asm volatile("ptesync": : :"memory");
- } else {
- __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
- if (hflush)
- __tlbie_va_range(hstart, hend, pid,
- PMD_SIZE, MMU_PAGE_2M);
- if (gflush)
- __tlbie_va_range(gstart, gend, pid,
- PUD_SIZE, MMU_PAGE_1G);
- fixup_tlbie();
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
- }
- }
- preempt_enable();
-}
-
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
-
-{
-#ifdef CONFIG_HUGETLB_PAGE
- if (is_vm_hugetlb_page(vma))
- return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
- __radix__flush_tlb_range(vma->vm_mm, start, end, false);
-}
-EXPORT_SYMBOL(radix__flush_tlb_range);
-
-static int radix_get_mmu_psize(int page_size)
-{
- int psize;
-
- if (page_size == (1UL << mmu_psize_defs[mmu_virtual_psize].shift))
- psize = mmu_virtual_psize;
- else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_2M].shift))
- psize = MMU_PAGE_2M;
- else if (page_size == (1UL << mmu_psize_defs[MMU_PAGE_1G].shift))
- psize = MMU_PAGE_1G;
- else
- return -1;
- return psize;
-}
-
-/*
- * Flush partition scoped LPID address translation for all CPUs.
- */
-void radix__flush_tlb_lpid_page(unsigned int lpid,
- unsigned long addr,
- unsigned long page_size)
-{
- int psize = radix_get_mmu_psize(page_size);
-
- _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
-}
-EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
-
-/*
- * Flush partition scoped PWC from LPID for all CPUs.
- */
-void radix__flush_pwc_lpid(unsigned int lpid)
-{
- _tlbie_lpid(lpid, RIC_FLUSH_PWC);
-}
-EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
-
-/*
- * Flush partition scoped translations from LPID (=LPIDR)
- */
-void radix__flush_tlb_lpid(unsigned int lpid)
-{
- _tlbie_lpid(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
-
-/*
- * Flush partition scoped translations from LPID (=LPIDR)
- */
-void radix__local_flush_tlb_lpid(unsigned int lpid)
-{
- _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
-
-/*
- * Flush process scoped translations from LPID (=LPIDR).
- * Important difference, the guest normally manages its own translations,
- * but some cases e.g., vCPU CPU migration require KVM to flush.
- */
-void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
-{
- _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
-}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
-
-
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize);
-
-void radix__tlb_flush(struct mmu_gather *tlb)
-{
- int psize = 0;
- struct mm_struct *mm = tlb->mm;
- int page_size = tlb->page_size;
- unsigned long start = tlb->start;
- unsigned long end = tlb->end;
-
- /*
- * if page size is not something we understand, do a full mm flush
- *
- * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
- * that flushes the process table entry cache upon process teardown.
- * See the comment for radix in arch_exit_mmap().
- */
- if (tlb->fullmm) {
- __flush_all_mm(mm, true);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
- } else if (mm_tlb_flush_nested(mm)) {
- /*
- * If there is a concurrent invalidation that is clearing ptes,
- * then it's possible this invalidation will miss one of those
- * cleared ptes and miss flushing the TLB. If this invalidate
- * returns before the other one flushes TLBs, that can result
- * in it returning while there are still valid TLBs inside the
- * range to be invalidated.
- *
- * See mm/memory.c:tlb_finish_mmu() for more details.
- *
- * The solution to this is ensure the entire range is always
- * flushed here. The problem for powerpc is that the flushes
- * are page size specific, so this "forced flush" would not
- * do the right thing if there are a mix of page sizes in
- * the range to be invalidated. So use __flush_tlb_range
- * which invalidates all possible page sizes in the range.
- *
- * PWC flush probably is not be required because the core code
- * shouldn't free page tables in this path, but accounting
- * for the possibility makes us a bit more robust.
- *
- * need_flush_all is an uncommon case because page table
- * teardown should be done with exclusive locks held (but
- * after locks are dropped another invalidate could come
- * in), it could be optimized further if necessary.
- */
- if (!tlb->need_flush_all)
- __radix__flush_tlb_range(mm, start, end, true);
- else
- radix__flush_all_mm(mm);
-#endif
- } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
- if (!tlb->need_flush_all)
- radix__flush_tlb_mm(mm);
- else
- radix__flush_all_mm(mm);
- } else {
- if (!tlb->need_flush_all)
- radix__flush_tlb_range_psize(mm, start, end, psize);
- else
- radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
- }
- tlb->need_flush_all = 0;
-}
-
-static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- int psize, bool also_pwc)
-{
- unsigned long pid;
- unsigned int page_shift = mmu_psize_defs[psize].shift;
- unsigned long page_size = 1UL << page_shift;
- unsigned long nr_pages = (end - start) >> page_shift;
- bool local, full;
-
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- preempt_disable();
- smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- if (end != TLB_FLUSH_ALL) {
- exit_flush_lazy_tlbs(mm);
- goto is_local;
- }
- }
- local = false;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_single_page_flush_ceiling);
- } else {
-is_local:
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- }
-
- if (full) {
- if (local) {
- _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
- } else {
- if (mm_needs_flush_escalation(mm))
- also_pwc = true;
-
- _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
- }
- } else {
- if (local)
- _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
- else
- _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
- }
- preempt_enable();
-}
-
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize)
-{
- return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
-}
-
-static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize)
-{
- __radix__flush_tlb_range_psize(mm, start, end, psize, true);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
-{
- unsigned long pid, end;
-
- pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- /* 4k page size, just blow the world */
- if (PAGE_SIZE == 0x1000) {
- radix__flush_all_mm(mm);
- return;
- }
-
- end = addr + HPAGE_PMD_SIZE;
-
- /* Otherwise first do the PWC, then iterate the pages. */
- preempt_disable();
- smp_mb(); /* see radix__flush_tlb_mm */
- if (!mm_is_thread_local(mm)) {
- if (unlikely(mm_is_singlethreaded(mm))) {
- exit_flush_lazy_tlbs(mm);
- goto local;
- }
- _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
- } else {
-local:
- _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
- }
-
- preempt_enable();
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
-}
-EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
-
-void radix__flush_tlb_all(void)
-{
- unsigned long rb,prs,r,rs;
- unsigned long ric = RIC_FLUSH_ALL;
-
- rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
- prs = 0; /* partition scoped */
- r = 1; /* radix format */
- rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
-
- asm volatile("ptesync": : :"memory");
- /*
- * now flush guest entries by passing PRS = 1 and LPID != 0
- */
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
- /*
- * now flush host entires by passing PRS = 0 and LPID == 0
- */
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-}
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
-{
- unsigned long pid = mm->context.id;
-
- if (unlikely(pid == MMU_NO_CONTEXT))
- return;
-
- /*
- * If this context hasn't run on that CPU before and KVM is
- * around, there's a slim chance that the guest on another
- * CPU just brought in obsolete translation into the TLB of
- * this CPU due to a bad prefetch using the guest PID on
- * the way into the hypervisor.
- *
- * We work around this here. If KVM is possible, we check if
- * any sibling thread is in KVM. If it is, the window may exist
- * and thus we flush that PID from the core.
- *
- * A potential future improvement would be to mark which PIDs
- * have never been used on the system and avoid it if the PID
- * is new and the process has no other cpumask bit set.
- */
- if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
- int cpu = smp_processor_id();
- int sib = cpu_first_thread_sibling(cpu);
- bool flush = false;
-
- for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
- if (sib == cpu)
- continue;
- if (!cpu_possible(sib))
- continue;
- if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
- flush = true;
- }
- if (flush)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
- }
-}
-EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
-#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+++ /dev/null
-/*
- * This file contains the routines for flushing entries from the
- * TLB and MMU hash table.
- *
- * Derived from arch/ppc64/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * Dave Engebretsen <engebret@us.ibm.com>
- * Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/bug.h>
-#include <asm/pte-walk.h>
-
-
-#include <trace/events/thp.h>
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/*
- * A linux PTE was changed and the corresponding hash table entry
- * neesd to be flushed. This function will either perform the flush
- * immediately or will batch it up if the current CPU has an active
- * batch on it.
- */
-void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned long pte, int huge)
-{
- unsigned long vpn;
- struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch);
- unsigned long vsid;
- unsigned int psize;
- int ssize;
- real_pte_t rpte;
- int i, offset;
-
- i = batch->index;
-
- /* Get page size (maybe move back to caller).
- *
- * NOTE: when using special 64K mappings in 4K environment like
- * for SPEs, we obtain the page size from the slice, which thus
- * must still exist (and thus the VMA not reused) at the time
- * of this call
- */
- if (huge) {
-#ifdef CONFIG_HUGETLB_PAGE
- psize = get_slice_psize(mm, addr);
- /* Mask the address for the correct page size */
- addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
- if (unlikely(psize == MMU_PAGE_16G))
- offset = PTRS_PER_PUD;
- else
- offset = PTRS_PER_PMD;
-#else
- BUG();
- psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
-#endif
- } else {
- psize = pte_pagesize_index(mm, addr, pte);
- /* Mask the address for the standard page size. If we
- * have a 64k page kernel, but the hardware does not
- * support 64k pages, this might be different from the
- * hardware page size encoded in the slice table. */
- addr &= PAGE_MASK;
- offset = PTRS_PER_PTE;
- }
-
-
- /* Build full vaddr */
- if (!is_kernel_addr(addr)) {
- ssize = user_segment_size(addr);
- vsid = get_user_vsid(&mm->context, addr, ssize);
- } else {
- vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
- ssize = mmu_kernel_ssize;
- }
- WARN_ON(vsid == 0);
- vpn = hpt_vpn(addr, vsid, ssize);
- rpte = __real_pte(__pte(pte), ptep, offset);
-
- /*
- * Check if we have an active batch on this CPU. If not, just
- * flush now and return.
- */
- if (!batch->active) {
- flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm));
- put_cpu_var(ppc64_tlb_batch);
- return;
- }
-
- /*
- * This can happen when we are in the middle of a TLB batch and
- * we encounter memory pressure (eg copy_page_range when it tries
- * to allocate a new pte). If we have to reclaim memory and end
- * up scanning and resetting referenced bits then our batch context
- * will change mid stream.
- *
- * We also need to ensure only one page size is present in a given
- * batch
- */
- if (i != 0 && (mm != batch->mm || batch->psize != psize ||
- batch->ssize != ssize)) {
- __flush_tlb_pending(batch);
- i = 0;
- }
- if (i == 0) {
- batch->mm = mm;
- batch->psize = psize;
- batch->ssize = ssize;
- }
- batch->pte[i] = rpte;
- batch->vpn[i] = vpn;
- batch->index = ++i;
- if (i >= PPC64_TLB_BATCH_NR)
- __flush_tlb_pending(batch);
- put_cpu_var(ppc64_tlb_batch);
-}
-
-/*
- * This function is called when terminating an mmu batch or when a batch
- * is full. It will perform the flush of all the entries currently stored
- * in a batch.
- *
- * Must be called from within some kind of spinlock/non-preempt region...
- */
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
- int i, local;
-
- i = batch->index;
- local = mm_is_thread_local(batch->mm);
- if (i == 1)
- flush_hash_page(batch->vpn[0], batch->pte[0],
- batch->psize, batch->ssize, local);
- else
- flush_hash_range(i, local);
- batch->index = 0;
-}
-
-void hash__tlb_flush(struct mmu_gather *tlb)
-{
- struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
-
- /* If there's a TLB batch pending, then we must flush it because the
- * pages are going to be freed and we really don't want to have a CPU
- * access a freed page because it has a stale TLB
- */
- if (tlbbatch->index)
- __flush_tlb_pending(tlbbatch);
-
- put_cpu_var(ppc64_tlb_batch);
-}
-
-/**
- * __flush_hash_table_range - Flush all HPTEs for a given address range
- * from the hash table (and the TLB). But keeps
- * the linux PTEs intact.
- *
- * @mm : mm_struct of the target address space (generally init_mm)
- * @start : starting address
- * @end : ending address (not included in the flush)
- *
- * This function is mostly to be used by some IO hotplug code in order
- * to remove all hash entries from a given address range used to map IO
- * space on a removed PCI-PCI bidge without tearing down the full mapping
- * since 64K pages may overlap with other bridges when using 64K pages
- * with 4K HW pages on IO space.
- *
- * Because of that usage pattern, it is implemented for small size rather
- * than speed.
- */
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
- bool is_thp;
- int hugepage_shift;
- unsigned long flags;
-
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
-
- BUG_ON(!mm->pgd);
-
- /* Note: Normally, we should only ever use a batch within a
- * PTE locked section. This violates the rule, but will work
- * since we don't actually modify the PTEs, we just flush the
- * hash while leaving the PTEs intact (including their reference
- * to being hashed). This is not the most performance oriented
- * way to do things but is fine for our needs here.
- */
- local_irq_save(flags);
- arch_enter_lazy_mmu_mode();
- for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
- &hugepage_shift);
- unsigned long pte;
-
- if (ptep == NULL)
- continue;
- pte = pte_val(*ptep);
- if (is_thp)
- trace_hugepage_invalidate(start, pte);
- if (!(pte & H_PAGE_HASHPTE))
- continue;
- if (unlikely(is_thp))
- hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
- else
- hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
- }
- arch_leave_lazy_mmu_mode();
- local_irq_restore(flags);
-}
-
-void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
-{
- pte_t *pte;
- pte_t *start_pte;
- unsigned long flags;
-
- addr = _ALIGN_DOWN(addr, PMD_SIZE);
- /* Note: Normally, we should only ever use a batch within a
- * PTE locked section. This violates the rule, but will work
- * since we don't actually modify the PTEs, we just flush the
- * hash while leaving the PTEs intact (including their reference
- * to being hashed). This is not the most performance oriented
- * way to do things but is fine for our needs here.
- */
- local_irq_save(flags);
- arch_enter_lazy_mmu_mode();
- start_pte = pte_offset_map(pmd, addr);
- for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
- unsigned long pteval = pte_val(*pte);
- if (pteval & H_PAGE_HASHPTE)
- hpte_need_flush(mm, addr, pte, pteval, 0);
- addr += PAGE_SIZE;
- }
- arch_leave_lazy_mmu_mode();
- local_irq_restore(flags);
-}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-#include <asm/byteorder.h>
-#include "vphn.h"
-
-/*
- * The associativity domain numbers are returned from the hypervisor as a
- * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
- * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
- * bytes.
- *
- * --- 16-bit fields -->
- * _________________________
- * | 0 | 1 | 2 | 3 | be_packed[0]
- * ------+-----+-----+------
- * _________________________
- * | 4 | 5 | 6 | 7 | be_packed[1]
- * -------------------------
- * ...
- * _________________________
- * | 20 | 21 | 22 | 23 | be_packed[5]
- * -------------------------
- *
- * Convert to the sequence they would appear in the ibm,associativity property.
- */
-int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
-{
- __be64 be_packed[VPHN_REGISTER_COUNT];
- int i, nr_assoc_doms = 0;
- const __be16 *field = (const __be16 *) be_packed;
- u16 last = 0;
- bool is_32bit = false;
-
-#define VPHN_FIELD_UNUSED (0xffff)
-#define VPHN_FIELD_MSB (0x8000)
-#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
-
- /* Let's fix the values returned by plpar_hcall9() */
- for (i = 0; i < VPHN_REGISTER_COUNT; i++)
- be_packed[i] = cpu_to_be64(packed[i]);
-
- for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
- u16 new = be16_to_cpup(field++);
-
- if (is_32bit) {
- /* Let's concatenate the 16 bits of this field to the
- * 15 lower bits of the previous field
- */
- unpacked[++nr_assoc_doms] =
- cpu_to_be32(last << 16 | new);
- is_32bit = false;
- } else if (new == VPHN_FIELD_UNUSED)
- /* This is the list terminator */
- break;
- else if (new & VPHN_FIELD_MSB) {
- /* Data is in the lower 15 bits of this field */
- unpacked[++nr_assoc_doms] =
- cpu_to_be32(new & VPHN_FIELD_MASK);
- } else {
- /* Data is in the lower 15 bits of this field
- * concatenated with the next 16 bit field
- */
- last = new;
- is_32bit = true;
- }
- }
-
- /* The first cell contains the length of the property */
- unpacked[0] = cpu_to_be32(nr_assoc_doms);
-
- return nr_assoc_doms;
-}
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ARCH_POWERPC_MM_VPHN_H_
-#define _ARCH_POWERPC_MM_VPHN_H_
-
-/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers.
- */
-#define VPHN_REGISTER_COUNT 6
-
-/*
- * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
- * form the complete property we have to add the length in the first cell.
- */
-#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
-
-extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
-
-#endif
-../../../../../arch/powerpc/mm/vphn.c
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.c
\ No newline at end of file
-../../../../../arch/powerpc/mm/vphn.h
\ No newline at end of file
+../../../../../arch/powerpc/mm/book3s64/vphn.h
\ No newline at end of file