Patch series "1G transparent hugepage support for device dax", v2.
The following series implements support for 1G trasparent hugepage on
x86 for device dax. The bulk of the code was written by Mathew Wilcox a
while back supporting transparent 1G hugepage for fs DAX. I have
forward ported the relevant bits to 4.10-rc. The current submission has
only the necessary code to support device DAX.
Comments from Dan Williams: So the motivation and intended user of this
functionality mirrors the motivation and users of 1GB page support in
hugetlbfs. Given expected capacities of persistent memory devices an
in-memory database may want to reduce tlb pressure beyond what they can
already achieve with 2MB mappings of a device-dax file. We have
customer feedback to that effect as Willy mentioned in his previous
version of these patches [1].
[1]: https://lkml.org/lkml/2016/1/31/52
Comments from Nilesh @ Oracle:
There are applications which have a process model; and if you assume
10,000 processes attempting to mmap all the 6TB memory available on a
server; we are looking at the following:
processes : 10,000
memory : 6TB
pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB
pmd @ 2M page size: 120,000 / 512 = ~240GB
pud @ 1G page size: 240GB / 512 = ~480MB
As you can see with 2M pages, this system will use up an exorbitant
amount of DRAM to hold the page tables; but the 1G pages finally brings
it down to a reasonable level. Memory sizes will keep increasing; so
this number will keep increasing.
An argument can be made to convert the applications from process model
to thread model, but in the real world that may not be always practical.
Hopefully this helps explain the use case where this is valuable.
This patch (of 3):
In preparation for adding the ability to handle PUD pages, convert
vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The
vm_fault structure is extended to include a union of the different page
table pointers that may be needed, and three flag bits are reserved to
indicate which type of pointer is in the union.
[ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()]
Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com
[dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path]
Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
return -1;
}
-static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
+static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
struct device *dev = &dax_dev->dev;
struct dax_region *dax_region;
return VM_FAULT_NOPAGE;
}
-static int dax_dev_fault(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
- int rc;
- struct file *filp = vma->vm_file;
- struct dax_dev *dax_dev = filp->private_data;
-
- dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
- current->comm, (vmf->flags & FAULT_FLAG_WRITE)
- ? "write" : "read", vma->vm_start, vma->vm_end);
- rcu_read_lock();
- rc = __dax_dev_fault(dax_dev, vmf);
- rcu_read_unlock();
-
- return rc;
-}
-
static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
{
unsigned long pmd_addr = vmf->address & PMD_MASK;
vmf->flags & FAULT_FLAG_WRITE);
}
-static int dax_dev_pmd_fault(struct vm_fault *vmf)
+static int dax_dev_fault(struct vm_fault *vmf)
{
int rc;
struct file *filp = vmf->vma->vm_file;
vmf->vma->vm_start, vmf->vma->vm_end);
rcu_read_lock();
- rc = __dax_dev_pmd_fault(dax_dev, vmf);
+ switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+ case FAULT_FLAG_SIZE_PTE:
+ rc = __dax_dev_pte_fault(dax_dev, vmf);
+ break;
+ case FAULT_FLAG_SIZE_PMD:
+ rc = __dax_dev_pmd_fault(dax_dev, vmf);
+ break;
+ default:
+ return VM_FAULT_FALLBACK;
+ }
rcu_read_unlock();
return rc;
static const struct vm_operations_struct dax_dev_vm_ops = {
.fault = dax_dev_fault,
- .pmd_fault = dax_dev_pmd_fault,
+ .huge_fault = dax_dev_fault,
};
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
return VM_FAULT_SIGBUS;
}
-/**
- * dax_iomap_fault - handle a page fault on a DAX file
- * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
- *
- * When a page fault occurs, filesystems may call this helper in their fault
- * or mkwrite handler for DAX files. Assumes the caller has done all the
- * necessary locking for the page fault to proceed successfully.
- */
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pte_fault(struct vm_fault *vmf,
+ const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
}
return vmf_ret;
}
-EXPORT_SYMBOL_GPL(dax_iomap_fault);
#ifdef CONFIG_FS_DAX_PMD
/*
return VM_FAULT_FALLBACK;
}
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+ const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
return result;
}
-EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#else
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops)
+{
+ return VM_FAULT_FALLBACK;
+}
#endif /* CONFIG_FS_DAX_PMD */
+
+/**
+ * dax_iomap_fault - handle a page fault on a DAX file
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in
+ * their fault handler for DAX files. dax_iomap_fault() assumes the caller
+ * has done all the necessary locking for page fault to proceed
+ * successfully.
+ */
+int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+{
+ switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
+ case FAULT_FLAG_SIZE_PTE:
+ return dax_iomap_pte_fault(vmf, ops);
+ case FAULT_FLAG_SIZE_PMD:
+ return dax_iomap_pmd_fault(vmf, ops);
+ default:
+ return VM_FAULT_FALLBACK;
+ }
+}
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
/*
- * .pmd_fault is not supported for DAX because allocation in ext2
+ * .huge_fault is not supported for DAX because allocation in ext2
* cannot be reliably aligned to huge page sizes and so pmd faults
* will always fail and fail back to regular faults.
*/
return result;
}
-static int
-ext4_dax_pmd_fault(struct vm_fault *vmf)
-{
- int result;
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct super_block *sb = inode->i_sb;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
-
- if (write) {
- sb_start_pagefault(sb);
- file_update_time(vmf->vma->vm_file);
- }
- down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- if (write)
- sb_end_pagefault(sb);
-
- return result;
-}
-
/*
* Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
* handler we check for races agaist truncate. Note that since we cycle through
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
- .pmd_fault = ext4_dax_pmd_fault,
+ .huge_fault = ext4_dax_fault,
.page_mkwrite = ext4_dax_fault,
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
/*
* Similar to xfs_filemap_fault(), the DAX fault path can call into here on
* both read and write faults. Hence we need to handle both cases. There is no
- * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * ->huge_mkwrite callout for huge pages, so we have a single function here to
* handle both cases here. @flags carries the information on the type of fault
* occuring.
*/
STATIC int
-xfs_filemap_pmd_fault(
+xfs_filemap_huge_fault(
struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
if (!IS_DAX(inode))
return VM_FAULT_FALLBACK;
- trace_xfs_filemap_pmd_fault(ip);
+ trace_xfs_filemap_huge_fault(ip);
if (vmf->flags & FAULT_FLAG_WRITE) {
sb_start_pagefault(inode->i_sb);
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (vmf->flags & FAULT_FLAG_WRITE)
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
- .pmd_fault = xfs_filemap_pmd_fault,
+ .huge_fault = xfs_filemap_huge_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
-DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
+DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
return PMD_SHIFT - PAGE_SHIFT;
return 0;
}
-int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
#else
static inline unsigned int dax_radix_order(void *entry)
{
return 0;
}
-static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
- const struct iomap_ops *ops)
-{
- return VM_FAULT_FALLBACK;
-}
#endif
int dax_pfn_mkwrite(struct vm_fault *vmf);
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
+#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */
+#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */
+#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */
+#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */
+
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
{ FAULT_FLAG_MKWRITE, "MKWRITE" }, \
unsigned long address; /* Faulting virtual address */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
+ pud_t *pud; /* Pointer to pud entry matching
+ * the 'address'
+ */
pte_t orig_pte; /* Value of PTE at the time of fault */
struct page *cow_page; /* Page handler may use for COW fault */
void (*close)(struct vm_area_struct * area);
int (*mremap)(struct vm_area_struct * area);
int (*fault)(struct vm_fault *vmf);
- int (*pmd_fault)(struct vm_fault *vmf);
+ int (*huge_fault)(struct vm_fault *vmf);
void (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
- if (vmf->vma->vm_ops->pmd_fault)
- return vmf->vma->vm_ops->pmd_fault(vmf);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf);
return VM_FAULT_FALLBACK;
}
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
- if (vmf->vma->vm_ops->pmd_fault)
- return vmf->vma->vm_ops->pmd_fault(vmf);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf);
/* COW handled on pte level: split pmd */
VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
+ int ret;
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&vmf);
+ vmf.flags |= FAULT_FLAG_SIZE_PMD;
+ ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
+ /* fall through path, remove PMD flag */
+ vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
} else {
pmd_t orig_pmd = *vmf.pmd;
- int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+ vmf.flags |= FAULT_FLAG_SIZE_PMD;
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
+ /* fall through path, remove PUD flag */
+ vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
} else {
huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;