
Commit 992de9a

Jérôme Glisse authored and torvalds committed
mm/hmm: allow to mirror vma of a file on a DAX backed filesystem
HMM mirror is a device driver helper to mirror a range of virtual addresses. It means that the process jobs running on the device can access the same virtual addresses as the CPU threads of that process. This patch adds support for mirroring mappings of files that are on a DAX block device (ie a range of virtual addresses that is an mmap of a file in a filesystem on a DAX block device). There is no reason not to support such a case when mirroring virtual addresses on a device.

Note that unlike the GUP code we do not take a page reference, hence when we back off we have nothing to undo.

[[email protected]: move THP and hugetlbfs code path behind #if KCONFIG]
Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Jérôme Glisse <[email protected]>
Reviewed-by: Ralph Campbell <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Dan Carpenter <[email protected]>
Cc: Ira Weiny <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Souptick Joarder <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 63d5066 commit 992de9a
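For orientation, here is a minimal, hypothetical sketch of the driver-side call sequence this patch enables: faulting and snapshotting a range of virtual addresses that happens to be an mmap of a file on a DAX filesystem. The helper name my_mirror_dax_range() and the simplified setup are assumptions for illustration; hmm_range_fault() and the struct hmm_range fields used (start, end, pfns) are the ones visible in this patch, while hmm_range_register(), mmap_sem handling and invalidation callbacks are deliberately omitted.

/*
 * Hypothetical driver-side sketch (not part of this commit): fault and
 * snapshot a range that may be backed by a DAX file.  Registration,
 * locking and invalidation handling are omitted for brevity.
 */
static long my_mirror_dax_range(struct hmm_range *range,
				unsigned long start, unsigned long end,
				uint64_t *pfns)
{
	long ret;

	range->start = start;	/* range assumed already registered */
	range->end = end;
	range->pfns = pfns;	/* one entry per page in [start, end) */

	/* With this patch, DAX-backed VMAs are handled like any other. */
	ret = hmm_range_fault(range, true);
	if (ret < 0)
		return ret;

	/*
	 * pfns[] now encodes a pfn and flags for each page; no page
	 * references are held, so backing off needs no undo step.
	 */
	return 0;
}

Because no page references are taken, a driver that bails out midway simply stops; there is no per-page cleanup to perform.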

1 file changed: mm/hmm.c

Lines changed: 126 additions & 21 deletions
@@ -329,6 +329,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
+	struct dev_pagemap	*pgmap;
 	unsigned long		last;
 	bool			fault;
 	bool			block;
@@ -503,12 +504,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
 				range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+	if (!pud_present(pud))
+		return 0;
+	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+				range->flags[HMM_PFN_WRITE] :
+				range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
 			      unsigned long addr,
 			      unsigned long end,
 			      uint64_t *pfns,
 			      pmd_t pmd)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	unsigned long pfn, npages, i;
@@ -524,10 +535,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
 		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + pte_index(addr);
-	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+		if (pmd_devmap(pmd)) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+		}
 		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+	}
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	hmm_vma_walk->last = end;
 	return 0;
+#else
+	/* If THP is not enabled then we should never reach that code ! */
+	return -EINVAL;
+#endif
 }
 
 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -612,10 +638,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		if (fault || write_fault)
 			goto fault;
 
+	if (pte_devmap(pte)) {
+		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+					      hmm_vma_walk->pgmap);
+		if (unlikely(!hmm_vma_walk->pgmap))
+			return -EBUSY;
+	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+		*pfn = range->values[HMM_PFN_SPECIAL];
+		return -EFAULT;
+	}
+
 	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
 	return 0;
 
 fault:
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
 	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -703,12 +743,93 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return r;
 		}
 	}
+	if (hmm_vma_walk->pgmap) {
+		/*
+		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+		 * so that we can leverage get_dev_pagemap() optimization which
+		 * will not re-take a reference on a pgmap if we already have
+		 * one.
+		 */
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep - 1);
 
 	hmm_vma_walk->last = addr;
 	return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+			    unsigned long start,
+			    unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
+	unsigned long addr = start, next;
+	pmd_t *pmdp;
+	pud_t pud;
+	int ret;
+
+again:
+	pud = READ_ONCE(*pudp);
+	if (pud_none(pud))
+		return hmm_vma_walk_hole(start, end, walk);
+
+	if (pud_huge(pud) && pud_devmap(pud)) {
+		unsigned long i, npages, pfn;
+		uint64_t *pfns, cpu_flags;
+		bool fault, write_fault;
+
+		if (!pud_present(pud))
+			return hmm_vma_walk_hole(start, end, walk);
+
+		i = (addr - range->start) >> PAGE_SHIFT;
+		npages = (end - addr) >> PAGE_SHIFT;
+		pfns = &range->pfns[i];
+
+		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+				     cpu_flags, &fault, &write_fault);
+		if (fault || write_fault)
+			return hmm_vma_walk_hole_(addr, end, fault,
+						  write_fault, walk);
+
+#ifdef CONFIG_HUGETLB_PAGE
+		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+		for (i = 0; i < npages; ++i, ++pfn) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+			pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+		}
+		if (hmm_vma_walk->pgmap) {
+			put_dev_pagemap(hmm_vma_walk->pgmap);
+			hmm_vma_walk->pgmap = NULL;
+		}
+		hmm_vma_walk->last = end;
+		return 0;
+#else
+		return -EINVAL;
+#endif
+	}
+
+	split_huge_pud(walk->vma, pudp, addr);
+	if (pud_none(*pudp))
+		goto again;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 0;
+}
+
 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 				      unsigned long start, unsigned long end,
 				      struct mm_walk *walk)
@@ -781,14 +902,6 @@ static void hmm_pfns_clear(struct hmm_range *range,
 		*pfns = range->values[HMM_PFN_NONE];
 }
 
-static void hmm_pfns_special(struct hmm_range *range)
-{
-	unsigned long addr = range->start, i = 0;
-
-	for (; addr < range->end; addr += PAGE_SIZE, i++)
-		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
 /*
  * hmm_range_register() - start tracking change to CPU page table over a range
  * @range: range
@@ -906,12 +1019,6 @@ long hmm_range_snapshot(struct hmm_range *range)
 	if (vma == NULL || (vma->vm_flags & device_vma))
 		return -EFAULT;
 
-	/* FIXME support dax */
-	if (vma_is_dax(vma)) {
-		hmm_pfns_special(range);
-		return -EINVAL;
-	}
-
 	if (is_vm_hugetlb_page(vma)) {
 		struct hstate *h = hstate_vma(vma);
 
@@ -935,6 +1042,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 	}
 
 	range->vma = vma;
+	hmm_vma_walk.pgmap = NULL;
 	hmm_vma_walk.last = start;
 	hmm_vma_walk.fault = false;
 	hmm_vma_walk.range = range;
@@ -946,6 +1054,7 @@ long hmm_range_snapshot(struct hmm_range *range)
 	mm_walk.pte_entry = NULL;
 	mm_walk.test_walk = NULL;
 	mm_walk.hugetlb_entry = NULL;
+	mm_walk.pud_entry = hmm_vma_walk_pud;
 	mm_walk.pmd_entry = hmm_vma_walk_pmd;
 	mm_walk.pte_hole = hmm_vma_walk_hole;
 	mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
@@ -1011,12 +1120,6 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	if (vma == NULL || (vma->vm_flags & device_vma))
 		return -EFAULT;
 
-	/* FIXME support dax */
-	if (vma_is_dax(vma)) {
-		hmm_pfns_special(range);
-		return -EINVAL;
-	}
-
 	if (is_vm_hugetlb_page(vma)) {
 		if (huge_page_shift(hstate_vma(vma)) !=
 		    range->page_shift &&
@@ -1039,6 +1142,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	}
 
 	range->vma = vma;
+	hmm_vma_walk.pgmap = NULL;
 	hmm_vma_walk.last = start;
 	hmm_vma_walk.fault = true;
 	hmm_vma_walk.block = block;
@@ -1051,6 +1155,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 	mm_walk.pte_entry = NULL;
 	mm_walk.test_walk = NULL;
 	mm_walk.hugetlb_entry = NULL;
+	mm_walk.pud_entry = hmm_vma_walk_pud;
 	mm_walk.pmd_entry = hmm_vma_walk_pmd;
 	mm_walk.pte_hole = hmm_vma_walk_hole;
 	mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
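As an aside, the put_dev_pagemap() placement in hmm_vma_walk_pmd() above relies on get_dev_pagemap(pfn, pgmap) returning the already-held pgmap without taking another reference when the next pfn still falls inside it. A condensed, hypothetical sketch of that pattern (the function name and loop bounds are made up for illustration):

/*
 * Illustration only: get_dev_pagemap(pfn, pgmap) returns 'pgmap' as-is,
 * without taking an extra reference, when 'pfn' still falls inside it;
 * otherwise it drops 'pgmap' and looks up (and references) the new one.
 * A single put_dev_pagemap() after the loop therefore balances the walk.
 */
static int walk_devmap_pfns(unsigned long pfn, unsigned long npfns)
{
	struct dev_pagemap *pgmap = NULL;
	unsigned long i;

	for (i = 0; i < npfns; ++i, ++pfn) {
		pgmap = get_dev_pagemap(pfn, pgmap);
		if (!pgmap)
			return -EBUSY;	/* pgmap vanished under us */
		/* ... record the pfn for the device ... */
	}
	if (pgmap)
		put_dev_pagemap(pgmap);
	return 0;
}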
