Created June 6, 2016 08:29
[EXPERIMENTAL] UKSM ported to Linux 4.5
From 3e9afc3a63c3126ee55df341547319ee4a8ac498 Mon Sep 17 00:00:00 2001 | |
From: Wang Xuerui <idontknw.wang@gmail.com> | |
Date: Mon, 6 Jun 2016 16:20:23 +0800 | |
Subject: [PATCH] add uksm patch ported for v4.5 | |
--- | |
Documentation/vm/00-INDEX | 2 + | |
Documentation/vm/uksm.txt | 59 + | |
fs/exec.c | 4 +- | |
fs/proc/meminfo.c | 6 + | |
include/asm-generic/pgtable.h | 17 +- | |
include/linux/ksm.h | 44 +- | |
include/linux/mm_types.h | 3 + | |
include/linux/mmzone.h | 5 +- | |
include/linux/sradix-tree.h | 77 + | |
include/linux/uksm.h | 146 ++ | |
kernel/fork.c | 4 +- | |
lib/Makefile | 2 +- | |
lib/sradix-tree.c | 476 ++++ | |
mm/Kconfig | 26 + | |
mm/Makefile | 3 +- | |
mm/memory.c | 39 +- | |
mm/mmap.c | 44 +- | |
mm/rmap.c | 4 +- | |
mm/uksm.c | 5525 +++++++++++++++++++++++++++++++++++++++++ | |
mm/vmstat.c | 3 + | |
20 files changed, 6459 insertions(+), 30 deletions(-) | |
create mode 100644 Documentation/vm/uksm.txt | |
create mode 100644 include/linux/sradix-tree.h | |
create mode 100644 include/linux/uksm.h | |
create mode 100644 lib/sradix-tree.c | |
create mode 100644 mm/uksm.c | |
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX | |
index 6a5e2a1..09eaa9a1 100644 | |
--- a/Documentation/vm/00-INDEX | |
+++ b/Documentation/vm/00-INDEX | |
@@ -18,6 +18,8 @@ idle_page_tracking.txt | |
- description of the idle page tracking feature. | |
ksm.txt | |
- how to use the Kernel Samepage Merging feature. | |
+uksm.txt | |
+ - Introduction to Ultra KSM | |
numa | |
- information about NUMA specific code in the Linux vm. | |
numa_memory_policy.txt | |
diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt | |
new file mode 100644 | |
index 0000000..387d96a | |
--- /dev/null | |
+++ b/Documentation/vm/uksm.txt | |
@@ -0,0 +1,59 @@ | |
+The Ultra Kernel Samepage Merging feature | |
+---------------------------------------------- | |
+/* | |
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia | |
+ * | |
+ * This is an improvement upon KSM. Some basic data structures and routines | |
+ * are borrowed from ksm.c . | |
+ * | |
+ * Its new features: | |
+ * 1. Full system scan: | |
+ * It automatically scans all user processes' anonymous VMAs. Kernel-user | |
+ * interaction to submit a memory area to KSM is no longer needed. | |
+ * | |
+ * 2. Rich area detection: | |
+ * It automatically detects rich areas containing abundant duplicated | |
+ * pages. Rich areas are given a full scan speed. Poor areas are | |
+ * sampled at a reasonable speed with very low CPU consumption. | |
+ * | |
+ * 3. Ultra Per-page scan speed improvement: | |
+ * A new hash algorithm is proposed. As a result, on a machine with a | |
+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it | |
+ * can scan memory areas that do not contain duplicated pages at a speed of | |
+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of | |
+ * 477MB/sec ~ 923MB/sec. | |
+ * | |
+ * 4. Thrashing area avoidance: | |
+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be | |
+ * filtered out. My benchmark shows it's more efficient than KSM's per-page | |
+ * hash value based volatile page detection. | |
+ * | |
+ * | |
+ * 5. Misc changes upon KSM: | |
+ * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page | |
+ * comparison. It's much faster than the default C version on x86. | |
+ * * rmap_item now has a struct page *page member to loosely cache an | |
+ * address-->page mapping, which cuts down on time-costly | |
+ * follow_page() calls. | |
+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. | |
+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ | |
+ * ksm is needed for this case. | |
+ * | |
+ * 6. Full Zero Page consideration (contributed by Figo Zhang) | |
+ * Now uksmd considers full zero pages as special pages and merges them into a | |
+ * special unswappable uksm zero page. | |
+ */ | |
+ | |
+ChangeLog: | |
+ | |
+2012-05-05 The creation of this Doc | |
+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. | |
+2012-05-28 UKSM 0.1.1.2 bug fix release | |
+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 | |
+2012-07-2 UKSM 0.1.2-beta2 | |
+2012-07-10 UKSM 0.1.2-beta3 | |
+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. | |
+2012-10-13 UKSM 0.1.2.1 Bug fixes. | |
+2012-12-31 UKSM 0.1.2.2 Minor bug fixes. | |
+2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug. | |
+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings. | |
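
A practical consequence of points 1 and 6 above is that a program never has to call madvise(MADV_MERGEABLE) for its memory to be merged. The user-space sketch below is illustrative only and is not part of the patch; the mapping size is arbitrary, and it assumes a kernel built with CONFIG_UKSM and a running uksmd. It dirties a private anonymous mapping with all-zero pages and then idles, so uksmd should eventually fold those pages into the unswappable uksm zero page, which is visible through the KsmZeroPages counter this patch adds to /proc/meminfo further down.

/* Illustrative demo: give uksmd a large run of identical (all-zero) pages. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 256UL << 20;               /* 256 MiB, arbitrary demo size */
	long pagesz = sysconf(_SC_PAGESIZE);
	unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Writing zeroes forces real, all-zero pages to be allocated; no
	 * madvise(MADV_MERGEABLE) call is needed for UKSM to scan them. */
	for (size_t off = 0; off < len; off += pagesz)
		memset(p + off, 0, pagesz);

	printf("touched %zu pages, idling while uksmd scans...\n", len / pagesz);
	pause();
	return 0;
}
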
diff --git a/fs/exec.c b/fs/exec.c | |
index dcd4ac7..80032f8 100644 | |
--- a/fs/exec.c | |
+++ b/fs/exec.c | |
@@ -19,7 +19,7 @@ | |
* current->executable is only used by the procfs. This allows a dispatch | |
* table to check for several different types of binary formats. We keep | |
* trying until we recognize the file or we run out of supported binary | |
- * formats. | |
+ * formats. | |
*/ | |
#include <linux/slab.h> | |
@@ -56,6 +56,7 @@ | |
#include <linux/pipe_fs_i.h> | |
#include <linux/oom.h> | |
#include <linux/compat.h> | |
+#include <linux/ksm.h> | |
#include <asm/uaccess.h> | |
#include <asm/mmu_context.h> | |
@@ -1162,6 +1163,7 @@ void setup_new_exec(struct linux_binprm * bprm) | |
/* An exec changes our domain. We are no longer part of the thread | |
group */ | |
current->self_exec_id++; | |
+ | |
flush_signal_handlers(current, 0); | |
do_close_on_exec(current->files); | |
} | |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c | |
index df4661a..70b9f95 100644 | |
--- a/fs/proc/meminfo.c | |
+++ b/fs/proc/meminfo.c | |
@@ -118,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |
"SUnreclaim: %8lu kB\n" | |
"KernelStack: %8lu kB\n" | |
"PageTables: %8lu kB\n" | |
+#ifdef CONFIG_UKSM | |
+ "KsmZeroPages: %8lu kB\n" | |
+#endif | |
#ifdef CONFIG_QUICKLIST | |
"Quicklists: %8lu kB\n" | |
#endif | |
@@ -176,6 +179,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |
K(global_page_state(NR_SLAB_UNRECLAIMABLE)), | |
global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, | |
K(global_page_state(NR_PAGETABLE)), | |
+#ifdef CONFIG_UKSM | |
+ K(global_page_state(NR_UKSM_ZERO_PAGES)), | |
+#endif | |
#ifdef CONFIG_QUICKLIST | |
K(quicklist_total_size()), | |
#endif | |
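
The new KsmZeroPages field only exists on kernels built with CONFIG_UKSM and can be read like any other /proc/meminfo line. A minimal sketch follows; only the field name comes from the hunk above, the rest is ordinary user-space C.

/* Print the KsmZeroPages line from /proc/meminfo, if the kernel exports it. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "KsmZeroPages:", 13) == 0)
			fputs(line, stdout);    /* e.g. "KsmZeroPages:    1234 kB" */
	}
	fclose(f);
	return 0;
}
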
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h | |
index c370b26..f387ed1 100644 | |
--- a/include/asm-generic/pgtable.h | |
+++ b/include/asm-generic/pgtable.h | |
@@ -601,12 +601,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, | |
extern void untrack_pfn_moved(struct vm_area_struct *vma); | |
#endif | |
+#ifdef CONFIG_UKSM | |
+static inline int is_uksm_zero_pfn(unsigned long pfn) | |
+{ | |
+ extern unsigned long uksm_zero_pfn; | |
+ return pfn == uksm_zero_pfn; | |
+} | |
+#else | |
+static inline int is_uksm_zero_pfn(unsigned long pfn) | |
+{ | |
+ return 0; | |
+} | |
+#endif | |
+ | |
#ifdef __HAVE_COLOR_ZERO_PAGE | |
static inline int is_zero_pfn(unsigned long pfn) | |
{ | |
extern unsigned long zero_pfn; | |
unsigned long offset_from_zero_pfn = pfn - zero_pfn; | |
- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); | |
+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); | |
} | |
#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) | |
@@ -615,7 +628,7 @@ static inline int is_zero_pfn(unsigned long pfn) | |
static inline int is_zero_pfn(unsigned long pfn) | |
{ | |
extern unsigned long zero_pfn; | |
- return pfn == zero_pfn; | |
+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); | |
} | |
static inline unsigned long my_zero_pfn(unsigned long addr) | |
diff --git a/include/linux/ksm.h b/include/linux/ksm.h | |
index 7ae216a..06861d8 100644 | |
--- a/include/linux/ksm.h | |
+++ b/include/linux/ksm.h | |
@@ -19,21 +19,6 @@ struct mem_cgroup; | |
#ifdef CONFIG_KSM | |
int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |
unsigned long end, int advice, unsigned long *vm_flags); | |
-int __ksm_enter(struct mm_struct *mm); | |
-void __ksm_exit(struct mm_struct *mm); | |
- | |
-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
-{ | |
- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) | |
- return __ksm_enter(mm); | |
- return 0; | |
-} | |
- | |
-static inline void ksm_exit(struct mm_struct *mm) | |
-{ | |
- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) | |
- __ksm_exit(mm); | |
-} | |
static inline struct stable_node *page_stable_node(struct page *page) | |
{ | |
@@ -64,6 +49,33 @@ struct page *ksm_might_need_to_copy(struct page *page, | |
int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); | |
void ksm_migrate_page(struct page *newpage, struct page *oldpage); | |
+#ifdef CONFIG_KSM_LEGACY | |
+int __ksm_enter(struct mm_struct *mm); | |
+void __ksm_exit(struct mm_struct *mm); | |
+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
+{ | |
+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) | |
+ return __ksm_enter(mm); | |
+ return 0; | |
+} | |
+ | |
+static inline void ksm_exit(struct mm_struct *mm) | |
+{ | |
+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) | |
+ __ksm_exit(mm); | |
+} | |
+ | |
+#elif defined(CONFIG_UKSM) | |
+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
+{ | |
+ return 0; | |
+} | |
+ | |
+static inline void ksm_exit(struct mm_struct *mm) | |
+{ | |
+} | |
+#endif /* !CONFIG_UKSM */ | |
+ | |
#else /* !CONFIG_KSM */ | |
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) | |
@@ -106,4 +118,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |
#endif /* CONFIG_MMU */ | |
#endif /* !CONFIG_KSM */ | |
+#include <linux/uksm.h> | |
+ | |
#endif /* __LINUX_KSM_H */ | |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h | |
index 624b78b..f48da4a 100644 | |
--- a/include/linux/mm_types.h | |
+++ b/include/linux/mm_types.h | |
@@ -351,6 +351,9 @@ struct vm_area_struct { | |
#ifdef CONFIG_NUMA | |
struct mempolicy *vm_policy; /* NUMA policy for the VMA */ | |
#endif | |
+#ifdef CONFIG_UKSM | |
+ struct vma_slot *uksm_vma_slot; | |
+#endif | |
struct vm_userfaultfd_ctx vm_userfaultfd_ctx; | |
}; | |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h | |
index 7b6c2cf..0386513 100644 | |
--- a/include/linux/mmzone.h | |
+++ b/include/linux/mmzone.h | |
@@ -157,6 +157,9 @@ enum zone_stat_item { | |
WORKINGSET_NODERECLAIM, | |
NR_ANON_TRANSPARENT_HUGEPAGES, | |
NR_FREE_CMA_PAGES, | |
+#ifdef CONFIG_UKSM | |
+ NR_UKSM_ZERO_PAGES, | |
+#endif | |
NR_VM_ZONE_STAT_ITEMS }; | |
/* | |
@@ -808,7 +811,7 @@ static inline int is_highmem_idx(enum zone_type idx) | |
} | |
/** | |
- * is_highmem - helper function to quickly check if a struct zone is a | |
+ * is_highmem - helper function to quickly check if a struct zone is a | |
* highmem zone or not. This is an attempt to keep references | |
* to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. | |
* @zone - pointer to struct zone variable | |
diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h | |
new file mode 100644 | |
index 0000000..6780fdb | |
--- /dev/null | |
+++ b/include/linux/sradix-tree.h | |
@@ -0,0 +1,77 @@ | |
+#ifndef _LINUX_SRADIX_TREE_H | |
+#define _LINUX_SRADIX_TREE_H | |
+ | |
+ | |
+#define INIT_SRADIX_TREE(root, mask) \ | |
+do { \ | |
+ (root)->height = 0; \ | |
+ (root)->gfp_mask = (mask); \ | |
+ (root)->rnode = NULL; \ | |
+} while (0) | |
+ | |
+#define ULONG_BITS (sizeof(unsigned long) * 8) | |
+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) | |
+//#define SRADIX_TREE_MAP_SHIFT 6 | |
+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) | |
+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) | |
+ | |
+struct sradix_tree_node { | |
+ unsigned int height; /* Height from the bottom */ | |
+ unsigned int count; | |
+ unsigned int fulls; /* Number of full sublevel trees */ | |
+ struct sradix_tree_node *parent; | |
+ void *stores[0]; | |
+}; | |
+ | |
+/* A simple radix tree implementation */ | |
+struct sradix_tree_root { | |
+ unsigned int height; | |
+ struct sradix_tree_node *rnode; | |
+ | |
+ /* The node last found to have available empty stores in its sublevels */ | |
+ struct sradix_tree_node *enter_node; | |
+ unsigned int shift; | |
+ unsigned int stores_size; | |
+ unsigned int mask; | |
+ unsigned long min; /* The first hole index */ | |
+ unsigned long num; | |
+ //unsigned long *height_to_maxindex; | |
+ | |
+ /* How the node is allocated and freed. */ | |
+ struct sradix_tree_node *(*alloc)(void); | |
+ void (*free)(struct sradix_tree_node *node); | |
+ | |
+ /* Called when a node or item is added or removed */ | |
+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); | |
+ void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); | |
+ void (*rm)(struct sradix_tree_node *node, unsigned offset); | |
+}; | |
+ | |
+struct sradix_tree_path { | |
+ struct sradix_tree_node *node; | |
+ int offset; | |
+}; | |
+ | |
+static inline | |
+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) | |
+{ | |
+ root->height = 0; | |
+ root->rnode = NULL; | |
+ root->shift = shift; | |
+ root->stores_size = 1UL << shift; | |
+ root->mask = root->stores_size - 1; | |
+} | |
+ | |
+ | |
+extern void *sradix_tree_next(struct sradix_tree_root *root, | |
+ struct sradix_tree_node *node, unsigned long index, | |
+ int (*iter)(void *, unsigned long)); | |
+ | |
+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); | |
+ | |
+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, | |
+ struct sradix_tree_node *node, unsigned long index); | |
+ | |
+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); | |
+ | |
+#endif /* _LINUX_SRADIX_TREE_H */ | |
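
The root above leaves node allocation entirely to its user: alloc() must return a node whose stores[] array really has 1 << shift slots, which is why users embed struct sradix_tree_node at the head of a larger struct. The sketch below shows one way such a user could be wired up; every "my_" name is hypothetical, and the real in-tree user is the slot_tree code in mm/uksm.c later in this patch, which additionally keeps per-subtree page counts via the extend/assign/rm callbacks.

/* Hypothetical sradix-tree user, shown only to illustrate the API above. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sradix-tree.h>

#define MY_SHIFT 6	/* 1 << 6 == 64 stores per node */

struct my_tree_node {
	struct sradix_tree_node snode;
	/* backs snode.stores[]; must hold exactly 1 << MY_SHIFT pointers */
	void *stores[1UL << MY_SHIFT];
};

/* static, hence zeroed: min, num and enter_node start out as 0/NULL */
static struct sradix_tree_root my_root;

static struct sradix_tree_node *my_node_alloc(void)
{
	struct my_tree_node *p = kzalloc(sizeof(*p), GFP_KERNEL);

	return p ? &p->snode : NULL;
}

static void my_node_free(struct sradix_tree_node *node)
{
	kfree(container_of(node, struct my_tree_node, snode));
}

static int my_tree_demo(void *item)
{
	init_sradix_tree_root(&my_root, MY_SHIFT);
	my_root.alloc = my_node_alloc;
	my_root.free = my_node_free;
	/* extend/assign/rm may stay NULL when no extra bookkeeping is wanted */

	if (sradix_tree_enter(&my_root, &item, 1))	/* fills the first hole, index 0 */
		return -ENOMEM;

	return sradix_tree_lookup(&my_root, 0) == item ? 0 : -EINVAL;
}
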
diff --git a/include/linux/uksm.h b/include/linux/uksm.h | |
new file mode 100644 | |
index 0000000..206f109 | |
--- /dev/null | |
+++ b/include/linux/uksm.h | |
@@ -0,0 +1,146 @@ | |
+#ifndef __LINUX_UKSM_H | |
+#define __LINUX_UKSM_H | |
+/* | |
+ * Memory merging support. | |
+ * | |
+ * This code enables dynamic sharing of identical pages found in different | |
+ * memory areas, even if they are not shared by fork(). | |
+ */ | |
+ | |
+/* if !CONFIG_UKSM this file should not be compiled at all. */ | |
+#ifdef CONFIG_UKSM | |
+ | |
+#include <linux/bitops.h> | |
+#include <linux/mm.h> | |
+#include <linux/pagemap.h> | |
+#include <linux/rmap.h> | |
+#include <linux/sched.h> | |
+ | |
+extern unsigned long zero_pfn __read_mostly; | |
+extern unsigned long uksm_zero_pfn __read_mostly; | |
+extern struct page *empty_uksm_zero_page; | |
+ | |
+/* must be done before linked to mm */ | |
+extern void uksm_vma_add_new(struct vm_area_struct *vma); | |
+extern void uksm_remove_vma(struct vm_area_struct *vma); | |
+ | |
+#define UKSM_SLOT_NEED_SORT (1 << 0) | |
+#define UKSM_SLOT_NEED_RERAND (1 << 1) | |
+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ | |
+#define UKSM_SLOT_FUL_SCANNED (1 << 3) | |
+#define UKSM_SLOT_IN_UKSM (1 << 4) | |
+ | |
+struct vma_slot { | |
+ struct sradix_tree_node *snode; | |
+ unsigned long sindex; | |
+ | |
+ struct list_head slot_list; | |
+ unsigned long fully_scanned_round; | |
+ unsigned long dedup_num; | |
+ unsigned long pages_scanned; | |
+ unsigned long last_scanned; | |
+ unsigned long pages_to_scan; | |
+ struct scan_rung *rung; | |
+ struct page **rmap_list_pool; | |
+ unsigned int *pool_counts; | |
+ unsigned long pool_size; | |
+ struct vm_area_struct *vma; | |
+ struct mm_struct *mm; | |
+ unsigned long ctime_j; | |
+ unsigned long pages; | |
+ unsigned long flags; | |
+ unsigned long pages_cowed; /* pages cowed this round */ | |
+ unsigned long pages_merged; /* pages merged this round */ | |
+ unsigned long pages_bemerged; | |
+ | |
+ /* when it has page merged in this eval round */ | |
+ struct list_head dedup_list; | |
+}; | |
+ | |
+static inline void uksm_unmap_zero_page(pte_t pte) | |
+{ | |
+ if (pte_pfn(pte) == uksm_zero_pfn) | |
+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); | |
+} | |
+ | |
+static inline void uksm_map_zero_page(pte_t pte) | |
+{ | |
+ if (pte_pfn(pte) == uksm_zero_pfn) | |
+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); | |
+} | |
+ | |
+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) | |
+{ | |
+ if (vma->uksm_vma_slot && PageKsm(page)) | |
+ vma->uksm_vma_slot->pages_cowed++; | |
+} | |
+ | |
+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) | |
+{ | |
+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) | |
+ vma->uksm_vma_slot->pages_cowed++; | |
+} | |
+ | |
+static inline int uksm_flags_can_scan(unsigned long vm_flags) | |
+{ | |
+#ifndef VM_SAO | |
+#define VM_SAO 0 | |
+#endif | |
+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | | |
+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED | |
+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO)); | |
+} | |
+ | |
+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) | |
+{ | |
+ if (uksm_flags_can_scan(*vm_flags_p)) | |
+ *vm_flags_p |= VM_MERGEABLE; | |
+} | |
+ | |
+/* | |
+ * Just a BUG_ON wrapper for places where the uksm zero page must not be. TODO: it will | |
+ * be removed when uksm zero page patch is stable enough. | |
+ */ | |
+static inline void uksm_bugon_zeropage(pte_t pte) | |
+{ | |
+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); | |
+} | |
+#else | |
+static inline void uksm_vma_add_new(struct vm_area_struct *vma) | |
+{ | |
+} | |
+ | |
+static inline void uksm_remove_vma(struct vm_area_struct *vma) | |
+{ | |
+} | |
+ | |
+static inline void uksm_unmap_zero_page(pte_t pte) | |
+{ | |
+} | |
+ | |
+static inline void uksm_map_zero_page(pte_t pte) | |
+{ | |
+} | |
+ | |
+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) | |
+{ | |
+} | |
+ | |
+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) | |
+{ | |
+} | |
+ | |
+static inline int uksm_flags_can_scan(unsigned long vm_flags) | |
+{ | |
+ return 0; | |
+} | |
+ | |
+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) | |
+{ | |
+} | |
+ | |
+static inline void uksm_bugon_zeropage(pte_t pte) | |
+{ | |
+} | |
+#endif /* !CONFIG_UKSM */ | |
+#endif /* __LINUX_UKSM_H */ | |
diff --git a/kernel/fork.c b/kernel/fork.c | |
index 2e391c7..f917401 100644 | |
--- a/kernel/fork.c | |
+++ b/kernel/fork.c | |
@@ -444,7 +444,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |
goto fail_nomem; | |
charge = len; | |
} | |
- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | |
+ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | |
if (!tmp) | |
goto fail_nomem; | |
*tmp = *mpnt; | |
@@ -497,7 +497,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |
__vma_link_rb(mm, tmp, rb_link, rb_parent); | |
rb_link = &tmp->vm_rb.rb_right; | |
rb_parent = &tmp->vm_rb; | |
- | |
+ uksm_vma_add_new(tmp); | |
mm->map_count++; | |
retval = copy_page_range(mm, oldmm, mpnt); | |
diff --git a/lib/Makefile b/lib/Makefile | |
index a7c26a4..1bf34ea 100644 | |
--- a/lib/Makefile | |
+++ b/lib/Makefile | |
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) | |
endif | |
lib-y := ctype.o string.o vsprintf.o cmdline.o \ | |
- rbtree.o radix-tree.o dump_stack.o timerqueue.o\ | |
+ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ | |
idr.o int_sqrt.o extable.o \ | |
sha1.o md5.o irq_regs.o argv_split.o \ | |
proportions.o flex_proportions.o ratelimit.o show_mem.o \ | |
diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c | |
new file mode 100644 | |
index 0000000..8d06329 | |
--- /dev/null | |
+++ b/lib/sradix-tree.c | |
@@ -0,0 +1,476 @@ | |
+#include <linux/errno.h> | |
+#include <linux/mm.h> | |
+#include <linux/mman.h> | |
+#include <linux/spinlock.h> | |
+#include <linux/slab.h> | |
+#include <linux/gcd.h> | |
+#include <linux/sradix-tree.h> | |
+ | |
+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) | |
+{ | |
+ return node->fulls == root->stores_size || | |
+ (node->height == 1 && node->count == root->stores_size); | |
+} | |
+ | |
+/* | |
+ * Extend a sradix tree so it can store key @index. | |
+ */ | |
+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) | |
+{ | |
+ struct sradix_tree_node *node; | |
+ unsigned int height; | |
+ | |
+ if (unlikely(root->rnode == NULL)) { | |
+ if (!(node = root->alloc())) | |
+ return -ENOMEM; | |
+ | |
+ node->height = 1; | |
+ root->rnode = node; | |
+ root->height = 1; | |
+ } | |
+ | |
+ /* Figure out what the height should be. */ | |
+ height = root->height; | |
+ index >>= root->shift * height; | |
+ | |
+ while (index) { | |
+ index >>= root->shift; | |
+ height++; | |
+ } | |
+ | |
+ while (height > root->height) { | |
+ unsigned int newheight; | |
+ if (!(node = root->alloc())) | |
+ return -ENOMEM; | |
+ | |
+ /* Increase the height. */ | |
+ node->stores[0] = root->rnode; | |
+ root->rnode->parent = node; | |
+ if (root->extend) | |
+ root->extend(node, root->rnode); | |
+ | |
+ newheight = root->height + 1; | |
+ node->height = newheight; | |
+ node->count = 1; | |
+ if (sradix_node_full(root, root->rnode)) | |
+ node->fulls = 1; | |
+ | |
+ root->rnode = node; | |
+ root->height = newheight; | |
+ } | |
+ | |
+ return 0; | |
+} | |
+ | |
+/* | |
+ * Search for the next item from the current node that is not NULL | |
+ * and satisfies the given iter() callback. | |
+ */ | |
+void *sradix_tree_next(struct sradix_tree_root *root, | |
+ struct sradix_tree_node *node, unsigned long index, | |
+ int (*iter)(void *item, unsigned long height)) | |
+{ | |
+ unsigned long offset; | |
+ void *item; | |
+ | |
+ if (unlikely(node == NULL)) { | |
+ node = root->rnode; | |
+ for (offset = 0; offset < root->stores_size; offset++) { | |
+ item = node->stores[offset]; | |
+ if (item && (!iter || iter(item, node->height))) | |
+ break; | |
+ } | |
+ | |
+ if (unlikely(offset >= root->stores_size)) | |
+ return NULL; | |
+ | |
+ if (node->height == 1) | |
+ return item; | |
+ else | |
+ goto go_down; | |
+ } | |
+ | |
+ while (node) { | |
+ offset = (index & root->mask) + 1; | |
+ for (;offset < root->stores_size; offset++) { | |
+ item = node->stores[offset]; | |
+ if (item && (!iter || iter(item, node->height))) | |
+ break; | |
+ } | |
+ | |
+ if (offset < root->stores_size) | |
+ break; | |
+ | |
+ node = node->parent; | |
+ index >>= root->shift; | |
+ } | |
+ | |
+ if (!node) | |
+ return NULL; | |
+ | |
+ while (node->height > 1) { | |
+go_down: | |
+ node = item; | |
+ for (offset = 0; offset < root->stores_size; offset++) { | |
+ item = node->stores[offset]; | |
+ if (item && (!iter || iter(item, node->height))) | |
+ break; | |
+ } | |
+ | |
+ if (unlikely(offset >= root->stores_size)) | |
+ return NULL; | |
+ } | |
+ | |
+ BUG_ON(offset > root->stores_size); | |
+ | |
+ return item; | |
+} | |
+ | |
+/* | |
+ * Blindly insert the item into the tree. Typically, we reuse the | |
+ * first empty store item. | |
+ */ | |
+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) | |
+{ | |
+ unsigned long index; | |
+ unsigned int height; | |
+ struct sradix_tree_node *node, *tmp = NULL; | |
+ int offset, offset_saved; | |
+ void **store = NULL; | |
+ int error, i, j, shift; | |
+ | |
+go_on: | |
+ index = root->min; | |
+ | |
+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { | |
+ node = root->enter_node; | |
+ BUG_ON((index >> (root->shift * root->height))); | |
+ } else { | |
+ node = root->rnode; | |
+ if (node == NULL || (index >> (root->shift * root->height)) | |
+ || sradix_node_full(root, node)) { | |
+ error = sradix_tree_extend(root, index); | |
+ if (error) | |
+ return error; | |
+ | |
+ node = root->rnode; | |
+ } | |
+ } | |
+ | |
+ | |
+ height = node->height; | |
+ shift = (height - 1) * root->shift; | |
+ offset = (index >> shift) & root->mask; | |
+ while (shift > 0) { | |
+ offset_saved = offset; | |
+ for (; offset < root->stores_size; offset++) { | |
+ store = &node->stores[offset]; | |
+ tmp = *store; | |
+ | |
+ if (!tmp || !sradix_node_full(root, tmp)) | |
+ break; | |
+ } | |
+ BUG_ON(offset >= root->stores_size); | |
+ | |
+ if (offset != offset_saved) { | |
+ index += (offset - offset_saved) << shift; | |
+ index &= ~((1UL << shift) - 1); | |
+ } | |
+ | |
+ if (!tmp) { | |
+ if (!(tmp = root->alloc())) | |
+ return -ENOMEM; | |
+ | |
+ tmp->height = shift / root->shift; | |
+ *store = tmp; | |
+ tmp->parent = node; | |
+ node->count++; | |
+// if (root->extend) | |
+// root->extend(node, tmp); | |
+ } | |
+ | |
+ node = tmp; | |
+ shift -= root->shift; | |
+ offset = (index >> shift) & root->mask; | |
+ } | |
+ | |
+ BUG_ON(node->height != 1); | |
+ | |
+ | |
+ store = &node->stores[offset]; | |
+ for (i = 0, j = 0; | |
+ j < root->stores_size - node->count && | |
+ i < root->stores_size - offset && j < num; i++) { | |
+ if (!store[i]) { | |
+ store[i] = item[j]; | |
+ if (root->assign) | |
+ root->assign(node, index + i, item[j]); | |
+ j++; | |
+ } | |
+ } | |
+ | |
+ node->count += j; | |
+ root->num += j; | |
+ num -= j; | |
+ | |
+ while (sradix_node_full(root, node)) { | |
+ node = node->parent; | |
+ if (!node) | |
+ break; | |
+ | |
+ node->fulls++; | |
+ } | |
+ | |
+ if (unlikely(!node)) { | |
+ /* All nodes are full */ | |
+ root->min = 1 << (root->height * root->shift); | |
+ root->enter_node = NULL; | |
+ } else { | |
+ root->min = index + i - 1; | |
+ root->min |= (1UL << (node->height - 1)) - 1; | |
+ root->min++; | |
+ root->enter_node = node; | |
+ } | |
+ | |
+ if (num) { | |
+ item += j; | |
+ goto go_on; | |
+ } | |
+ | |
+ return 0; | |
+} | |
+ | |
+ | |
+/** | |
+ * sradix_tree_shrink - shrink the height of an sradix tree to the minimum | |
+ * @root: sradix tree root | |
+ * | |
+ */ | |
+static inline void sradix_tree_shrink(struct sradix_tree_root *root) | |
+{ | |
+ /* try to shrink tree height */ | |
+ while (root->height > 1) { | |
+ struct sradix_tree_node *to_free = root->rnode; | |
+ | |
+ /* | |
+ * If the candidate node has more than one child, or its child | |
+ * is not at the leftmost store, we cannot shrink. | |
+ */ | |
+ if (to_free->count != 1 || !to_free->stores[0]) | |
+ break; | |
+ | |
+ root->rnode = to_free->stores[0]; | |
+ root->rnode->parent = NULL; | |
+ root->height--; | |
+ if (unlikely(root->enter_node == to_free)) { | |
+ root->enter_node = NULL; | |
+ } | |
+ root->free(to_free); | |
+ } | |
+} | |
+ | |
+/* | |
+ * Delete the item at the known leaf node and index | |
+ */ | |
+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, | |
+ struct sradix_tree_node *node, unsigned long index) | |
+{ | |
+ unsigned int offset; | |
+ struct sradix_tree_node *start, *end; | |
+ | |
+ BUG_ON(node->height != 1); | |
+ | |
+ start = node; | |
+ while (node && !(--node->count)) | |
+ node = node->parent; | |
+ | |
+ end = node; | |
+ if (!node) { | |
+ root->rnode = NULL; | |
+ root->height = 0; | |
+ root->min = 0; | |
+ root->num = 0; | |
+ root->enter_node = NULL; | |
+ } else { | |
+ offset = (index >> (root->shift * (node->height - 1))) & root->mask; | |
+ if (root->rm) | |
+ root->rm(node, offset); | |
+ node->stores[offset] = NULL; | |
+ root->num--; | |
+ if (root->min > index) { | |
+ root->min = index; | |
+ root->enter_node = node; | |
+ } | |
+ } | |
+ | |
+ if (start != end) { | |
+ do { | |
+ node = start; | |
+ start = start->parent; | |
+ if (unlikely(root->enter_node == node)) | |
+ root->enter_node = end; | |
+ root->free(node); | |
+ } while (start != end); | |
+ | |
+ /* | |
+ * Note that shrink may free "end", so enter_node still needs to | |
+ * be checked inside. | |
+ */ | |
+ sradix_tree_shrink(root); | |
+ } else if (node->count == root->stores_size - 1) { | |
+ /* It WAS a full leaf node. Update the ancestors */ | |
+ node = node->parent; | |
+ while (node) { | |
+ node->fulls--; | |
+ if (node->fulls != root->stores_size - 1) | |
+ break; | |
+ | |
+ node = node->parent; | |
+ } | |
+ } | |
+} | |
+ | |
+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) | |
+{ | |
+ unsigned int height, offset; | |
+ struct sradix_tree_node *node; | |
+ int shift; | |
+ | |
+ node = root->rnode; | |
+ if (node == NULL || (index >> (root->shift * root->height))) | |
+ return NULL; | |
+ | |
+ height = root->height; | |
+ shift = (height - 1) * root->shift; | |
+ | |
+ do { | |
+ offset = (index >> shift) & root->mask; | |
+ node = node->stores[offset]; | |
+ if (!node) | |
+ return NULL; | |
+ | |
+ shift -= root->shift; | |
+ } while (shift >= 0); | |
+ | |
+ return node; | |
+} | |
+ | |
+/* | |
+ * Return the item if it exists, otherwise create it in place | |
+ * and return the created item. | |
+ */ | |
+void *sradix_tree_lookup_create(struct sradix_tree_root *root, | |
+ unsigned long index, void *(*item_alloc)(void)) | |
+{ | |
+ unsigned int height, offset; | |
+ struct sradix_tree_node *node, *tmp; | |
+ void *item; | |
+ int shift, error; | |
+ | |
+ if (root->rnode == NULL || (index >> (root->shift * root->height))) { | |
+ if (item_alloc) { | |
+ error = sradix_tree_extend(root, index); | |
+ if (error) | |
+ return NULL; | |
+ } else { | |
+ return NULL; | |
+ } | |
+ } | |
+ | |
+ node = root->rnode; | |
+ height = root->height; | |
+ shift = (height - 1) * root->shift; | |
+ | |
+ do { | |
+ offset = (index >> shift) & root->mask; | |
+ if (!node->stores[offset]) { | |
+ if (!(tmp = root->alloc())) | |
+ return NULL; | |
+ | |
+ tmp->height = shift / root->shift; | |
+ node->stores[offset] = tmp; | |
+ tmp->parent = node; | |
+ node->count++; | |
+ node = tmp; | |
+ } else { | |
+ node = node->stores[offset]; | |
+ } | |
+ | |
+ shift -= root->shift; | |
+ } while (shift > 0); | |
+ | |
+ BUG_ON(node->height != 1); | |
+ offset = index & root->mask; | |
+ if (node->stores[offset]) { | |
+ return node->stores[offset]; | |
+ } else if (item_alloc) { | |
+ if (!(item = item_alloc())) | |
+ return NULL; | |
+ | |
+ node->stores[offset] = item; | |
+ | |
+ /* | |
+ * NOTE: we do NOT call root->assign here, since this item is | |
+ * newly created by us and has no meaning yet. The caller can invoke | |
+ * root->assign itself if necessary. | |
+ */ | |
+ | |
+ node->count++; | |
+ root->num++; | |
+ | |
+ while (sradix_node_full(root, node)) { | |
+ node = node->parent; | |
+ if (!node) | |
+ break; | |
+ | |
+ node->fulls++; | |
+ } | |
+ | |
+ if (unlikely(!node)) { | |
+ /* All nodes are full */ | |
+ root->min = 1 << (root->height * root->shift); | |
+ } else { | |
+ if (root->min == index) { | |
+ root->min |= (1UL << (node->height - 1)) - 1; | |
+ root->min++; | |
+ root->enter_node = node; | |
+ } | |
+ } | |
+ | |
+ return item; | |
+ } else { | |
+ return NULL; | |
+ } | |
+ | |
+} | |
+ | |
+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) | |
+{ | |
+ unsigned int height, offset; | |
+ struct sradix_tree_node *node; | |
+ int shift; | |
+ | |
+ node = root->rnode; | |
+ if (node == NULL || (index >> (root->shift * root->height))) | |
+ return -ENOENT; | |
+ | |
+ height = root->height; | |
+ shift = (height - 1) * root->shift; | |
+ | |
+ do { | |
+ offset = (index >> shift) & root->mask; | |
+ node = node->stores[offset]; | |
+ if (!node) | |
+ return -ENOENT; | |
+ | |
+ shift -= root->shift; | |
+ } while (shift > 0); | |
+ | |
+ offset = index & root->mask; | |
+ if (!node->stores[offset]) | |
+ return -ENOENT; | |
+ | |
+ sradix_tree_delete_from_leaf(root, node, index); | |
+ | |
+ return 0; | |
+} | |
diff --git a/mm/Kconfig b/mm/Kconfig | |
index 03cbfa0..1b1b681 100644 | |
--- a/mm/Kconfig | |
+++ b/mm/Kconfig | |
@@ -322,6 +322,32 @@ config KSM | |
See Documentation/vm/ksm.txt for more information: KSM is inactive | |
until a program has madvised that an area is MADV_MERGEABLE, and | |
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). | |
+choice | |
+ prompt "Choose UKSM/KSM strategy" | |
+ default UKSM | |
+ depends on KSM | |
+ help | |
+ This option allows selecting a UKSM/KSM strategy. | |
+ | |
+config UKSM | |
+ bool "Ultra-KSM for page merging" | |
+ depends on KSM | |
+ help | |
+ UKSM is inspired by the Linux kernel project KSM (Kernel Samepage | |
+ Merging), but with a fundamentally rewritten core algorithm. With | |
+ an advanced algorithm, UKSM can now transparently scan all anonymously | |
+ mapped user space applications with significantly improved scan speed | |
+ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from | |
+ UKSM. UKSM now has its first stable release and its first real-world enterprise user. | |
+ For more information, please go to its project page | |
+ (www.kerneldedup.org) | |
+ | |
+config KSM_LEGACY | |
+ bool "Legacy KSM implementation" | |
+ depends on KSM | |
+ help | |
+ The legacy KSM implementation from Red Hat. | |
+endchoice | |
config DEFAULT_MMAP_MIN_ADDR | |
int "Low address space to protect from user allocation" | |
diff --git a/mm/Makefile b/mm/Makefile | |
index 2ed4319..90fd58a 100644 | |
--- a/mm/Makefile | |
+++ b/mm/Makefile | |
@@ -47,7 +47,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o | |
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | |
obj-$(CONFIG_SLOB) += slob.o | |
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | |
-obj-$(CONFIG_KSM) += ksm.o | |
+obj-$(CONFIG_KSM_LEGACY) += ksm.o | |
+obj-$(CONFIG_UKSM) += uksm.o | |
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | |
obj-$(CONFIG_SLAB) += slab.o | |
obj-$(CONFIG_SLUB) += slub.o | |
diff --git a/mm/memory.c b/mm/memory.c | |
index 8132787..b6175a5 100644 | |
--- a/mm/memory.c | |
+++ b/mm/memory.c | |
@@ -122,6 +122,28 @@ unsigned long highest_memmap_pfn __read_mostly; | |
EXPORT_SYMBOL(zero_pfn); | |
+#ifdef CONFIG_UKSM | |
+unsigned long uksm_zero_pfn __read_mostly; | |
+EXPORT_SYMBOL_GPL(uksm_zero_pfn); | |
+struct page *empty_uksm_zero_page; | |
+ | |
+static int __init setup_uksm_zero_page(void) | |
+{ | |
+ unsigned long addr; | |
+ addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); | |
+ if (!addr) | |
+ panic("Oh boy, that early out of memory?"); | |
+ | |
+ empty_uksm_zero_page = virt_to_page((void *) addr); | |
+ SetPageReserved(empty_uksm_zero_page); | |
+ | |
+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); | |
+ | |
+ return 0; | |
+} | |
+core_initcall(setup_uksm_zero_page); | |
+#endif | |
+ | |
/* | |
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() | |
*/ | |
@@ -133,6 +155,7 @@ static int __init init_zero_pfn(void) | |
core_initcall(init_zero_pfn); | |
+ | |
#if defined(SPLIT_RSS_COUNTING) | |
void sync_mm_rss(struct mm_struct *mm) | |
@@ -867,6 +890,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |
get_page(page); | |
page_dup_rmap(page, false); | |
rss[mm_counter(page)]++; | |
+ | |
+ /* Should return NULL in vm_normal_page() */ | |
+ uksm_bugon_zeropage(pte); | |
+ } else { | |
+ uksm_map_zero_page(pte); | |
} | |
out_set_pte: | |
@@ -1100,8 +1128,10 @@ again: | |
ptent = ptep_get_and_clear_full(mm, addr, pte, | |
tlb->fullmm); | |
tlb_remove_tlb_entry(tlb, pte, addr); | |
- if (unlikely(!page)) | |
+ if (unlikely(!page)) { | |
+ uksm_unmap_zero_page(ptent); | |
continue; | |
+ } | |
if (!PageAnon(page)) { | |
if (pte_dirty(ptent)) { | |
@@ -1937,8 +1967,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |
clear_page(kaddr); | |
kunmap_atomic(kaddr); | |
flush_dcache_page(dst); | |
- } else | |
+ } else { | |
copy_user_highpage(dst, src, va, vma); | |
+ uksm_cow_page(vma, src); | |
+ } | |
} | |
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) | |
@@ -2083,6 +2115,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |
new_page = alloc_zeroed_user_highpage_movable(vma, address); | |
if (!new_page) | |
goto oom; | |
+ uksm_cow_pte(vma, orig_pte); | |
} else { | |
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | |
if (!new_page) | |
@@ -2108,7 +2141,9 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |
mm_counter_file(old_page)); | |
inc_mm_counter_fast(mm, MM_ANONPAGES); | |
} | |
+ uksm_bugon_zeropage(orig_pte); | |
} else { | |
+ uksm_unmap_zero_page(orig_pte); | |
inc_mm_counter_fast(mm, MM_ANONPAGES); | |
} | |
flush_cache_page(vma, address, pte_pfn(orig_pte)); | |
diff --git a/mm/mmap.c b/mm/mmap.c | |
index 76d1ec2..60b6347 100644 | |
--- a/mm/mmap.c | |
+++ b/mm/mmap.c | |
@@ -41,6 +41,7 @@ | |
#include <linux/notifier.h> | |
#include <linux/memory.h> | |
#include <linux/printk.h> | |
+#include <linux/ksm.h> | |
#include <linux/userfaultfd_k.h> | |
#include <linux/moduleparam.h> | |
@@ -292,6 +293,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |
if (vma->vm_file) | |
fput(vma->vm_file); | |
mpol_put(vma_policy(vma)); | |
+ uksm_remove_vma(vma); | |
kmem_cache_free(vm_area_cachep, vma); | |
return next; | |
} | |
@@ -756,9 +758,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |
long adjust_next = 0; | |
int remove_next = 0; | |
+/* | |
+ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is | |
+ * acquired | |
+ */ | |
+ uksm_remove_vma(vma); | |
+ | |
if (next && !insert) { | |
struct vm_area_struct *exporter = NULL; | |
+ uksm_remove_vma(next); | |
if (end >= next->vm_end) { | |
/* | |
* vma expands, overlapping all the next, and | |
@@ -852,6 +861,7 @@ again: remove_next = 1 + (end > next->vm_end); | |
end_changed = true; | |
} | |
vma->vm_pgoff = pgoff; | |
+ | |
if (adjust_next) { | |
next->vm_start += adjust_next << PAGE_SHIFT; | |
next->vm_pgoff += adjust_next; | |
@@ -922,16 +932,22 @@ again: remove_next = 1 + (end > next->vm_end); | |
* up the code too much to do both in one go. | |
*/ | |
next = vma->vm_next; | |
- if (remove_next == 2) | |
+ if (remove_next == 2) { | |
+ uksm_remove_vma(next); | |
goto again; | |
- else if (next) | |
+ } else if (next) { | |
vma_gap_update(next); | |
- else | |
+ } else { | |
mm->highest_vm_end = end; | |
+ } | |
+ } else { | |
+ if (next && !insert) | |
+ uksm_vma_add_new(next); | |
} | |
if (insert && file) | |
uprobe_mmap(insert); | |
+ uksm_vma_add_new(vma); | |
validate_mm(mm); | |
return 0; | |
@@ -1316,6 +1332,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, | |
vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | | |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | |
+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ | |
+ uksm_vm_flags_mod(&vm_flags); | |
+ | |
if (flags & MAP_LOCKED) | |
if (!can_do_mlock()) | |
return -EPERM; | |
@@ -1654,6 +1673,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |
allow_write_access(file); | |
} | |
file = vma->vm_file; | |
+ uksm_vma_add_new(vma); | |
out: | |
perf_event_mmap(vma); | |
@@ -1695,6 +1715,7 @@ allow_write_and_free_vma: | |
if (vm_flags & VM_DENYWRITE) | |
allow_write_access(file); | |
free_vma: | |
+ uksm_remove_vma(vma); | |
kmem_cache_free(vm_area_cachep, vma); | |
unacct_error: | |
if (charged) | |
@@ -2490,6 +2511,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |
else | |
err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | |
+ uksm_vma_add_new(new); | |
+ | |
/* Success. */ | |
if (!err) | |
return 0; | |
@@ -2750,6 +2773,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |
return addr; | |
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | |
+ uksm_vm_flags_mod(&flags); | |
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); | |
if (offset_in_page(error)) | |
@@ -2807,6 +2831,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |
vma->vm_flags = flags; | |
vma->vm_page_prot = vm_get_page_prot(flags); | |
vma_link(mm, vma, prev, rb_link, rb_parent); | |
+ uksm_vma_add_new(vma); | |
out: | |
perf_event_mmap(vma); | |
mm->total_vm += len >> PAGE_SHIFT; | |
@@ -2843,6 +2868,12 @@ void exit_mmap(struct mm_struct *mm) | |
/* mm's last user has gone, and its about to be pulled down */ | |
mmu_notifier_release(mm); | |
+ /* | |
+ * Taking write lock on mmap_sem does not harm others, | |
+ * but it's crucial for uksm to avoid races. | |
+ */ | |
+ down_write(&mm->mmap_sem); | |
+ | |
if (mm->locked_vm) { | |
vma = mm->mmap; | |
while (vma) { | |
@@ -2878,6 +2909,11 @@ void exit_mmap(struct mm_struct *mm) | |
vma = remove_vma(vma); | |
} | |
vm_unacct_memory(nr_accounted); | |
+ | |
+ mm->mmap = NULL; | |
+ mm->mm_rb = RB_ROOT; | |
+ vmacache_invalidate(mm); | |
+ up_write(&mm->mmap_sem); | |
} | |
/* Insert vm structure into process list sorted by address | |
@@ -2987,6 +3023,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |
new_vma->vm_ops->open(new_vma); | |
vma_link(mm, new_vma, prev, rb_link, rb_parent); | |
*need_rmap_locks = false; | |
+ uksm_vma_add_new(new_vma); | |
} | |
return new_vma; | |
@@ -3116,6 +3153,7 @@ static struct vm_area_struct *__install_special_mapping( | |
vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); | |
perf_event_mmap(vma); | |
+ uksm_vma_add_new(vma); | |
return vma; | |
diff --git a/mm/rmap.c b/mm/rmap.c | |
index 79f3bf0..ea35ca3 100644 | |
--- a/mm/rmap.c | |
+++ b/mm/rmap.c | |
@@ -1125,9 +1125,9 @@ void page_move_anon_rmap(struct page *page, | |
/** | |
* __page_set_anon_rmap - set up new anonymous rmap | |
- * @page: Page to add to rmap | |
+ * @page: Page to add to rmap | |
* @vma: VM area to add page to. | |
- * @address: User virtual address of the mapping | |
+ * @address: User virtual address of the mapping | |
* @exclusive: the page is exclusively owned by the current process | |
*/ | |
static void __page_set_anon_rmap(struct page *page, | |
diff --git a/mm/uksm.c b/mm/uksm.c | |
new file mode 100644 | |
index 0000000..bfa97c9 | |
--- /dev/null | |
+++ b/mm/uksm.c | |
@@ -0,0 +1,5525 @@ | |
+/* | |
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia | |
+ * | |
+ * This is an improvement upon KSM. Some basic data structures and routines | |
+ * are borrowed from ksm.c . | |
+ * | |
+ * Its new features: | |
+ * 1. Full system scan: | |
+ * It automatically scans all user processes' anonymous VMAs. Kernel-user | |
+ * interaction to submit a memory area to KSM is no longer needed. | |
+ * | |
+ * 2. Rich area detection: | |
+ * It automatically detects rich areas containing abundant duplicated | |
+ * pages. Rich areas are given a full scan speed. Poor areas are | |
+ * sampled at a reasonable speed with very low CPU consumption. | |
+ * | |
+ * 3. Ultra Per-page scan speed improvement: | |
+ * A new hash algorithm is proposed. As a result, on a machine with a | |
+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it | |
+ * can scan memory areas that do not contain duplicated pages at a speed of | |
+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of | |
+ * 477MB/sec ~ 923MB/sec. | |
+ * | |
+ * 4. Thrashing area avoidance: | |
+ * A thrashing area (a VMA that has frequent KSM page break-outs) can be | |
+ * filtered out. My benchmark shows it's more efficient than KSM's per-page | |
+ * hash value based volatile page detection. | |
+ * | |
+ * | |
+ * 5. Misc changes upon KSM: | |
+ * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page | |
+ * comparison. It's much faster than the default C version on x86. | |
+ * * rmap_item now has a struct page *page member to loosely cache an | |
+ * address-->page mapping, which cuts down on time-costly | |
+ * follow_page() calls. | |
+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. | |
+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ | |
+ * ksm is needed for this case. | |
+ * | |
+ * 6. Full Zero Page consideration (contributed by Figo Zhang) | |
+ * Now uksmd considers full zero pages as special pages and merges them into a | |
+ * special unswappable uksm zero page. | |
+ */ | |
+ | |
+#include <linux/errno.h> | |
+#include <linux/mm.h> | |
+#include <linux/fs.h> | |
+#include <linux/mman.h> | |
+#include <linux/sched.h> | |
+#include <linux/rwsem.h> | |
+#include <linux/pagemap.h> | |
+#include <linux/rmap.h> | |
+#include <linux/spinlock.h> | |
+#include <linux/jhash.h> | |
+#include <linux/delay.h> | |
+#include <linux/kthread.h> | |
+#include <linux/wait.h> | |
+#include <linux/slab.h> | |
+#include <linux/rbtree.h> | |
+#include <linux/memory.h> | |
+#include <linux/mmu_notifier.h> | |
+#include <linux/swap.h> | |
+#include <linux/ksm.h> | |
+#include <linux/crypto.h> | |
+#include <linux/scatterlist.h> | |
+#include <crypto/hash.h> | |
+#include <linux/random.h> | |
+#include <linux/math64.h> | |
+#include <linux/gcd.h> | |
+#include <linux/freezer.h> | |
+#include <linux/sradix-tree.h> | |
+ | |
+#include <asm/tlbflush.h> | |
+#include "internal.h" | |
+ | |
+#ifdef CONFIG_X86 | |
+#undef memcmp | |
+ | |
+#ifdef CONFIG_X86_32 | |
+#define memcmp memcmpx86_32 | |
+/* | |
+ * Compare 4-byte-aligned addresses s1 and s2, with length n | |
+ */ | |
+int memcmpx86_32(void *s1, void *s2, size_t n) | |
+{ | |
+ size_t num = n / 4; | |
+ register int res; | |
+ | |
+ __asm__ __volatile__ | |
+ ( | |
+ "testl %3,%3\n\t" | |
+ "repe; cmpsd\n\t" | |
+ "je 1f\n\t" | |
+ "sbbl %0,%0\n\t" | |
+ "orl $1,%0\n" | |
+ "1:" | |
+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) | |
+ : "0" (0) | |
+ : "cc"); | |
+ | |
+ return res; | |
+} | |
+ | |
+/* | |
+ * Check whether the page is all zero. | |
+ */ | |
+static int is_full_zero(const void *s1, size_t len) | |
+{ | |
+ unsigned char same; | |
+ | |
+ len /= 4; | |
+ | |
+ __asm__ __volatile__ | |
+ ("repe; scasl;" | |
+ "sete %0" | |
+ : "=qm" (same), "+D" (s1), "+c" (len) | |
+ : "a" (0) | |
+ : "cc"); | |
+ | |
+ return same; | |
+} | |
+ | |
+ | |
+#elif defined(CONFIG_X86_64) | |
+#define memcmp memcmpx86_64 | |
+/* | |
+ * Compare 8-byte-aligned addresses s1 and s2, with length n | |
+ */ | |
+int memcmpx86_64(void *s1, void *s2, size_t n) | |
+{ | |
+ size_t num = n / 8; | |
+ register int res; | |
+ | |
+ __asm__ __volatile__ | |
+ ( | |
+ "testq %q3,%q3\n\t" | |
+ "repe; cmpsq\n\t" | |
+ "je 1f\n\t" | |
+ "sbbq %q0,%q0\n\t" | |
+ "orq $1,%q0\n" | |
+ "1:" | |
+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) | |
+ : "0" (0) | |
+ : "cc"); | |
+ | |
+ return res; | |
+} | |
+ | |
+static int is_full_zero(const void *s1, size_t len) | |
+{ | |
+ unsigned char same; | |
+ | |
+ len /= 8; | |
+ | |
+ __asm__ __volatile__ | |
+ ("repe; scasq;" | |
+ "sete %0" | |
+ : "=qm" (same), "+D" (s1), "+c" (len) | |
+ : "a" (0) | |
+ : "cc"); | |
+ | |
+ return same; | |
+} | |
+ | |
+#endif | |
+#else | |
+static int is_full_zero(const void *s1, size_t len) | |
+{ | |
+ unsigned long *src = s1; | |
+ int i; | |
+ | |
+ len /= sizeof(*src); | |
+ | |
+ for (i = 0; i < len; i++) { | |
+ if (src[i]) | |
+ return 0; | |
+ } | |
+ | |
+ return 1; | |
+} | |
+#endif | |
+ | |
+#define UKSM_RUNG_ROUND_FINISHED (1 << 0) | |
+#define TIME_RATIO_SCALE 10000 | |
+ | |
+#define SLOT_TREE_NODE_SHIFT 8 | |
+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) | |
+struct slot_tree_node { | |
+ unsigned long size; | |
+ struct sradix_tree_node snode; | |
+ void *stores[SLOT_TREE_NODE_STORE_SIZE]; | |
+}; | |
+ | |
+static struct kmem_cache *slot_tree_node_cachep; | |
+ | |
+static struct sradix_tree_node *slot_tree_node_alloc(void) | |
+{ | |
+ struct slot_tree_node *p; | |
+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); | |
+ if (!p) | |
+ return NULL; | |
+ | |
+ return &p->snode; | |
+} | |
+ | |
+static void slot_tree_node_free(struct sradix_tree_node *node) | |
+{ | |
+ struct slot_tree_node *p; | |
+ | |
+ p = container_of(node, struct slot_tree_node, snode); | |
+ kmem_cache_free(slot_tree_node_cachep, p); | |
+} | |
+ | |
+static void slot_tree_node_extend(struct sradix_tree_node *parent, | |
+ struct sradix_tree_node *child) | |
+{ | |
+ struct slot_tree_node *p, *c; | |
+ | |
+ p = container_of(parent, struct slot_tree_node, snode); | |
+ c = container_of(child, struct slot_tree_node, snode); | |
+ | |
+ p->size += c->size; | |
+} | |
+ | |
+void slot_tree_node_assign(struct sradix_tree_node *node, | |
+ unsigned index, void *item) | |
+{ | |
+ struct vma_slot *slot = item; | |
+ struct slot_tree_node *cur; | |
+ | |
+ slot->snode = node; | |
+ slot->sindex = index; | |
+ | |
+ while (node) { | |
+ cur = container_of(node, struct slot_tree_node, snode); | |
+ cur->size += slot->pages; | |
+ node = node->parent; | |
+ } | |
+} | |
+ | |
+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) | |
+{ | |
+ struct vma_slot *slot; | |
+ struct slot_tree_node *cur; | |
+ unsigned long pages; | |
+ | |
+ if (node->height == 1) { | |
+ slot = node->stores[offset]; | |
+ pages = slot->pages; | |
+ } else { | |
+ cur = container_of(node->stores[offset], | |
+ struct slot_tree_node, snode); | |
+ pages = cur->size; | |
+ } | |
+ | |
+ while (node) { | |
+ cur = container_of(node, struct slot_tree_node, snode); | |
+ cur->size -= pages; | |
+ node = node->parent; | |
+ } | |
+} | |
+ | |
+unsigned long slot_iter_index; | |
+int slot_iter(void *item, unsigned long height) | |
+{ | |
+ struct slot_tree_node *node; | |
+ struct vma_slot *slot; | |
+ | |
+ if (height == 1) { | |
+ slot = item; | |
+ if (slot_iter_index < slot->pages) { | |
+ /*in this one*/ | |
+ return 1; | |
+ } else { | |
+ slot_iter_index -= slot->pages; | |
+ return 0; | |
+ } | |
+ | |
+ } else { | |
+ node = container_of(item, struct slot_tree_node, snode); | |
+ if (slot_iter_index < node->size) { | |
+ /*in this one*/ | |
+ return 1; | |
+ } else { | |
+ slot_iter_index -= node->size; | |
+ return 0; | |
+ } | |
+ } | |
+} | |
+ | |
+ | |
+static inline void slot_tree_init_root(struct sradix_tree_root *root) | |
+{ | |
+ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); | |
+ root->alloc = slot_tree_node_alloc; | |
+ root->free = slot_tree_node_free; | |
+ root->extend = slot_tree_node_extend; | |
+ root->assign = slot_tree_node_assign; | |
+ root->rm = slot_tree_node_rm; | |
+} | |
+ | |
+void slot_tree_init(void) | |
+{ | |
+ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", | |
+ sizeof(struct slot_tree_node), 0, | |
+ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, | |
+ NULL); | |
+} | |
+ | |
+ | |
+/* Each rung of this ladder is a list of VMAs having the same scan ratio */ | |
+struct scan_rung { | |
+ //struct list_head scanned_list; | |
+ struct sradix_tree_root vma_root; | |
+ struct sradix_tree_root vma_root2; | |
+ | |
+ struct vma_slot *current_scan; | |
+ unsigned long current_offset; | |
+ | |
+ /* | |
+ * The initial value for current_offset; it should loop over | |
+ * [0 ~ step - 1] to let every slot have its chance to be scanned. | |
+ */ | |
+ unsigned long offset_init; | |
+ unsigned long step; /* dynamic step for current_offset */ | |
+ unsigned int flags; | |
+ unsigned long pages_to_scan; | |
+ //unsigned long fully_scanned_slots; | |
+ /* | |
+ * a little bit tricky - if cpu_time_ratio > 0, then the value is the | |
+ * cpu time ratio it can spend in rung_i for every scan | |
+ * period. if < 0, then it is the cpu time ratio relative to the | |
+ * max cpu percentage user specified. Both in unit of | |
+ * 1/TIME_RATIO_SCALE | |
+ */ | |
+ int cpu_ratio; | |
+ | |
+ /* | |
+ * How long will it take for all slots in this rung to be fully | |
+ * scanned? If it's zero, we don't care about the cover time: | |
+ * it's fully scanned. | |
+ */ | |
+ unsigned int cover_msecs; | |
+ //unsigned long vma_num; | |
+ //unsigned long pages; /* Sum of all slot's pages in rung */ | |
+}; | |
+ | |
+/** | |
+ * node of either the stable or unstable rbtree | |
+ * | |
+ */ | |
+struct tree_node { | |
+ struct rb_node node; /* link in the main (un)stable rbtree */ | |
+ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ | |
+ u32 hash; | |
+ unsigned long count; /* TODO: merged with sub_root */ | |
+ struct list_head all_list; /* all tree nodes in stable/unstable tree */ | |
+}; | |
+ | |
+/** | |
+ * struct stable_node - node of the stable rbtree | |
+ * @node: rb node of this ksm page in the stable tree | |
+ * @hlist: hlist head of rmap_items using this ksm page | |
+ * @kpfn: page frame number of this ksm page | |
+ */ | |
+struct stable_node { | |
+ struct rb_node node; /* link in sub-rbtree */ | |
+ struct tree_node *tree_node; /* its tree node root in the stable tree, NULL if it's in the hell list */ | |
+ struct hlist_head hlist; | |
+ unsigned long kpfn; | |
+ u32 hash_max; /* if ==0 then it's not been calculated yet */ | |
+ struct list_head all_list; /* in a list for all stable nodes */ | |
+}; | |
+ | |
+/** | |
+ * struct node_vma - group rmap_items linked in a same stable | |
+ * node together. | |
+ */ | |
+struct node_vma { | |
+ union { | |
+ struct vma_slot *slot; | |
+ unsigned long key; /* slot is used as key sorted on hlist */ | |
+ }; | |
+ struct hlist_node hlist; | |
+ struct hlist_head rmap_hlist; | |
+ struct stable_node *head; | |
+}; | |
+ | |
+/** | |
+ * struct rmap_item - reverse mapping item for virtual addresses | |
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list | |
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | |
+ * @mm: the memory structure this rmap_item is pointing into | |
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits) | |
+ * @node: rb node of this rmap_item in the unstable tree | |
+ * @head: pointer to stable_node heading this list in the stable tree | |
+ * @hlist: link into hlist of rmap_items hanging off that stable_node | |
+ */ | |
+struct rmap_item { | |
+ struct vma_slot *slot; | |
+ struct page *page; | |
+ unsigned long address; /* + low bits used for flags below */ | |
+ unsigned long hash_round; | |
+ unsigned long entry_index; | |
+ union { | |
+ struct {/* when in unstable tree */ | |
+ struct rb_node node; | |
+ struct tree_node *tree_node; | |
+ u32 hash_max; | |
+ }; | |
+ struct { /* when in stable tree */ | |
+ struct node_vma *head; | |
+ struct hlist_node hlist; | |
+ struct anon_vma *anon_vma; | |
+ }; | |
+ }; | |
+} __attribute__((aligned(4))); | |
+ | |
+struct rmap_list_entry { | |
+ union { | |
+ struct rmap_item *item; | |
+ unsigned long addr; | |
+ }; | |
+ /* lowest bit is used for is_addr tag */ | |
+} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ | |
+ | |
+ | |
+/* Basic data structure definition ends */ | |
+ | |
+ | |
+/* | |
+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. | |
+ * The flags use the low bits of rmap_item.address | |
+ */ | |
+#define UNSTABLE_FLAG 0x1 | |
+#define STABLE_FLAG 0x2 | |
+#define get_rmap_addr(x) ((x)->address & PAGE_MASK) | |
+ | |
+/* | |
+ * rmap_list_entry helpers | |
+ */ | |
+#define IS_ADDR_FLAG 1 | |
+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) | |
+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) | |
+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) | |
+ | |
+ | |
+/* | |
+ * High speed caches for frequently allocated and freed structs | |
+ */ | |
+static struct kmem_cache *rmap_item_cache; | |
+static struct kmem_cache *stable_node_cache; | |
+static struct kmem_cache *node_vma_cache; | |
+static struct kmem_cache *vma_slot_cache; | |
+static struct kmem_cache *tree_node_cache; | |
+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ | |
+ sizeof(struct __struct), __alignof__(struct __struct),\ | |
+ (__flags), NULL) | |
+ | |
+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ | |
+#define SCAN_LADDER_SIZE 4 | |
+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; | |
+ | |
+/* The evaluation rounds uksmd has finished */ | |
+static unsigned long long uksm_eval_round = 1; | |
+ | |
+/* | |
+ * we add 1 to this var when we consider we should rebuild the whole | |
+ * unstable tree. | |
+ */ | |
+static unsigned long uksm_hash_round = 1; | |
+ | |
+/* | |
+ * How many times the whole memory is scanned. | |
+ */ | |
+static unsigned long long fully_scanned_round = 1; | |
+ | |
+/* The total number of virtual pages of all vma slots */ | |
+static u64 uksm_pages_total; | |
+ | |
+/* The number of pages that have been scanned since startup */ | |
+static u64 uksm_pages_scanned; | |
+ | |
+static u64 scanned_virtual_pages; | |
+ | |
+/* The number of pages that have been scanned since the last encode_benefit call */ | |
+static u64 uksm_pages_scanned_last; | |
+ | |
+/* If the scanned count grows too large, we encode it here */ | |
+static u64 pages_scanned_stored; | |
+ | |
+static unsigned long pages_scanned_base; | |
+ | |
+/* The number of nodes in the stable tree */ | |
+static unsigned long uksm_pages_shared; | |
+ | |
+/* The number of page slots additionally sharing those nodes */ | |
+static unsigned long uksm_pages_sharing; | |
+ | |
+/* The number of nodes in the unstable tree */ | |
+static unsigned long uksm_pages_unshared; | |
+ | |
+/* | |
+ * Milliseconds ksmd should sleep between scan batches, | |
+ * kept >= 100ms to be consistent with | |
+ * scan_time_to_sleep_msec(). | |
+ */ | |
+static unsigned int uksm_sleep_jiffies; | |
+ | |
+/* The real value for the uksmd next sleep */ | |
+static unsigned int uksm_sleep_real; | |
+ | |
+/* Saved user-supplied uksm_sleep_jiffies, kept while the value is temporarily enlarged */ | |
+static unsigned int uksm_sleep_saved; | |
+ | |
+/* Max percentage of cpu utilization ksmd can take to scan in one batch */ | |
+static unsigned int uksm_max_cpu_percentage; | |
+ | |
+static int uksm_cpu_governor; | |
+ | |
+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; | |
+ | |
+struct uksm_cpu_preset_s { | |
+ int cpu_ratio[SCAN_LADDER_SIZE]; | |
+ unsigned int cover_msecs[SCAN_LADDER_SIZE]; | |
+ unsigned int max_cpu; /* percentage */ | |
+}; | |
+ | |
+struct uksm_cpu_preset_s uksm_cpu_preset[4] = { | |
+ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, | |
+ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, | |
+ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, | |
+ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, | |
+}; | |
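+ | |
+/* | |
+ * The preset rows above correspond, in order, to the governors in | |
+ * uksm_cpu_governor_str ("full", "medium", "low", "quiet"); note how the | |
+ * max_cpu cap falls from 95% for "full" down to 1% for "quiet". | |
+ */ | |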
+ | |
+/* The default value for uksm_ema_page_time if it's not initialized */ | |
+#define UKSM_PAGE_TIME_DEFAULT 500 | |
+ | |
+/* Cost to scan one page, as an exponential moving average, in nsecs */ | |
+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; | |
+ | |
+/* The exponential moving average alpha weight, in percentage. */ | |
+#define EMA_ALPHA 20 | |
+ | |
+/* | |
+ * The threshold used to filter out thrashing areas. | |
+ * If it is 0, filtering is disabled; otherwise it is the percentage upper | |
+ * bound on the thrashing ratio of an area. Any area with a bigger thrashing | |
+ * ratio is treated as having a zero duplication ratio. | |
+ */ | |
+static unsigned int uksm_thrash_threshold = 50; | |
+ | |
+/* The dedup ratio above which an area is considered abundant */ | |
+static unsigned int uksm_abundant_threshold = 10; | |
+ | |
+/* All slots having merged pages in this eval round. */ | |
+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); | |
+ | |
+/* How many times the ksmd has slept since startup */ | |
+static unsigned long long uksm_sleep_times; | |
+ | |
+#define UKSM_RUN_STOP 0 | |
+#define UKSM_RUN_MERGE 1 | |
+static unsigned int uksm_run = 1; | |
+ | |
+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); | |
+static DEFINE_MUTEX(uksm_thread_mutex); | |
+ | |
+/* | |
+ * List vma_slot_new holds newly created vma_slots waiting to be added by | |
+ * ksmd. If one cannot be added (e.g. because it is too small), it is moved | |
+ * to vma_slot_noadd. vma_slot_del is the list for vma_slots whose | |
+ * corresponding VMA has been removed/freed. | |
+ */ | |
+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); | |
+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); | |
+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); | |
+static DEFINE_SPINLOCK(vma_slot_list_lock); | |
+ | |
+/* The unstable tree heads */ | |
+static struct rb_root root_unstable_tree = RB_ROOT; | |
+ | |
+/* | |
+ * All tree_nodes are in a list to be freed at once when unstable tree is | |
+ * freed after each scan round. | |
+ */ | |
+static struct list_head unstable_tree_node_list = | |
+ LIST_HEAD_INIT(unstable_tree_node_list); | |
+ | |
+/* List contains all stable nodes */ | |
+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); | |
+ | |
+/* | |
+ * When the hash strength is changed, the stable tree must be delta-hashed | |
+ * and restructured. We use the two sets of structs below to speed up the | |
+ * restructuring of the stable tree. | |
+ */ | |
+static struct list_head | |
+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), | |
+ LIST_HEAD_INIT(stable_tree_node_list[1])}; | |
+ | |
+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; | |
+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; | |
+static struct rb_root *root_stable_treep = &root_stable_tree[0]; | |
+static unsigned long stable_tree_index; | |
+ | |
+/* The hash strength needed to hash a full page */ | |
+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) | |
+ | |
+/* The hash strength needed for loop-back hashing */ | |
+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) | |
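+ | |
+/* | |
+ * With 4 KiB pages, HASH_STRENGTH_FULL is 1024, i.e. every 32-bit word of a | |
+ * page is sampled once at full strength. Strengths above that, up to | |
+ * HASH_STRENGTH_MAX, make random_sample_hash() below loop back and sample | |
+ * the first (strength - HASH_STRENGTH_FULL) randomized offsets a second time. | |
+ */ | |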
+ | |
+/* The random offsets in a page */ | |
+static u32 *random_nums; | |
+ | |
+/* The hash strength */ | |
+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; | |
+ | |
+/* The delta value each time the hash strength increases or decreases */ | |
+static unsigned long hash_strength_delta; | |
+#define HASH_STRENGTH_DELTA_MAX 5 | |
+ | |
+/* The time we have saved due to random_sample_hash */ | |
+static u64 rshash_pos; | |
+ | |
+/* The time we have wasted due to hash collision */ | |
+static u64 rshash_neg; | |
+ | |
+struct uksm_benefit { | |
+ u64 pos; | |
+ u64 neg; | |
+ u64 scanned; | |
+ unsigned long base; | |
+} benefit; | |
+ | |
+/* | |
+ * The relative cost of memcmp, compared to 1 time unit of random sample | |
+ * hash, this value is tested when ksm module is initialized | |
+ */ | |
+static unsigned long memcmp_cost; | |
+ | |
+static unsigned long rshash_neg_cont_zero; | |
+static unsigned long rshash_cont_obscure; | |
+ | |
+/* The possible states of hash strength adjustment heuristic */ | |
+enum rshash_states { | |
+ RSHASH_STILL, | |
+ RSHASH_TRYUP, | |
+ RSHASH_TRYDOWN, | |
+ RSHASH_NEW, | |
+ RSHASH_PRE_STILL, | |
+}; | |
+ | |
+/* The possible direction we are about to adjust hash strength */ | |
+enum rshash_direct { | |
+ GO_UP, | |
+ GO_DOWN, | |
+ OBSCURE, | |
+ STILL, | |
+}; | |
+ | |
+/* random sampling hash state machine */ | |
+static struct { | |
+ enum rshash_states state; | |
+ enum rshash_direct pre_direct; | |
+ u8 below_count; | |
+	/* Keep a lookup window of size 5; if above_count/below_count exceeds | |
+	 * 3 within this window, we stop trying. | |
+	 */ | |
+ u8 lookup_window_index; | |
+ u64 stable_benefit; | |
+ unsigned long turn_point_down; | |
+ unsigned long turn_benefit_down; | |
+ unsigned long turn_point_up; | |
+ unsigned long turn_benefit_up; | |
+ unsigned long stable_point; | |
+} rshash_state; | |
+ | |
+/* Zero-page hash table, for hash_strength 0 .. HASH_STRENGTH_MAX */ | |
+static u32 *zero_hash_table; | |
+ | |
+static inline struct node_vma *alloc_node_vma(void) | |
+{ | |
+ struct node_vma *node_vma; | |
+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); | |
+ if (node_vma) { | |
+ INIT_HLIST_HEAD(&node_vma->rmap_hlist); | |
+ INIT_HLIST_NODE(&node_vma->hlist); | |
+ } | |
+ return node_vma; | |
+} | |
+ | |
+static inline void free_node_vma(struct node_vma *node_vma) | |
+{ | |
+ kmem_cache_free(node_vma_cache, node_vma); | |
+} | |
+ | |
+ | |
+static inline struct vma_slot *alloc_vma_slot(void) | |
+{ | |
+ struct vma_slot *slot; | |
+ | |
+ /* | |
+	 * In case uksm has not been initialized by now. | |
+	 * TODO: reconsider the call site of uksm_init() in the future. | |
+ */ | |
+ if (!vma_slot_cache) | |
+ return NULL; | |
+ | |
+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); | |
+ if (slot) { | |
+ INIT_LIST_HEAD(&slot->slot_list); | |
+ INIT_LIST_HEAD(&slot->dedup_list); | |
+ slot->flags |= UKSM_SLOT_NEED_RERAND; | |
+ } | |
+ return slot; | |
+} | |
+ | |
+static inline void free_vma_slot(struct vma_slot *vma_slot) | |
+{ | |
+ kmem_cache_free(vma_slot_cache, vma_slot); | |
+} | |
+ | |
+ | |
+ | |
+static inline struct rmap_item *alloc_rmap_item(void) | |
+{ | |
+ struct rmap_item *rmap_item; | |
+ | |
+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); | |
+ if (rmap_item) { | |
+		/* BUG if the lowest bit is not clear; it is reserved for flag use */ | |
+ BUG_ON(is_addr(rmap_item)); | |
+ } | |
+ return rmap_item; | |
+} | |
+ | |
+static inline void free_rmap_item(struct rmap_item *rmap_item) | |
+{ | |
+ rmap_item->slot = NULL; /* debug safety */ | |
+ kmem_cache_free(rmap_item_cache, rmap_item); | |
+} | |
+ | |
+static inline struct stable_node *alloc_stable_node(void) | |
+{ | |
+ struct stable_node *node; | |
+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); | |
+ if (!node) | |
+ return NULL; | |
+ | |
+ INIT_HLIST_HEAD(&node->hlist); | |
+ list_add(&node->all_list, &stable_node_list); | |
+ return node; | |
+} | |
+ | |
+static inline void free_stable_node(struct stable_node *stable_node) | |
+{ | |
+ list_del(&stable_node->all_list); | |
+ kmem_cache_free(stable_node_cache, stable_node); | |
+} | |
+ | |
+static inline struct tree_node *alloc_tree_node(struct list_head *list) | |
+{ | |
+ struct tree_node *node; | |
+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); | |
+ if (!node) | |
+ return NULL; | |
+ | |
+ list_add(&node->all_list, list); | |
+ return node; | |
+} | |
+ | |
+static inline void free_tree_node(struct tree_node *node) | |
+{ | |
+ list_del(&node->all_list); | |
+ kmem_cache_free(tree_node_cache, node); | |
+} | |
+ | |
+static void uksm_drop_anon_vma(struct rmap_item *rmap_item) | |
+{ | |
+ struct anon_vma *anon_vma = rmap_item->anon_vma; | |
+ | |
+ put_anon_vma(anon_vma); | |
+} | |
+ | |
+ | |
+/** | |
+ * Remove a stable node from the stable tree; it may be unlinked from its | |
+ * tree_node, and its parent tree_node may be removed if no other stable | |
+ * node is left on it. | |
+ * | |
+ * @stable_node	The node to be removed | |
+ * @unlink_rb	Should this node be unlinked from the rbtree? | |
+ * @remove_tree_node	Should its tree_node be removed if it becomes empty? | |
+ */ | |
+static void remove_node_from_stable_tree(struct stable_node *stable_node, | |
+ int unlink_rb, int remove_tree_node) | |
+{ | |
+ struct node_vma *node_vma; | |
+ struct rmap_item *rmap_item; | |
+ struct hlist_node *n; | |
+ | |
+ if (!hlist_empty(&stable_node->hlist)) { | |
+ hlist_for_each_entry_safe(node_vma, n, | |
+ &stable_node->hlist, hlist) { | |
+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { | |
+ uksm_pages_sharing--; | |
+ | |
+ uksm_drop_anon_vma(rmap_item); | |
+ rmap_item->address &= PAGE_MASK; | |
+ } | |
+ free_node_vma(node_vma); | |
+ cond_resched(); | |
+ } | |
+ | |
+ /* the last one is counted as shared */ | |
+ uksm_pages_shared--; | |
+ uksm_pages_sharing++; | |
+ } | |
+ | |
+ if (stable_node->tree_node && unlink_rb) { | |
+ rb_erase(&stable_node->node, | |
+ &stable_node->tree_node->sub_root); | |
+ | |
+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && | |
+ remove_tree_node) { | |
+ rb_erase(&stable_node->tree_node->node, | |
+ root_stable_treep); | |
+ free_tree_node(stable_node->tree_node); | |
+ } else { | |
+ stable_node->tree_node->count--; | |
+ } | |
+ } | |
+ | |
+ free_stable_node(stable_node); | |
+} | |
+ | |
+ | |
+/* | |
+ * get_uksm_page: checks if the page indicated by the stable node | |
+ * is still its ksm page, despite having held no reference to it. | |
+ * In which case we can trust the content of the page, and it | |
+ * returns the gotten page; but if the page has now been zapped, | |
+ * remove the stale node from the stable tree and return NULL. | |
+ * | |
+ * You would expect the stable_node to hold a reference to the ksm page. | |
+ * But if it increments the page's count, swapping out has to wait for | |
+ * ksmd to come around again before it can free the page, which may take | |
+ * seconds or even minutes: much too unresponsive. So instead we use a | |
+ * "keyhole reference": access to the ksm page from the stable node peeps | |
+ * out through its keyhole to see if that page still holds the right key, | |
+ * pointing back to this stable node. This relies on freeing a PageAnon | |
+ * page to reset its page->mapping to NULL, and relies on no other use of | |
+ * a page to put something that might look like our key in page->mapping. | |
+ * | |
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | |
+ * but this is different - made simpler by uksm_thread_mutex being held, but | |
+ * interesting for assuming that no other use of the struct page could ever | |
+ * put our expected_mapping into page->mapping (or a field of the union which | |
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but | |
+ * to keep the page_count protocol described with page_cache_get_speculative. | |
+ * | |
+ * Note: it is possible that get_uksm_page() will return NULL one moment, | |
+ * then page the next, if the page is in between page_freeze_refs() and | |
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | |
+ * is on its way to being freed; but it is an anomaly to bear in mind. | |
+ * | |
+ * @unlink_rb: whether the removal of this node should first unlink it from | |
+ * its rbtree. stable_node_reinsert will prevent this when restructuring the | |
+ * node out of its old tree. | |
+ * | |
+ * @remove_tree_node: if this is the last stable node of its tree_node, should | |
+ * the tree_node be freed? When we are inserting a stable node, this tree_node | |
+ * may be reused, so don't free it. | |
+ */ | |
+static struct page *get_uksm_page(struct stable_node *stable_node, | |
+ int unlink_rb, int remove_tree_node) | |
+{ | |
+ struct page *page; | |
+ void *expected_mapping; | |
+ | |
+ page = pfn_to_page(stable_node->kpfn); | |
+ expected_mapping = (void *)stable_node + | |
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | |
+ rcu_read_lock(); | |
+ if (page->mapping != expected_mapping) | |
+ goto stale; | |
+ if (!get_page_unless_zero(page)) | |
+ goto stale; | |
+ if (page->mapping != expected_mapping) { | |
+ put_page(page); | |
+ goto stale; | |
+ } | |
+ rcu_read_unlock(); | |
+ return page; | |
+stale: | |
+ rcu_read_unlock(); | |
+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); | |
+ | |
+ return NULL; | |
+} | |
+ | |
+/* | |
+ * Removing rmap_item from stable or unstable tree. | |
+ * This function will clean the information from the stable/unstable tree. | |
+ */ | |
+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |
+{ | |
+ if (rmap_item->address & STABLE_FLAG) { | |
+ struct stable_node *stable_node; | |
+ struct node_vma *node_vma; | |
+ struct page *page; | |
+ | |
+ node_vma = rmap_item->head; | |
+ stable_node = node_vma->head; | |
+ page = get_uksm_page(stable_node, 1, 1); | |
+ if (!page) | |
+ goto out; | |
+ | |
+ /* | |
+ * page lock is needed because it's racing with | |
+ * try_to_unmap_ksm(), etc. | |
+ */ | |
+ lock_page(page); | |
+ hlist_del(&rmap_item->hlist); | |
+ | |
+ if (hlist_empty(&node_vma->rmap_hlist)) { | |
+ hlist_del(&node_vma->hlist); | |
+ free_node_vma(node_vma); | |
+ } | |
+ unlock_page(page); | |
+ | |
+ put_page(page); | |
+ if (hlist_empty(&stable_node->hlist)) { | |
+			/* do NOT call remove_node_from_stable_tree() here: | |
+			 * it's possible for a forked rmap_item to be outside | |
+			 * the stable tree while the in-tree rmap_items have | |
+			 * already been deleted. | |
+			 */ | |
+ uksm_pages_shared--; | |
+ } else | |
+ uksm_pages_sharing--; | |
+ | |
+ | |
+ uksm_drop_anon_vma(rmap_item); | |
+ } else if (rmap_item->address & UNSTABLE_FLAG) { | |
+ if (rmap_item->hash_round == uksm_hash_round) { | |
+ | |
+ rb_erase(&rmap_item->node, | |
+ &rmap_item->tree_node->sub_root); | |
+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { | |
+ rb_erase(&rmap_item->tree_node->node, | |
+ &root_unstable_tree); | |
+ | |
+ free_tree_node(rmap_item->tree_node); | |
+ } else | |
+ rmap_item->tree_node->count--; | |
+ } | |
+ uksm_pages_unshared--; | |
+ } | |
+ | |
+ rmap_item->address &= PAGE_MASK; | |
+ rmap_item->hash_max = 0; | |
+ | |
+out: | |
+ cond_resched(); /* we're called from many long loops */ | |
+} | |
+ | |
+static inline int slot_in_uksm(struct vma_slot *slot) | |
+{ | |
+ return list_empty(&slot->slot_list); | |
+} | |
+ | |
+/* | |
+ * Test if the mm is exiting | |
+ */ | |
+static inline bool uksm_test_exit(struct mm_struct *mm) | |
+{ | |
+ return atomic_read(&mm->mm_users) == 0; | |
+} | |
+ | |
+static inline unsigned long vma_pool_size(struct vma_slot *slot) | |
+{ | |
+ return round_up(sizeof(struct rmap_list_entry) * slot->pages, | |
+ PAGE_SIZE) >> PAGE_SHIFT; | |
+} | |
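+ | |
+/* | |
+ * One rmap_list_entry is kept per virtual page of the VMA; the entries are | |
+ * packed into whole pages, so vma_pool_size() above is simply the number of | |
+ * backing pages needed to hold slot->pages entries. | |
+ */ | |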
+ | |
+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) | |
+ | |
+/* must be called with the slot's mmap_sem read-locked */ | |
+static int slot_pool_alloc(struct vma_slot *slot) | |
+{ | |
+ unsigned long pool_size; | |
+ | |
+ if (slot->rmap_list_pool) | |
+ return 0; | |
+ | |
+ pool_size = vma_pool_size(slot); | |
+ slot->rmap_list_pool = kzalloc(sizeof(struct page *) * | |
+ pool_size, GFP_KERNEL); | |
+ if (!slot->rmap_list_pool) | |
+ return -ENOMEM; | |
+ | |
+ slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, | |
+ GFP_KERNEL); | |
+ if (!slot->pool_counts) { | |
+ kfree(slot->rmap_list_pool); | |
+ return -ENOMEM; | |
+ } | |
+ | |
+ slot->pool_size = pool_size; | |
+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); | |
+ slot->flags |= UKSM_SLOT_IN_UKSM; | |
+ uksm_pages_total += slot->pages; | |
+ | |
+ return 0; | |
+} | |
+ | |
+/* | |
+ * Called after vma is unlinked from its mm | |
+ */ | |
+void uksm_remove_vma(struct vm_area_struct *vma) | |
+{ | |
+ struct vma_slot *slot; | |
+ | |
+ if (!vma->uksm_vma_slot) | |
+ return; | |
+ | |
+ spin_lock(&vma_slot_list_lock); | |
+ slot = vma->uksm_vma_slot; | |
+ if (!slot) | |
+ goto out; | |
+ | |
+ if (slot_in_uksm(slot)) { | |
+		/* | |
+		 * This slot has been added by ksmd, so move it to the del | |
+		 * list and wait for ksmd to free it. | |
+		 */ | |
+ list_add_tail(&slot->slot_list, &vma_slot_del); | |
+ } else { | |
+		/* | |
+		 * It's still on the new list, so it's ok to free the slot | |
+		 * directly. | |
+		 */ | |
+ list_del(&slot->slot_list); | |
+ free_vma_slot(slot); | |
+ } | |
+out: | |
+ vma->uksm_vma_slot = NULL; | |
+ spin_unlock(&vma_slot_list_lock); | |
+} | |
+ | |
+/** | |
+ * Need to do two things: | |
+ * 1. check if slot was moved to del list | |
+ * 2. make sure the mmap_sem is manipulated under valid vma. | |
+ * | |
+ * My concern here is that in some cases, this may cause waiters on | |
+ * vma_slot_list_lock to be serialized further by some | |
+ * sem->wait_lock; can this really be expensive? | |
+ * | |
+ * | |
+ * @return | |
+ * 0: if successfully locked mmap_sem | |
+ * -ENOENT: this slot was moved to del list | |
+ * -EBUSY: vma lock failed | |
+ */ | |
+static int try_down_read_slot_mmap_sem(struct vma_slot *slot) | |
+{ | |
+ struct vm_area_struct *vma; | |
+ struct mm_struct *mm; | |
+ struct rw_semaphore *sem; | |
+ | |
+ spin_lock(&vma_slot_list_lock); | |
+ | |
+	/* the slot_list was removed from the new list and re-initialized when | |
+	 * the slot entered uksm. If it is not empty now, it must have been | |
+	 * moved to the del list. | |
+	 */ | |
+ if (!slot_in_uksm(slot)) { | |
+ spin_unlock(&vma_slot_list_lock); | |
+ return -ENOENT; | |
+ } | |
+ | |
+ BUG_ON(slot->pages != vma_pages(slot->vma)); | |
+ /* Ok, vma still valid */ | |
+ vma = slot->vma; | |
+ mm = vma->vm_mm; | |
+ sem = &mm->mmap_sem; | |
+ | |
+ if (uksm_test_exit(mm)) { | |
+ spin_unlock(&vma_slot_list_lock); | |
+ return -ENOENT; | |
+ } | |
+ | |
+ if (down_read_trylock(sem)) { | |
+ spin_unlock(&vma_slot_list_lock); | |
+ if (slot_pool_alloc(slot)) { | |
+ uksm_remove_vma(vma); | |
+ up_read(sem); | |
+ return -ENOENT; | |
+ } | |
+ return 0; | |
+ } | |
+ | |
+ spin_unlock(&vma_slot_list_lock); | |
+ return -EBUSY; | |
+} | |
+ | |
+static inline unsigned long | |
+vma_page_address(struct page *page, struct vm_area_struct *vma) | |
+{ | |
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | |
+ unsigned long address; | |
+ | |
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | |
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | |
+ /* page should be within @vma mapping range */ | |
+ return -EFAULT; | |
+ } | |
+ return address; | |
+} | |
+ | |
+ | |
+/* return 0 on success with the item's mmap_sem locked */ | |
+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) | |
+{ | |
+ struct mm_struct *mm; | |
+ struct vma_slot *slot = item->slot; | |
+ int err = -EINVAL; | |
+ | |
+ struct page *page; | |
+ | |
+ /* | |
+	 * try_down_read_slot_mmap_sem() returns non-zero if the slot has been | |
+	 * removed by uksm_remove_vma() or if its mmap_sem cannot be taken | |
+	 * right now. | |
+ */ | |
+ if (try_down_read_slot_mmap_sem(slot)) | |
+ return -EBUSY; | |
+ | |
+ mm = slot->vma->vm_mm; | |
+ | |
+ if (uksm_test_exit(mm)) | |
+ goto failout_up; | |
+ | |
+ page = item->page; | |
+ rcu_read_lock(); | |
+ if (!get_page_unless_zero(page)) { | |
+ rcu_read_unlock(); | |
+ goto failout_up; | |
+ } | |
+ | |
+ /* No need to consider huge page here. */ | |
+ if (item->slot->vma->anon_vma != page_anon_vma(page) || | |
+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { | |
+ /* | |
+ * TODO: | |
+		 * should we release this item because of its stale page | |
+ * mapping? | |
+ */ | |
+ put_page(page); | |
+ rcu_read_unlock(); | |
+ goto failout_up; | |
+ } | |
+ rcu_read_unlock(); | |
+ return 0; | |
+ | |
+failout_up: | |
+ up_read(&mm->mmap_sem); | |
+ return err; | |
+} | |
+ | |
+/* | |
+ * Which kinds of VMA are considered for scanning? | |
+ */ | |
+static inline int vma_can_enter(struct vm_area_struct *vma) | |
+{ | |
+ return uksm_flags_can_scan(vma->vm_flags); | |
+} | |
+ | |
+/* | |
+ * Called whenever a fresh new vma is created. A new vma_slot | |
+ * is created and inserted into a global list. Must be called | |
+ * after the vma has been inserted into its mm. | |
+ */ | |
+void uksm_vma_add_new(struct vm_area_struct *vma) | |
+{ | |
+ struct vma_slot *slot; | |
+ | |
+ if (!vma_can_enter(vma)) { | |
+ vma->uksm_vma_slot = NULL; | |
+ return; | |
+ } | |
+ | |
+ slot = alloc_vma_slot(); | |
+ if (!slot) { | |
+ vma->uksm_vma_slot = NULL; | |
+ return; | |
+ } | |
+ | |
+ vma->uksm_vma_slot = slot; | |
+ vma->vm_flags |= VM_MERGEABLE; | |
+ slot->vma = vma; | |
+ slot->mm = vma->vm_mm; | |
+ slot->ctime_j = jiffies; | |
+ slot->pages = vma_pages(vma); | |
+ spin_lock(&vma_slot_list_lock); | |
+ list_add_tail(&slot->slot_list, &vma_slot_new); | |
+ spin_unlock(&vma_slot_list_lock); | |
+} | |
+ | |
+/* 32/3 < they < 32/2 */ | |
+#define shiftl 8 | |
+#define shiftr 12 | |
+ | |
+#define HASH_FROM_TO(from, to) \ | |
+for (index = from; index < to; index++) { \ | |
+ pos = random_nums[index]; \ | |
+ hash += key[pos]; \ | |
+ hash += (hash << shiftl); \ | |
+ hash ^= (hash >> shiftr); \ | |
+} | |
+ | |
+ | |
+#define HASH_FROM_DOWN_TO(from, to) \ | |
+for (index = from - 1; index >= to; index--) { \ | |
+ hash ^= (hash >> shiftr); \ | |
+ hash ^= (hash >> (shiftr*2)); \ | |
+ hash -= (hash << shiftl); \ | |
+ hash += (hash << (shiftl*2)); \ | |
+ pos = random_nums[index]; \ | |
+ hash -= key[pos]; \ | |
+} | |
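+ | |
+/* | |
+ * HASH_FROM_TO folds the 32-bit words at the randomized offsets | |
+ * random_nums[from..to) into 'hash' with an add/shift/xor mixing step. | |
+ * HASH_FROM_DOWN_TO walks the same offsets in reverse and applies the | |
+ * inverse of each step, which lets delta_hash() below adjust the hash | |
+ * strength incrementally instead of rehashing the whole page. | |
+ */ | |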
+ | |
+/* | |
+ * The main random sample hash function. | |
+ */ | |
+static u32 random_sample_hash(void *addr, u32 hash_strength) | |
+{ | |
+ u32 hash = 0xdeadbeef; | |
+ int index, pos, loop = hash_strength; | |
+ u32 *key = (u32 *)addr; | |
+ | |
+ if (loop > HASH_STRENGTH_FULL) | |
+ loop = HASH_STRENGTH_FULL; | |
+ | |
+ HASH_FROM_TO(0, loop); | |
+ | |
+ if (hash_strength > HASH_STRENGTH_FULL) { | |
+ loop = hash_strength - HASH_STRENGTH_FULL; | |
+ HASH_FROM_TO(0, loop); | |
+ } | |
+ | |
+ return hash; | |
+} | |
+ | |
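+/* | |
+ * A worked illustration (values are arbitrary): if h100 is the hash of an | |
+ * unchanged page at strength 100, then delta_hash(addr, 100, 200, h100) | |
+ * equals random_sample_hash(addr, 200), because raising the strength only | |
+ * mixes in the extra offsets random_nums[100..199]. Lowering the strength | |
+ * applies the inverse steps via HASH_FROM_DOWN_TO. | |
+ */ | |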
+ | |
+/** | |
+ * Used when the hash strength is adjusted. | |
+ * | |
+ * @addr The page's mapped address | |
+ * @from The original hash strength | |
+ * @to The hash strength we are changing to | |
+ * @hash The hash value generated with the "from" hash strength | |
+ * | |
+ * return the hash value | |
+ */ | |
+static u32 delta_hash(void *addr, int from, int to, u32 hash) | |
+{ | |
+ u32 *key = (u32 *)addr; | |
+ int index, pos; /* make sure they are int type */ | |
+ | |
+ if (to > from) { | |
+ if (from >= HASH_STRENGTH_FULL) { | |
+ from -= HASH_STRENGTH_FULL; | |
+ to -= HASH_STRENGTH_FULL; | |
+ HASH_FROM_TO(from, to); | |
+ } else if (to <= HASH_STRENGTH_FULL) { | |
+ HASH_FROM_TO(from, to); | |
+ } else { | |
+ HASH_FROM_TO(from, HASH_STRENGTH_FULL); | |
+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); | |
+ } | |
+ } else { | |
+ if (from <= HASH_STRENGTH_FULL) { | |
+ HASH_FROM_DOWN_TO(from, to); | |
+ } else if (to >= HASH_STRENGTH_FULL) { | |
+ from -= HASH_STRENGTH_FULL; | |
+ to -= HASH_STRENGTH_FULL; | |
+ HASH_FROM_DOWN_TO(from, to); | |
+ } else { | |
+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); | |
+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); | |
+ } | |
+ } | |
+ | |
+ return hash; | |
+} | |
+ | |
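+/* | |
+ * Note on the fixed-point encoding of struct uksm_benefit: pos, neg and | |
+ * scanned are accumulated right-shifted by 'base'. Whenever adding a new | |
+ * delta would overflow a u64, all three accumulators are halved and 'base' | |
+ * is incremented, so the ratios between them stay meaningful while the raw | |
+ * magnitudes shrink. | |
+ */ | |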
+/** | |
+ * | |
+ * Called when rshash_pos or rshash_neg is about to overflow, or when a scan | |
+ * round has finished. | |
+ * | |
+ * return 0 if no page has been scanned since last call, 1 otherwise. | |
+ */ | |
+static inline int encode_benefit(void) | |
+{ | |
+ u64 scanned_delta, pos_delta, neg_delta; | |
+ unsigned long base = benefit.base; | |
+ | |
+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; | |
+ | |
+ if (!scanned_delta) | |
+ return 0; | |
+ | |
+ scanned_delta >>= base; | |
+ pos_delta = rshash_pos >> base; | |
+ neg_delta = rshash_neg >> base; | |
+ | |
+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || | |
+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || | |
+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { | |
+ benefit.scanned >>= 1; | |
+ benefit.neg >>= 1; | |
+ benefit.pos >>= 1; | |
+ benefit.base++; | |
+ scanned_delta >>= 1; | |
+ pos_delta >>= 1; | |
+ neg_delta >>= 1; | |
+ } | |
+ | |
+ benefit.pos += pos_delta; | |
+ benefit.neg += neg_delta; | |
+ benefit.scanned += scanned_delta; | |
+ | |
+ BUG_ON(!benefit.scanned); | |
+ | |
+ rshash_pos = rshash_neg = 0; | |
+ uksm_pages_scanned_last = uksm_pages_scanned; | |
+ | |
+ return 1; | |
+} | |
+ | |
+static inline void reset_benefit(void) | |
+{ | |
+ benefit.pos = 0; | |
+ benefit.neg = 0; | |
+ benefit.base = 0; | |
+ benefit.scanned = 0; | |
+} | |
+ | |
+static inline void inc_rshash_pos(unsigned long delta) | |
+{ | |
+ if (CAN_OVERFLOW_U64(rshash_pos, delta)) | |
+ encode_benefit(); | |
+ | |
+ rshash_pos += delta; | |
+} | |
+ | |
+static inline void inc_rshash_neg(unsigned long delta) | |
+{ | |
+ if (CAN_OVERFLOW_U64(rshash_neg, delta)) | |
+ encode_benefit(); | |
+ | |
+ rshash_neg += delta; | |
+} | |
+ | |
+ | |
+static inline u32 page_hash(struct page *page, unsigned long hash_strength, | |
+ int cost_accounting) | |
+{ | |
+ u32 val; | |
+ unsigned long delta; | |
+ | |
+ void *addr = kmap_atomic(page); | |
+ | |
+ val = random_sample_hash(addr, hash_strength); | |
+ kunmap_atomic(addr); | |
+ | |
+ if (cost_accounting) { | |
+ if (HASH_STRENGTH_FULL > hash_strength) | |
+ delta = HASH_STRENGTH_FULL - hash_strength; | |
+ else | |
+ delta = 0; | |
+ | |
+ inc_rshash_pos(delta); | |
+ } | |
+ | |
+ return val; | |
+} | |
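+ | |
+/* | |
+ * Cost accounting sketch: hashing a page below full strength saves roughly | |
+ * (HASH_STRENGTH_FULL - hash_strength) sample units compared to a full hash, | |
+ * which page_hash() credits to rshash_pos; hash collisions later charge | |
+ * memcmp_cost (and extra hashing work) to rshash_neg, see memcmp_pages() | |
+ * and check_collision(). | |
+ */ | |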
+ | |
+static int memcmp_pages(struct page *page1, struct page *page2, | |
+ int cost_accounting) | |
+{ | |
+ char *addr1, *addr2; | |
+ int ret; | |
+ | |
+ addr1 = kmap_atomic(page1); | |
+ addr2 = kmap_atomic(page2); | |
+ ret = memcmp(addr1, addr2, PAGE_SIZE); | |
+ kunmap_atomic(addr2); | |
+ kunmap_atomic(addr1); | |
+ | |
+ if (cost_accounting) | |
+ inc_rshash_neg(memcmp_cost); | |
+ | |
+ return ret; | |
+} | |
+ | |
+static inline int pages_identical(struct page *page1, struct page *page2) | |
+{ | |
+ return !memcmp_pages(page1, page2, 0); | |
+} | |
+ | |
+static inline int is_page_full_zero(struct page *page) | |
+{ | |
+ char *addr; | |
+ int ret; | |
+ | |
+ addr = kmap_atomic(page); | |
+ ret = is_full_zero(addr, PAGE_SIZE); | |
+ kunmap_atomic(addr); | |
+ | |
+ return ret; | |
+} | |
+ | |
+static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |
+ pte_t *orig_pte, pte_t *old_pte) | |
+{ | |
+ struct mm_struct *mm = vma->vm_mm; | |
+ unsigned long addr; | |
+ pte_t *ptep; | |
+ spinlock_t *ptl; | |
+ int swapped; | |
+ int err = -EFAULT; | |
+ unsigned long mmun_start; /* For mmu_notifiers */ | |
+ unsigned long mmun_end; /* For mmu_notifiers */ | |
+ | |
+ addr = page_address_in_vma(page, vma); | |
+ if (addr == -EFAULT) | |
+ goto out; | |
+ | |
+ BUG_ON(PageTransCompound(page)); | |
+ | |
+ mmun_start = addr; | |
+ mmun_end = addr + PAGE_SIZE; | |
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | |
+ | |
+ ptep = page_check_address(page, mm, addr, &ptl, 0); | |
+ if (!ptep) | |
+ goto out_mn; | |
+ | |
+ if (old_pte) | |
+ *old_pte = *ptep; | |
+ | |
+ if (pte_write(*ptep) || pte_dirty(*ptep)) { | |
+ pte_t entry; | |
+ | |
+ swapped = PageSwapCache(page); | |
+ flush_cache_page(vma, addr, page_to_pfn(page)); | |
+ /* | |
+		 * Ok this is tricky: when get_user_pages_fast() runs it doesn't | |
+		 * take any lock, therefore the check that we are going to make | |
+		 * with the pagecount against the mapcount is racy and | |
+		 * O_DIRECT can happen right after the check. | |
+		 * So we clear the pte and flush the tlb before the check; | |
+		 * this assures us that no O_DIRECT can happen after the check | |
+		 * or in the middle of the check. | |
+ */ | |
+ entry = ptep_clear_flush_notify(vma, addr, ptep); | |
+ /* | |
+ * Check that no O_DIRECT or similar I/O is in progress on the | |
+ * page | |
+ */ | |
+ if (page_mapcount(page) + 1 + swapped != page_count(page)) { | |
+ set_pte_at(mm, addr, ptep, entry); | |
+ goto out_unlock; | |
+ } | |
+ if (pte_dirty(entry)) | |
+ set_page_dirty(page); | |
+ entry = pte_mkclean(pte_wrprotect(entry)); | |
+ set_pte_at_notify(mm, addr, ptep, entry); | |
+ } | |
+ *orig_pte = *ptep; | |
+ err = 0; | |
+ | |
+out_unlock: | |
+ pte_unmap_unlock(ptep, ptl); | |
+out_mn: | |
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | |
+out: | |
+ return err; | |
+} | |
+ | |
+#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */ | |
+#define MERGE_ERR_COLLI 2 /* there is a collision */ | |
+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ | |
+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ | |
+ | |
+ | |
+/** | |
+ * replace_page - replace page in vma by new ksm page | |
+ * @vma: vma that holds the pte pointing to page | |
+ * @page: the page we are replacing by kpage | |
+ * @kpage: the ksm page we replace page by | |
+ * @orig_pte: the original value of the pte | |
+ * | |
+ * Returns 0 on success, MERGE_ERR_PGERR on failure. | |
+ */ | |
+static int replace_page(struct vm_area_struct *vma, struct page *page, | |
+ struct page *kpage, pte_t orig_pte) | |
+{ | |
+ struct mm_struct *mm = vma->vm_mm; | |
+ pgd_t *pgd; | |
+ pud_t *pud; | |
+ pmd_t *pmd; | |
+ pte_t *ptep; | |
+ spinlock_t *ptl; | |
+ pte_t entry; | |
+ | |
+ unsigned long addr; | |
+ int err = MERGE_ERR_PGERR; | |
+ unsigned long mmun_start; /* For mmu_notifiers */ | |
+ unsigned long mmun_end; /* For mmu_notifiers */ | |
+ | |
+ addr = page_address_in_vma(page, vma); | |
+ if (addr == -EFAULT) | |
+ goto out; | |
+ | |
+ pgd = pgd_offset(mm, addr); | |
+ if (!pgd_present(*pgd)) | |
+ goto out; | |
+ | |
+ pud = pud_offset(pgd, addr); | |
+ if (!pud_present(*pud)) | |
+ goto out; | |
+ | |
+ pmd = pmd_offset(pud, addr); | |
+ BUG_ON(pmd_trans_huge(*pmd)); | |
+ if (!pmd_present(*pmd)) | |
+ goto out; | |
+ | |
+ mmun_start = addr; | |
+ mmun_end = addr + PAGE_SIZE; | |
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | |
+ | |
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | |
+ if (!pte_same(*ptep, orig_pte)) { | |
+ pte_unmap_unlock(ptep, ptl); | |
+ goto out_mn; | |
+ } | |
+ | |
+ flush_cache_page(vma, addr, pte_pfn(*ptep)); | |
+ ptep_clear_flush_notify(vma, addr, ptep); | |
+ entry = mk_pte(kpage, vma->vm_page_prot); | |
+ | |
+ /* special treatment is needed for zero_page */ | |
+ if ((page_to_pfn(kpage) == uksm_zero_pfn) || | |
+ (page_to_pfn(kpage) == zero_pfn)) | |
+ entry = pte_mkspecial(entry); | |
+ else { | |
+ get_page(kpage); | |
+ page_add_anon_rmap(kpage, vma, addr, false); | |
+ } | |
+ | |
+ set_pte_at_notify(mm, addr, ptep, entry); | |
+ | |
+ page_remove_rmap(page, false); | |
+ if (!page_mapped(page)) | |
+ try_to_free_swap(page); | |
+ put_page(page); | |
+ | |
+ pte_unmap_unlock(ptep, ptl); | |
+ err = 0; | |
+out_mn: | |
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | |
+out: | |
+ return err; | |
+} | |
+ | |
+ | |
+/** | |
+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value. | |
+ * A zero hash value at HASH_STRENGTH_MAX is reserved to indicate that the | |
+ * hash_max member has not been calculated yet. | |
+ * | |
+ * @page The page to be hashed | |
+ * @hash_old The hash value calculated with current hash strength | |
+ * | |
+ * return the new hash value calculated at HASH_STRENGTH_MAX | |
+ */ | |
+static inline u32 page_hash_max(struct page *page, u32 hash_old) | |
+{ | |
+ u32 hash_max = 0; | |
+ void *addr; | |
+ | |
+ addr = kmap_atomic(page); | |
+ hash_max = delta_hash(addr, hash_strength, | |
+ HASH_STRENGTH_MAX, hash_old); | |
+ | |
+ kunmap_atomic(addr); | |
+ | |
+ if (!hash_max) | |
+ hash_max = 1; | |
+ | |
+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); | |
+ return hash_max; | |
+} | |
+ | |
+/* | |
+ * We compare the hash again, to ensure that it is really a hash collision | |
+ * instead of being caused by page write. | |
+ */ | |
+static inline int check_collision(struct rmap_item *rmap_item, | |
+ u32 hash) | |
+{ | |
+ int err; | |
+ struct page *page = rmap_item->page; | |
+ | |
+	/* If this rmap_item has already been hash_maxed, the collision must | |
+	 * have appeared in the second-level rbtree search. In this case we | |
+	 * check whether its hash_max value has changed. Otherwise, the | |
+	 * collision happened in the first-level rbtree search, so we check | |
+	 * against its current hash value. | |
+ */ | |
+ if (rmap_item->hash_max) { | |
+ inc_rshash_neg(memcmp_cost); | |
+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); | |
+ | |
+ if (rmap_item->hash_max == page_hash_max(page, hash)) | |
+ err = MERGE_ERR_COLLI; | |
+ else | |
+ err = MERGE_ERR_CHANGED; | |
+ } else { | |
+ inc_rshash_neg(memcmp_cost + hash_strength); | |
+ | |
+ if (page_hash(page, hash_strength, 0) == hash) | |
+ err = MERGE_ERR_COLLI; | |
+ else | |
+ err = MERGE_ERR_CHANGED; | |
+ } | |
+ | |
+ return err; | |
+} | |
+ | |
+static struct page *page_trans_compound_anon(struct page *page) | |
+{ | |
+ if (PageTransCompound(page)) { | |
+ struct page *head = compound_head(page); | |
+ /* | |
+		 * head may actually be split and freed from under | |
+		 * us, but that's ok here. | |
+ */ | |
+ if (PageAnon(head)) | |
+ return head; | |
+ } | |
+ return NULL; | |
+} | |
+ | |
+static int page_trans_compound_anon_split(struct page *page) | |
+{ | |
+ int ret = 0; | |
+ struct page *transhuge_head = page_trans_compound_anon(page); | |
+ if (transhuge_head) { | |
+ /* Get the reference on the head to split it. */ | |
+ if (get_page_unless_zero(transhuge_head)) { | |
+ /* | |
+ * Recheck we got the reference while the head | |
+ * was still anonymous. | |
+ */ | |
+ if (PageAnon(transhuge_head)) | |
+ ret = split_huge_page(transhuge_head); | |
+ else | |
+ /* | |
+				 * Retry later if split_huge_page ran | |
+				 * from under us. | |
+ */ | |
+ ret = 1; | |
+ put_page(transhuge_head); | |
+ } else | |
+			/* Retry later if split_huge_page ran from under us. */ | |
+ ret = 1; | |
+ } | |
+ return ret; | |
+} | |
+ | |
+/** | |
+ * Try to merge a rmap_item.page with a kpage in a stable node. kpage must | |
+ * already be a ksm page. | |
+ * | |
+ * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise. | |
+ */ | |
+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, | |
+ struct page *kpage, u32 hash) | |
+{ | |
+ struct vm_area_struct *vma = rmap_item->slot->vma; | |
+ struct mm_struct *mm = vma->vm_mm; | |
+ pte_t orig_pte = __pte(0); | |
+ int err = MERGE_ERR_PGERR; | |
+ struct page *page; | |
+ | |
+ if (uksm_test_exit(mm)) | |
+ goto out; | |
+ | |
+ page = rmap_item->page; | |
+ | |
+ if (page == kpage) { /* ksm page forked */ | |
+ err = 0; | |
+ goto out; | |
+ } | |
+ | |
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | |
+ goto out; | |
+ BUG_ON(PageTransCompound(page)); | |
+ | |
+ if (!PageAnon(page) || !PageKsm(kpage)) | |
+ goto out; | |
+ | |
+ /* | |
+ * We need the page lock to read a stable PageSwapCache in | |
+ * write_protect_page(). We use trylock_page() instead of | |
+ * lock_page() because we don't want to wait here - we | |
+ * prefer to continue scanning and merging different pages, | |
+ * then come back to this page when it is unlocked. | |
+ */ | |
+ if (!trylock_page(page)) | |
+ goto out; | |
+ /* | |
+ * If this anonymous page is mapped only here, its pte may need | |
+ * to be write-protected. If it's mapped elsewhere, all of its | |
+ * ptes are necessarily already write-protected. But in either | |
+ * case, we need to lock and check page_count is not raised. | |
+ */ | |
+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { | |
+ if (pages_identical(page, kpage)) | |
+ err = replace_page(vma, page, kpage, orig_pte); | |
+ else | |
+ err = check_collision(rmap_item, hash); | |
+ } | |
+ | |
+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { | |
+ munlock_vma_page(page); | |
+ if (!PageMlocked(kpage)) { | |
+ unlock_page(page); | |
+ lock_page(kpage); | |
+ mlock_vma_page(kpage); | |
+ page = kpage; /* for final unlock */ | |
+ } | |
+ } | |
+ | |
+ unlock_page(page); | |
+out: | |
+ return err; | |
+} | |
+ | |
+ | |
+ | |
+/** | |
+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance | |
+ * to restore a page mapping that has been changed in try_to_merge_two_pages. | |
+ * | |
+ * @return 0 on success. | |
+ */ | |
+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, | |
+ pte_t orig_pte, pte_t wprt_pte) | |
+{ | |
+ struct mm_struct *mm = vma->vm_mm; | |
+ pgd_t *pgd; | |
+ pud_t *pud; | |
+ pmd_t *pmd; | |
+ pte_t *ptep; | |
+ spinlock_t *ptl; | |
+ | |
+ int err = -EFAULT; | |
+ | |
+ pgd = pgd_offset(mm, addr); | |
+ if (!pgd_present(*pgd)) | |
+ goto out; | |
+ | |
+ pud = pud_offset(pgd, addr); | |
+ if (!pud_present(*pud)) | |
+ goto out; | |
+ | |
+ pmd = pmd_offset(pud, addr); | |
+ if (!pmd_present(*pmd)) | |
+ goto out; | |
+ | |
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | |
+ if (!pte_same(*ptep, wprt_pte)) { | |
+ /* already copied, let it be */ | |
+ pte_unmap_unlock(ptep, ptl); | |
+ goto out; | |
+ } | |
+ | |
+ /* | |
+	 * Good, the pte is still here. While we hold a reference on the ksm | |
+	 * page, it cannot return to the free page pool, so there is no way the | |
+	 * pte could have been changed to another page and then back to this | |
+	 * one. Also remember that ksm pages are never reused in do_wp_page(). | |
+	 * So it's safe to restore the original pte. | |
+ */ | |
+ flush_cache_page(vma, addr, pte_pfn(*ptep)); | |
+ ptep_clear_flush_notify(vma, addr, ptep); | |
+ set_pte_at_notify(mm, addr, ptep, orig_pte); | |
+ | |
+ pte_unmap_unlock(ptep, ptl); | |
+ err = 0; | |
+out: | |
+ return err; | |
+} | |
+ | |
+/** | |
+ * try_to_merge_two_pages() - take two identical pages and prepare | |
+ * them to be merged into one page (rmap_item->page) | |
+ * | |
+ * @return 0 if we successfully merged two identical pages into | |
+ *         one ksm page. MERGE_ERR_COLLI if it was only a hash collision | |
+ *         during the rbtree search. MERGE_ERR_CHANGED if rmap_item has | |
+ *         changed since it was hashed. MERGE_ERR_PGERR otherwise. | |
+ * | |
+ */ | |
+static int try_to_merge_two_pages(struct rmap_item *rmap_item, | |
+ struct rmap_item *tree_rmap_item, | |
+ u32 hash) | |
+{ | |
+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); | |
+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); | |
+ struct vm_area_struct *vma1 = rmap_item->slot->vma; | |
+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; | |
+ struct page *page = rmap_item->page; | |
+ struct page *tree_page = tree_rmap_item->page; | |
+ int err = MERGE_ERR_PGERR; | |
+ struct address_space *saved_mapping; | |
+ | |
+ | |
+ if (rmap_item->page == tree_rmap_item->page) | |
+ goto out; | |
+ | |
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | |
+ goto out; | |
+ BUG_ON(PageTransCompound(page)); | |
+ | |
+ if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) | |
+ goto out; | |
+ BUG_ON(PageTransCompound(tree_page)); | |
+ | |
+ if (!PageAnon(page) || !PageAnon(tree_page)) | |
+ goto out; | |
+ | |
+ if (!trylock_page(page)) | |
+ goto out; | |
+ | |
+ | |
+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { | |
+ unlock_page(page); | |
+ goto out; | |
+ } | |
+ | |
+ /* | |
+ * While we hold page lock, upgrade page from | |
+ * PageAnon+anon_vma to PageKsm+NULL stable_node: | |
+ * stable_tree_insert() will update stable_node. | |
+ */ | |
+ saved_mapping = page->mapping; | |
+ set_page_stable_node(page, NULL); | |
+ mark_page_accessed(page); | |
+ unlock_page(page); | |
+ | |
+ if (!trylock_page(tree_page)) | |
+ goto restore_out; | |
+ | |
+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { | |
+ unlock_page(tree_page); | |
+ goto restore_out; | |
+ } | |
+ | |
+ if (pages_identical(page, tree_page)) { | |
+ err = replace_page(vma2, tree_page, page, wprt_pte2); | |
+ if (err) { | |
+ unlock_page(tree_page); | |
+ goto restore_out; | |
+ } | |
+ | |
+ if ((vma2->vm_flags & VM_LOCKED)) { | |
+ munlock_vma_page(tree_page); | |
+ if (!PageMlocked(page)) { | |
+ unlock_page(tree_page); | |
+ lock_page(page); | |
+ mlock_vma_page(page); | |
+ tree_page = page; /* for final unlock */ | |
+ } | |
+ } | |
+ | |
+ unlock_page(tree_page); | |
+ | |
+ goto out; /* success */ | |
+ | |
+ } else { | |
+ if (tree_rmap_item->hash_max && | |
+ tree_rmap_item->hash_max == rmap_item->hash_max) { | |
+ err = MERGE_ERR_COLLI_MAX; | |
+ } else if (page_hash(page, hash_strength, 0) == | |
+ page_hash(tree_page, hash_strength, 0)) { | |
+ inc_rshash_neg(memcmp_cost + hash_strength * 2); | |
+ err = MERGE_ERR_COLLI; | |
+ } else { | |
+ err = MERGE_ERR_CHANGED; | |
+ } | |
+ | |
+ unlock_page(tree_page); | |
+ } | |
+ | |
+restore_out: | |
+ lock_page(page); | |
+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), | |
+ orig_pte1, wprt_pte1)) | |
+ page->mapping = saved_mapping; | |
+ | |
+ unlock_page(page); | |
+out: | |
+ return err; | |
+} | |
+ | |
+static inline int hash_cmp(u32 new_val, u32 node_val) | |
+{ | |
+ if (new_val > node_val) | |
+ return 1; | |
+ else if (new_val < node_val) | |
+ return -1; | |
+ else | |
+ return 0; | |
+} | |
+ | |
+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) | |
+{ | |
+ u32 hash_max = item->hash_max; | |
+ | |
+ if (!hash_max) { | |
+ hash_max = page_hash_max(item->page, hash); | |
+ | |
+ item->hash_max = hash_max; | |
+ } | |
+ | |
+ return hash_max; | |
+} | |
+ | |
+ | |
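+/* | |
+ * Both the stable and unstable trees are two levels deep: the first-level | |
+ * rbtree of tree_nodes is keyed by the hash sampled at the current hash | |
+ * strength, and each tree_node holds a sub_root rbtree keyed by hash_max, | |
+ * the hash at HASH_STRENGTH_MAX, which is only computed when a first-level | |
+ * collision occurs. | |
+ */ | |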
+ | |
+/** | |
+ * stable_tree_search() - search the stable tree for a page | |
+ * | |
+ * @item: the rmap_item we are comparing with | |
+ * @hash: the hash value of this item->page already calculated | |
+ * | |
+ * @return the page we have found, NULL otherwise. A reference has already | |
+ * been taken on the returned page. | |
+ */ | |
+static struct page *stable_tree_search(struct rmap_item *item, u32 hash) | |
+{ | |
+ struct rb_node *node = root_stable_treep->rb_node; | |
+ struct tree_node *tree_node; | |
+ unsigned long hash_max; | |
+ struct page *page = item->page; | |
+ struct stable_node *stable_node; | |
+ | |
+ stable_node = page_stable_node(page); | |
+ if (stable_node) { | |
+ /* ksm page forked, that is | |
+ * if (PageKsm(page) && !in_stable_tree(rmap_item)) | |
+ * it's actually gotten once outside. | |
+ */ | |
+ get_page(page); | |
+ return page; | |
+ } | |
+ | |
+ while (node) { | |
+ int cmp; | |
+ | |
+ tree_node = rb_entry(node, struct tree_node, node); | |
+ | |
+ cmp = hash_cmp(hash, tree_node->hash); | |
+ | |
+ if (cmp < 0) | |
+ node = node->rb_left; | |
+ else if (cmp > 0) | |
+ node = node->rb_right; | |
+ else | |
+ break; | |
+ } | |
+ | |
+ if (!node) | |
+ return NULL; | |
+ | |
+ if (tree_node->count == 1) { | |
+ stable_node = rb_entry(tree_node->sub_root.rb_node, | |
+ struct stable_node, node); | |
+ BUG_ON(!stable_node); | |
+ | |
+ goto get_page_out; | |
+ } | |
+ | |
+ /* | |
+	 * ok, we have to search the second-level | |
+	 * subtree; hash the page to full | |
+	 * strength. | |
+ */ | |
+ node = tree_node->sub_root.rb_node; | |
+ BUG_ON(!node); | |
+ hash_max = rmap_item_hash_max(item, hash); | |
+ | |
+ while (node) { | |
+ int cmp; | |
+ | |
+ stable_node = rb_entry(node, struct stable_node, node); | |
+ | |
+ cmp = hash_cmp(hash_max, stable_node->hash_max); | |
+ | |
+ if (cmp < 0) | |
+ node = node->rb_left; | |
+ else if (cmp > 0) | |
+ node = node->rb_right; | |
+ else | |
+ goto get_page_out; | |
+ } | |
+ | |
+ return NULL; | |
+ | |
+get_page_out: | |
+ page = get_uksm_page(stable_node, 1, 1); | |
+ return page; | |
+} | |
+ | |
+static int try_merge_rmap_item(struct rmap_item *item, | |
+ struct page *kpage, | |
+ struct page *tree_page) | |
+{ | |
+ spinlock_t *ptl; | |
+ pte_t *ptep; | |
+ unsigned long addr; | |
+ struct vm_area_struct *vma = item->slot->vma; | |
+ | |
+ addr = get_rmap_addr(item); | |
+ ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); | |
+ if (!ptep) | |
+ return 0; | |
+ | |
+ if (pte_write(*ptep)) { | |
+ /* has changed, abort! */ | |
+ pte_unmap_unlock(ptep, ptl); | |
+ return 0; | |
+ } | |
+ | |
+ get_page(tree_page); | |
+ page_add_anon_rmap(tree_page, vma, addr, false); | |
+ | |
+ flush_cache_page(vma, addr, pte_pfn(*ptep)); | |
+ ptep_clear_flush_notify(vma, addr, ptep); | |
+ set_pte_at_notify(vma->vm_mm, addr, ptep, | |
+ mk_pte(tree_page, vma->vm_page_prot)); | |
+ | |
+ page_remove_rmap(kpage, false); | |
+ put_page(kpage); | |
+ | |
+ pte_unmap_unlock(ptep, ptl); | |
+ | |
+ return 1; | |
+} | |
+ | |
+/** | |
+ * try_merge_with_stable() - when two rmap_items need to be inserted | |
+ * into the stable tree and the page was found to be identical to a stable | |
+ * ksm page, this is the last chance to merge them into one. | |
+ * | |
+ * @item1:	the rmap_item holding the page which we wanted to insert | |
+ *		into the stable tree. | |
+ * @item2:	the other rmap_item we found during the unstable tree search | |
+ * @kpage:	the page currently mapped by the two rmap_items; on success it | |
+ *		is replaced by tree_page for the caller | |
+ * @tree_page:	the page we found identical in a stable tree node | |
+ * @success1:	set to 1 if item1 is successfully merged | |
+ * @success2:	set to 1 if item2 is successfully merged | |
+ */ | |
+static void try_merge_with_stable(struct rmap_item *item1, | |
+ struct rmap_item *item2, | |
+ struct page **kpage, | |
+ struct page *tree_page, | |
+ int *success1, int *success2) | |
+{ | |
+ struct vm_area_struct *vma1 = item1->slot->vma; | |
+ struct vm_area_struct *vma2 = item2->slot->vma; | |
+ *success1 = 0; | |
+ *success2 = 0; | |
+ | |
+ if (unlikely(*kpage == tree_page)) { | |
+ /* I don't think this can really happen */ | |
+ printk(KERN_WARNING "UKSM: unexpected condition detected in " | |
+ "try_merge_with_stable() -- *kpage == tree_page !\n"); | |
+ *success1 = 1; | |
+ *success2 = 1; | |
+ return; | |
+ } | |
+ | |
+ if (!PageAnon(*kpage) || !PageKsm(*kpage)) | |
+ goto failed; | |
+ | |
+ if (!trylock_page(tree_page)) | |
+ goto failed; | |
+ | |
+	/* If the old page is still a ksm page, still pointed | |
+	 * to from the right place, and still write protected, | |
+	 * we are confident it has not changed, so there is no | |
+	 * need to memcmp anymore. | |
+	 * Beware: we cannot take nested pte locks, | |
+	 * deadlock risk. | |
+ */ | |
+ if (!try_merge_rmap_item(item1, *kpage, tree_page)) | |
+ goto unlock_failed; | |
+ | |
+ /* ok, then vma2, remind that pte1 already set */ | |
+ if (!try_merge_rmap_item(item2, *kpage, tree_page)) | |
+ goto success_1; | |
+ | |
+ *success2 = 1; | |
+success_1: | |
+ *success1 = 1; | |
+ | |
+ | |
+ if ((*success1 && vma1->vm_flags & VM_LOCKED) || | |
+ (*success2 && vma2->vm_flags & VM_LOCKED)) { | |
+ munlock_vma_page(*kpage); | |
+ if (!PageMlocked(tree_page)) | |
+ mlock_vma_page(tree_page); | |
+ } | |
+ | |
+ /* | |
+ * We do not need oldpage any more in the caller, so can break the lock | |
+ * now. | |
+ */ | |
+ unlock_page(*kpage); | |
+ *kpage = tree_page; /* Get unlocked outside. */ | |
+ return; | |
+ | |
+unlock_failed: | |
+ unlock_page(tree_page); | |
+failed: | |
+ return; | |
+} | |
+ | |
+static inline void stable_node_hash_max(struct stable_node *node, | |
+ struct page *page, u32 hash) | |
+{ | |
+ u32 hash_max = node->hash_max; | |
+ | |
+ if (!hash_max) { | |
+ hash_max = page_hash_max(page, hash); | |
+ node->hash_max = hash_max; | |
+ } | |
+} | |
+ | |
+static inline | |
+struct stable_node *new_stable_node(struct tree_node *tree_node, | |
+ struct page *kpage, u32 hash_max) | |
+{ | |
+ struct stable_node *new_stable_node; | |
+ | |
+ new_stable_node = alloc_stable_node(); | |
+ if (!new_stable_node) | |
+ return NULL; | |
+ | |
+ new_stable_node->kpfn = page_to_pfn(kpage); | |
+ new_stable_node->hash_max = hash_max; | |
+ new_stable_node->tree_node = tree_node; | |
+ set_page_stable_node(kpage, new_stable_node); | |
+ | |
+ return new_stable_node; | |
+} | |
+ | |
+static inline | |
+struct stable_node *first_level_insert(struct tree_node *tree_node, | |
+ struct rmap_item *rmap_item, | |
+ struct rmap_item *tree_rmap_item, | |
+ struct page **kpage, u32 hash, | |
+ int *success1, int *success2) | |
+{ | |
+ int cmp; | |
+ struct page *tree_page; | |
+ u32 hash_max = 0; | |
+ struct stable_node *stable_node, *new_snode; | |
+ struct rb_node *parent = NULL, **new; | |
+ | |
+ /* this tree node contains no sub-tree yet */ | |
+ stable_node = rb_entry(tree_node->sub_root.rb_node, | |
+ struct stable_node, node); | |
+ | |
+ tree_page = get_uksm_page(stable_node, 1, 0); | |
+ if (tree_page) { | |
+ cmp = memcmp_pages(*kpage, tree_page, 1); | |
+ if (!cmp) { | |
+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, | |
+ tree_page, success1, success2); | |
+ put_page(tree_page); | |
+ if (!*success1 && !*success2) | |
+ goto failed; | |
+ | |
+ return stable_node; | |
+ | |
+ } else { | |
+ /* | |
+			 * collision at the first level; try to create a subtree. | |
+			 * A new node needs to be created. | |
+ */ | |
+ put_page(tree_page); | |
+ | |
+ stable_node_hash_max(stable_node, tree_page, | |
+ tree_node->hash); | |
+ hash_max = rmap_item_hash_max(rmap_item, hash); | |
+ cmp = hash_cmp(hash_max, stable_node->hash_max); | |
+ | |
+ parent = &stable_node->node; | |
+ if (cmp < 0) { | |
+ new = &parent->rb_left; | |
+ } else if (cmp > 0) { | |
+ new = &parent->rb_right; | |
+ } else { | |
+ goto failed; | |
+ } | |
+ } | |
+ | |
+ } else { | |
+		/* the only stable_node has been deleted; we reuse its tree_node. | |
+ */ | |
+ parent = NULL; | |
+ new = &tree_node->sub_root.rb_node; | |
+ } | |
+ | |
+ new_snode = new_stable_node(tree_node, *kpage, hash_max); | |
+ if (!new_snode) | |
+ goto failed; | |
+ | |
+ rb_link_node(&new_snode->node, parent, new); | |
+ rb_insert_color(&new_snode->node, &tree_node->sub_root); | |
+ tree_node->count++; | |
+ *success1 = *success2 = 1; | |
+ | |
+ return new_snode; | |
+ | |
+failed: | |
+ return NULL; | |
+} | |
+ | |
+static inline | |
+struct stable_node *stable_subtree_insert(struct tree_node *tree_node, | |
+ struct rmap_item *rmap_item, | |
+ struct rmap_item *tree_rmap_item, | |
+ struct page **kpage, u32 hash, | |
+ int *success1, int *success2) | |
+{ | |
+ struct page *tree_page; | |
+ u32 hash_max; | |
+ struct stable_node *stable_node, *new_snode; | |
+ struct rb_node *parent, **new; | |
+ | |
+research: | |
+ parent = NULL; | |
+ new = &tree_node->sub_root.rb_node; | |
+ BUG_ON(!*new); | |
+ hash_max = rmap_item_hash_max(rmap_item, hash); | |
+ while (*new) { | |
+ int cmp; | |
+ | |
+ stable_node = rb_entry(*new, struct stable_node, node); | |
+ | |
+ cmp = hash_cmp(hash_max, stable_node->hash_max); | |
+ | |
+ if (cmp < 0) { | |
+ parent = *new; | |
+ new = &parent->rb_left; | |
+ } else if (cmp > 0) { | |
+ parent = *new; | |
+ new = &parent->rb_right; | |
+ } else { | |
+ tree_page = get_uksm_page(stable_node, 1, 0); | |
+ if (tree_page) { | |
+ cmp = memcmp_pages(*kpage, tree_page, 1); | |
+ if (!cmp) { | |
+ try_merge_with_stable(rmap_item, | |
+ tree_rmap_item, kpage, | |
+ tree_page, success1, success2); | |
+ | |
+ put_page(tree_page); | |
+ if (!*success1 && !*success2) | |
+ goto failed; | |
+ /* | |
+ * successfully merged with a stable | |
+ * node | |
+ */ | |
+ return stable_node; | |
+ } else { | |
+ put_page(tree_page); | |
+ goto failed; | |
+ } | |
+ } else { | |
+ /* | |
+				 * the stable node may have been | |
+				 * deleted and the subtree | |
+				 * restructured; we cannot | |
+				 * continue, so re-search it. | |
+ */ | |
+ if (tree_node->count) { | |
+ goto research; | |
+ } else { | |
+ /* reuse the tree node*/ | |
+ parent = NULL; | |
+ new = &tree_node->sub_root.rb_node; | |
+ } | |
+ } | |
+ } | |
+ } | |
+ | |
+ new_snode = new_stable_node(tree_node, *kpage, hash_max); | |
+ if (!new_snode) | |
+ goto failed; | |
+ | |
+ rb_link_node(&new_snode->node, parent, new); | |
+ rb_insert_color(&new_snode->node, &tree_node->sub_root); | |
+ tree_node->count++; | |
+ *success1 = *success2 = 1; | |
+ | |
+ return new_snode; | |
+ | |
+failed: | |
+ return NULL; | |
+} | |
+ | |
+ | |
+/** | |
+ * stable_tree_insert() - try to insert a page just merged via the unstable | |
+ * tree into the stable tree | |
+ * | |
+ * @kpage: the page to be inserted | |
+ * @hash: the current hash of this page | |
+ * @rmap_item: the rmap_item being scanned | |
+ * @tree_rmap_item: the rmap_item found on unstable tree | |
+ * @success1: return if rmap_item is merged | |
+ * @success2: return if tree_rmap_item is merged | |
+ * | |
+ * @return the stable_node on stable tree if at least one | |
+ * rmap_item is inserted into stable tree, NULL | |
+ * otherwise. | |
+ */ | |
+static struct stable_node * | |
+stable_tree_insert(struct page **kpage, u32 hash, | |
+ struct rmap_item *rmap_item, | |
+ struct rmap_item *tree_rmap_item, | |
+ int *success1, int *success2) | |
+{ | |
+ struct rb_node **new = &root_stable_treep->rb_node; | |
+ struct rb_node *parent = NULL; | |
+ struct stable_node *stable_node; | |
+ struct tree_node *tree_node; | |
+ u32 hash_max = 0; | |
+ | |
+ *success1 = *success2 = 0; | |
+ | |
+ while (*new) { | |
+ int cmp; | |
+ | |
+ tree_node = rb_entry(*new, struct tree_node, node); | |
+ | |
+ cmp = hash_cmp(hash, tree_node->hash); | |
+ | |
+ if (cmp < 0) { | |
+ parent = *new; | |
+ new = &parent->rb_left; | |
+ } else if (cmp > 0) { | |
+ parent = *new; | |
+ new = &parent->rb_right; | |
+ } else | |
+ break; | |
+ } | |
+ | |
+ if (*new) { | |
+ if (tree_node->count == 1) { | |
+ stable_node = first_level_insert(tree_node, rmap_item, | |
+ tree_rmap_item, kpage, | |
+ hash, success1, success2); | |
+ } else { | |
+ stable_node = stable_subtree_insert(tree_node, | |
+ rmap_item, tree_rmap_item, kpage, | |
+ hash, success1, success2); | |
+ } | |
+ } else { | |
+ | |
+ /* no tree node found */ | |
+ tree_node = alloc_tree_node(stable_tree_node_listp); | |
+ if (!tree_node) { | |
+ stable_node = NULL; | |
+ goto out; | |
+ } | |
+ | |
+ stable_node = new_stable_node(tree_node, *kpage, hash_max); | |
+ if (!stable_node) { | |
+ free_tree_node(tree_node); | |
+ goto out; | |
+ } | |
+ | |
+ tree_node->hash = hash; | |
+ rb_link_node(&tree_node->node, parent, new); | |
+ rb_insert_color(&tree_node->node, root_stable_treep); | |
+ parent = NULL; | |
+ new = &tree_node->sub_root.rb_node; | |
+ | |
+ rb_link_node(&stable_node->node, parent, new); | |
+ rb_insert_color(&stable_node->node, &tree_node->sub_root); | |
+ tree_node->count++; | |
+ *success1 = *success2 = 1; | |
+ } | |
+ | |
+out: | |
+ return stable_node; | |
+} | |
+ | |
+ | |
+/** | |
+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem | |
+ * | |
+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, | |
+ * -EINVAL if the page mapping has been changed. | |
+ */ | |
+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) | |
+{ | |
+ int err; | |
+ | |
+ err = get_mergeable_page_lock_mmap(tree_rmap_item); | |
+ | |
+ if (err == -EINVAL) { | |
+ /* its page map has been changed, remove it */ | |
+ remove_rmap_item_from_tree(tree_rmap_item); | |
+ } | |
+ | |
+ /* On success, the page is held and the mmap_sem is read-locked. */ | |
+ return err; | |
+} | |
+ | |
+ | |
+/** | |
+ * unstable_tree_search_insert() - search the unstable tree for an rmap_item | |
+ * with the same hash value; get its page and trylock its mmap_sem | |
+ */ | |
+static inline | |
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |
+ u32 hash) | |
+ | |
+{ | |
+ struct rb_node **new = &root_unstable_tree.rb_node; | |
+ struct rb_node *parent = NULL; | |
+ struct tree_node *tree_node; | |
+ u32 hash_max; | |
+ struct rmap_item *tree_rmap_item; | |
+ | |
+ while (*new) { | |
+ int cmp; | |
+ | |
+ tree_node = rb_entry(*new, struct tree_node, node); | |
+ | |
+ cmp = hash_cmp(hash, tree_node->hash); | |
+ | |
+ if (cmp < 0) { | |
+ parent = *new; | |
+ new = &parent->rb_left; | |
+ } else if (cmp > 0) { | |
+ parent = *new; | |
+ new = &parent->rb_right; | |
+ } else | |
+ break; | |
+ } | |
+ | |
+ if (*new) { | |
+ /* got the tree_node */ | |
+ if (tree_node->count == 1) { | |
+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, | |
+ struct rmap_item, node); | |
+ BUG_ON(!tree_rmap_item); | |
+ | |
+ goto get_page_out; | |
+ } | |
+ | |
+ /* well, search the collision subtree */ | |
+ new = &tree_node->sub_root.rb_node; | |
+ BUG_ON(!*new); | |
+ hash_max = rmap_item_hash_max(rmap_item, hash); | |
+ | |
+ while (*new) { | |
+ int cmp; | |
+ | |
+ tree_rmap_item = rb_entry(*new, struct rmap_item, | |
+ node); | |
+ | |
+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); | |
+ parent = *new; | |
+ if (cmp < 0) | |
+ new = &parent->rb_left; | |
+ else if (cmp > 0) | |
+ new = &parent->rb_right; | |
+ else | |
+ goto get_page_out; | |
+ } | |
+ } else { | |
+ /* alloc a new tree_node */ | |
+ tree_node = alloc_tree_node(&unstable_tree_node_list); | |
+ if (!tree_node) | |
+ return NULL; | |
+ | |
+ tree_node->hash = hash; | |
+ rb_link_node(&tree_node->node, parent, new); | |
+ rb_insert_color(&tree_node->node, &root_unstable_tree); | |
+ parent = NULL; | |
+ new = &tree_node->sub_root.rb_node; | |
+ } | |
+ | |
+ /* not found even in the sub-tree */ | |
+ rmap_item->tree_node = tree_node; | |
+ rmap_item->address |= UNSTABLE_FLAG; | |
+ rmap_item->hash_round = uksm_hash_round; | |
+ rb_link_node(&rmap_item->node, parent, new); | |
+ rb_insert_color(&rmap_item->node, &tree_node->sub_root); | |
+ | |
+ uksm_pages_unshared++; | |
+ return NULL; | |
+ | |
+get_page_out: | |
+ if (tree_rmap_item->page == rmap_item->page) | |
+ return NULL; | |
+ | |
+ if (get_tree_rmap_item_page(tree_rmap_item)) | |
+ return NULL; | |
+ | |
+ return tree_rmap_item; | |
+} | |
+ | |
+static void hold_anon_vma(struct rmap_item *rmap_item, | |
+ struct anon_vma *anon_vma) | |
+{ | |
+ rmap_item->anon_vma = anon_vma; | |
+ get_anon_vma(anon_vma); | |
+} | |
+ | |
+ | |
+/** | |
+ * stable_tree_append() - append an rmap_item to a stable node. Deduplication | |
+ * ratio statistics are also updated in this function. | |
+ * | |
+ */ | |
+static void stable_tree_append(struct rmap_item *rmap_item, | |
+ struct stable_node *stable_node, int logdedup) | |
+{ | |
+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; | |
+ unsigned long key = (unsigned long)rmap_item->slot; | |
+ unsigned long factor = rmap_item->slot->rung->step; | |
+ | |
+ BUG_ON(!stable_node); | |
+ rmap_item->address |= STABLE_FLAG; | |
+ | |
+ if (hlist_empty(&stable_node->hlist)) { | |
+ uksm_pages_shared++; | |
+ goto node_vma_new; | |
+ } else { | |
+ uksm_pages_sharing++; | |
+ } | |
+ | |
+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { | |
+ if (node_vma->key >= key) | |
+ break; | |
+ | |
+ if (logdedup) { | |
+ node_vma->slot->pages_bemerged += factor; | |
+ if (list_empty(&node_vma->slot->dedup_list)) | |
+ list_add(&node_vma->slot->dedup_list, | |
+ &vma_slot_dedup); | |
+ } | |
+ } | |
+ | |
+ if (node_vma) { | |
+ if (node_vma->key == key) { | |
+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); | |
+ goto node_vma_ok; | |
+ } else if (node_vma->key > key) { | |
+ node_vma_cont = node_vma; | |
+ } | |
+ } | |
+ | |
+node_vma_new: | |
+ /* no same vma already in node, alloc a new node_vma */ | |
+ new_node_vma = alloc_node_vma(); | |
+ BUG_ON(!new_node_vma); | |
+ new_node_vma->head = stable_node; | |
+ new_node_vma->slot = rmap_item->slot; | |
+ | |
+ if (!node_vma) { | |
+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); | |
+ } else if (node_vma->key != key) { | |
+ if (node_vma->key < key) | |
+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); | |
+ else { | |
+ hlist_add_before(&new_node_vma->hlist, | |
+ &node_vma->hlist); | |
+ } | |
+ | |
+ } | |
+ node_vma = new_node_vma; | |
+ | |
+node_vma_ok: /* ok, ready to add to the list */ | |
+ rmap_item->head = node_vma; | |
+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); | |
+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); | |
+ if (logdedup) { | |
+ rmap_item->slot->pages_merged++; | |
+ if (node_vma_cont) { | |
+ node_vma = node_vma_cont; | |
+ hlist_for_each_entry_continue(node_vma, hlist) { | |
+ node_vma->slot->pages_bemerged += factor; | |
+ if (list_empty(&node_vma->slot->dedup_list)) | |
+ list_add(&node_vma->slot->dedup_list, | |
+ &vma_slot_dedup); | |
+ } | |
+ } | |
+ } | |
+} | |
+ | |
+/* | |
+ * We use break_ksm to break COW on a ksm page: it's a stripped down | |
+ * | |
+ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) | |
+ * put_page(page); | |
+ * | |
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, | |
+ * in case the application has unmapped and remapped mm,addr meanwhile. | |
+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP | |
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. | |
+ */ | |
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |
+{ | |
+ struct page *page; | |
+ int ret = 0; | |
+ | |
+ do { | |
+ cond_resched(); | |
+ page = follow_page(vma, addr, FOLL_GET); | |
+ if (IS_ERR_OR_NULL(page)) | |
+ break; | |
+ if (PageKsm(page)) { | |
+ ret = handle_mm_fault(vma->vm_mm, vma, addr, | |
+ FAULT_FLAG_WRITE); | |
+ } else | |
+ ret = VM_FAULT_WRITE; | |
+ put_page(page); | |
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); | |
+ /* | |
+ * We must loop because handle_mm_fault() may back out if there's | |
+ * any difficulty e.g. if pte accessed bit gets updated concurrently. | |
+ * | |
+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that | |
+ * COW has been broken, even if the vma does not permit VM_WRITE; | |
+ * but note that a concurrent fault might break PageKsm for us. | |
+ * | |
+ * VM_FAULT_SIGBUS could occur if we race with truncation of the | |
+ * backing file, which also invalidates anonymous pages: that's | |
+ * okay, that truncation will have unmapped the PageKsm for us. | |
+ * | |
+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting | |
+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the | |
+ * current task has TIF_MEMDIE set, and will be OOM killed on return | |
+ * to user; and ksmd, having no mm, would never be chosen for that. | |
+ * | |
+ * But if the mm is in a limited mem_cgroup, then the fault may fail | |
+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and | |
+ * even ksmd can fail in this way - though it's usually breaking ksm | |
+ * just to undo a merge it made a moment before, so unlikely to oom. | |
+ * | |
+ * That's a pity: we might therefore have more kernel pages allocated | |
+ * than we're counting as nodes in the stable tree; but uksm_do_scan | |
+ * will retry to break_cow on each pass, so should recover the page | |
+ * in due course. The important thing is to not let VM_MERGEABLE | |
+ * be cleared while any such pages might remain in the area. | |
+ */ | |
+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | |
+} | |
+ | |
+static void break_cow(struct rmap_item *rmap_item) | |
+{ | |
+ struct vm_area_struct *vma = rmap_item->slot->vma; | |
+ struct mm_struct *mm = vma->vm_mm; | |
+ unsigned long addr = get_rmap_addr(rmap_item); | |
+ | |
+ if (uksm_test_exit(mm)) | |
+ goto out; | |
+ | |
+ break_ksm(vma, addr); | |
+out: | |
+ return; | |
+} | |
+ | |
+/* | |
+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | |
+ * than check every pte of a given vma, the locking doesn't quite work for | |
+ * that - an rmap_item is assigned to the stable tree after inserting ksm | |
+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | |
+ * rmap_items from parent to child at fork time (so as not to waste time | |
+ * if exit comes before the next scan reaches it). | |
+ * | |
+ * Similarly, although we'd like to remove rmap_items (so updating counts | |
+ * and freeing memory) when unmerging an area, it's easier to leave that | |
+ * to the next pass of ksmd - consider, for example, how ksmd might be | |
+ * in cmp_and_merge_page on one of the rmap_items we would be removing. | |
+ */ | |
+inline int unmerge_uksm_pages(struct vm_area_struct *vma, | |
+ unsigned long start, unsigned long end) | |
+{ | |
+ unsigned long addr; | |
+ int err = 0; | |
+ | |
+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { | |
+ if (uksm_test_exit(vma->vm_mm)) | |
+ break; | |
+ if (signal_pending(current)) | |
+ err = -ERESTARTSYS; | |
+ else | |
+ err = break_ksm(vma, addr); | |
+ } | |
+ return err; | |
+} | |
+ | |
+static inline void inc_uksm_pages_scanned(void) | |
+{ | |
+ u64 delta; | |
+ | |
+ | |
+ if (uksm_pages_scanned == U64_MAX) { | |
+ encode_benefit(); | |
+ | |
+ delta = uksm_pages_scanned >> pages_scanned_base; | |
+ | |
+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { | |
+ pages_scanned_stored >>= 1; | |
+ delta >>= 1; | |
+ pages_scanned_base++; | |
+ } | |
+ | |
+ pages_scanned_stored += delta; | |
+ | |
+ uksm_pages_scanned = uksm_pages_scanned_last = 0; | |
+ } | |
+ | |
+ uksm_pages_scanned++; | |
+} | |
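+ | |
+/* | |
+ * For illustration: when uksm_pages_scanned saturates, it is folded into | |
+ * pages_scanned_stored in units of 2^pages_scanned_base pages, so the | |
+ * lifetime total is roughly | |
+ *   (pages_scanned_stored << pages_scanned_base) + uksm_pages_scanned. | |
+ */ | |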
+ | |
+static inline int find_zero_page_hash(int strength, u32 hash) | |
+{ | |
+ return (zero_hash_table[strength] == hash); | |
+} | |
+ | |
+static | |
+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) | |
+{ | |
+ struct page *zero_page = empty_uksm_zero_page; | |
+ struct mm_struct *mm = vma->vm_mm; | |
+ pte_t orig_pte = __pte(0); | |
+ int err = -EFAULT; | |
+ | |
+ if (uksm_test_exit(mm)) | |
+ goto out; | |
+ | |
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | |
+ goto out; | |
+ BUG_ON(PageTransCompound(page)); | |
+ | |
+ if (!PageAnon(page)) | |
+ goto out; | |
+ | |
+ if (!trylock_page(page)) | |
+ goto out; | |
+ | |
+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { | |
+ if (is_page_full_zero(page)) | |
+ err = replace_page(vma, page, zero_page, orig_pte); | |
+ } | |
+ | |
+ unlock_page(page); | |
+out: | |
+ return err; | |
+} | |
+ | |
+/* | |
+ * cmp_and_merge_page() - first see if page can be merged into the stable | |
+ * tree; if not, compare hash to previous and if it's the same, see if page | |
+ * can be inserted into the unstable tree, or merged with a page already there | |
+ * and both transferred to the stable tree. | |
+ * | |
+ * @page: the page that we are searching identical page to. | |
+ * @rmap_item: the reverse mapping into the virtual address of this page | |
+ */ | |
+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) | |
+{ | |
+ struct rmap_item *tree_rmap_item; | |
+ struct page *page; | |
+ struct page *kpage = NULL; | |
+ u32 hash_max; | |
+ int err; | |
+ unsigned int success1, success2; | |
+ struct stable_node *snode; | |
+ int cmp; | |
+ struct rb_node *parent = NULL, **new; | |
+ | |
+ remove_rmap_item_from_tree(rmap_item); | |
+ page = rmap_item->page; | |
+ | |
+ /* We first start with searching the page inside the stable tree */ | |
+ kpage = stable_tree_search(rmap_item, hash); | |
+ if (kpage) { | |
+ err = try_to_merge_with_uksm_page(rmap_item, kpage, | |
+ hash); | |
+ if (!err) { | |
+ /* | |
+ * The page was successfully merged, add | |
+ * its rmap_item to the stable tree. | |
+ * page lock is needed because it's | |
+ * racing with try_to_unmap_ksm(), etc. | |
+ */ | |
+ lock_page(kpage); | |
+ snode = page_stable_node(kpage); | |
+ stable_tree_append(rmap_item, snode, 1); | |
+ unlock_page(kpage); | |
+ put_page(kpage); | |
+ return; /* success */ | |
+ } | |
+ put_page(kpage); | |
+ | |
+ /* | |
+ * If it's a collision and it has already been searched in the | |
+ * sub-rbtree (hash_max != 0), we want to abort, because if it | |
+ * were merged in the unstable tree, the collision would tend to | |
+ * happen again. | |
+ */ | |
+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) | |
+ return; | |
+ } | |
+ | |
+ tree_rmap_item = | |
+ unstable_tree_search_insert(rmap_item, hash); | |
+ if (tree_rmap_item) { | |
+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); | |
+ /* | |
+ * As soon as we merge this page, we want to remove the | |
+ * rmap_item of the page we have merged with from the unstable | |
+ * tree, and insert it instead as new node in the stable tree. | |
+ */ | |
+ if (!err) { | |
+ kpage = page; | |
+ remove_rmap_item_from_tree(tree_rmap_item); | |
+ lock_page(kpage); | |
+ snode = stable_tree_insert(&kpage, hash, | |
+ rmap_item, tree_rmap_item, | |
+ &success1, &success2); | |
+ | |
+ /* | |
+ * Do not log dedup for tree item, it's not counted as | |
+ * scanned in this round. | |
+ */ | |
+ if (success2) | |
+ stable_tree_append(tree_rmap_item, snode, 0); | |
+ | |
+ /* | |
+ * The order of these two stable append is important: | |
+ * we are scanning rmap_item. | |
+ */ | |
+ if (success1) | |
+ stable_tree_append(rmap_item, snode, 1); | |
+ | |
+ /* | |
+ * The original kpage may be unlocked inside | |
+ * stable_tree_insert() already. This page | |
+ * should be unlocked before doing | |
+ * break_cow(). | |
+ */ | |
+ unlock_page(kpage); | |
+ | |
+ if (!success1) | |
+ break_cow(rmap_item); | |
+ | |
+ if (!success2) | |
+ break_cow(tree_rmap_item); | |
+ | |
+ } else if (err == MERGE_ERR_COLLI) { | |
+ BUG_ON(tree_rmap_item->tree_node->count > 1); | |
+ | |
+ rmap_item_hash_max(tree_rmap_item, | |
+ tree_rmap_item->tree_node->hash); | |
+ | |
+ hash_max = rmap_item_hash_max(rmap_item, hash); | |
+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); | |
+ parent = &tree_rmap_item->node; | |
+ if (cmp < 0) | |
+ new = &parent->rb_left; | |
+ else if (cmp > 0) | |
+ new = &parent->rb_right; | |
+ else | |
+ goto put_up_out; | |
+ | |
+ rmap_item->tree_node = tree_rmap_item->tree_node; | |
+ rmap_item->address |= UNSTABLE_FLAG; | |
+ rmap_item->hash_round = uksm_hash_round; | |
+ rb_link_node(&rmap_item->node, parent, new); | |
+ rb_insert_color(&rmap_item->node, | |
+ &tree_rmap_item->tree_node->sub_root); | |
+ rmap_item->tree_node->count++; | |
+ } else { | |
+ /* | |
+ * Either one of the pages has changed or they collide | |
+ * at the max hash; we consider them ill items. | |
+ */ | |
+ remove_rmap_item_from_tree(tree_rmap_item); | |
+ } | |
+put_up_out: | |
+ put_page(tree_rmap_item->page); | |
+ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); | |
+ } | |
+} | |
+ | |
+ | |
+ | |
+ | |
+static inline unsigned long get_pool_index(struct vma_slot *slot, | |
+ unsigned long index) | |
+{ | |
+ unsigned long pool_index; | |
+ | |
+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; | |
+ if (pool_index >= slot->pool_size) | |
+ BUG(); | |
+ return pool_index; | |
+} | |
+ | |
+static inline unsigned long index_page_offset(unsigned long index) | |
+{ | |
+ return offset_in_page(sizeof(struct rmap_list_entry *) * index); | |
+} | |
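+ | |
+/* | |
+ * Worked example (assuming 4KiB pages and 8-byte entries): each pool page | |
+ * holds 4096 / 8 = 512 entries, so index 1000 lands in pool page | |
+ * (8 * 1000) >> 12 = 1, at byte offset 8000 - 4096 = 3904 within it. | |
+ */ | |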
+ | |
+static inline | |
+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, | |
+ unsigned long index, int need_alloc) | |
+{ | |
+ unsigned long pool_index; | |
+ struct page *page; | |
+ void *addr; | |
+ | |
+ | |
+ pool_index = get_pool_index(slot, index); | |
+ if (!slot->rmap_list_pool[pool_index]) { | |
+ if (!need_alloc) | |
+ return NULL; | |
+ | |
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); | |
+ if (!page) | |
+ return NULL; | |
+ | |
+ slot->rmap_list_pool[pool_index] = page; | |
+ } | |
+ | |
+ addr = kmap(slot->rmap_list_pool[pool_index]); | |
+ addr += index_page_offset(index); | |
+ | |
+ return addr; | |
+} | |
+ | |
+static inline void put_rmap_list_entry(struct vma_slot *slot, | |
+ unsigned long index) | |
+{ | |
+ unsigned long pool_index; | |
+ | |
+ pool_index = get_pool_index(slot, index); | |
+ BUG_ON(!slot->rmap_list_pool[pool_index]); | |
+ kunmap(slot->rmap_list_pool[pool_index]); | |
+} | |
+ | |
+static inline int entry_is_new(struct rmap_list_entry *entry) | |
+{ | |
+ return !entry->item; | |
+} | |
+ | |
+static inline unsigned long get_index_orig_addr(struct vma_slot *slot, | |
+ unsigned long index) | |
+{ | |
+ return slot->vma->vm_start + (index << PAGE_SHIFT); | |
+} | |
+ | |
+static inline unsigned long get_entry_address(struct rmap_list_entry *entry) | |
+{ | |
+ unsigned long addr; | |
+ | |
+ if (is_addr(entry->addr)) | |
+ addr = get_clean_addr(entry->addr); | |
+ else if (entry->item) | |
+ addr = get_rmap_addr(entry->item); | |
+ else | |
+ BUG(); | |
+ | |
+ return addr; | |
+} | |
+ | |
+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) | |
+{ | |
+ if (is_addr(entry->addr)) | |
+ return NULL; | |
+ | |
+ return entry->item; | |
+} | |
+ | |
+static inline void inc_rmap_list_pool_count(struct vma_slot *slot, | |
+ unsigned long index) | |
+{ | |
+ unsigned long pool_index; | |
+ | |
+ pool_index = get_pool_index(slot, index); | |
+ BUG_ON(!slot->rmap_list_pool[pool_index]); | |
+ slot->pool_counts[pool_index]++; | |
+} | |
+ | |
+static inline void dec_rmap_list_pool_count(struct vma_slot *slot, | |
+ unsigned long index) | |
+{ | |
+ unsigned long pool_index; | |
+ | |
+ pool_index = get_pool_index(slot, index); | |
+ BUG_ON(!slot->rmap_list_pool[pool_index]); | |
+ BUG_ON(!slot->pool_counts[pool_index]); | |
+ slot->pool_counts[pool_index]--; | |
+} | |
+ | |
+static inline int entry_has_rmap(struct rmap_list_entry *entry) | |
+{ | |
+ return !is_addr(entry->addr) && entry->item; | |
+} | |
+ | |
+static inline void swap_entries(struct rmap_list_entry *entry1, | |
+ unsigned long index1, | |
+ struct rmap_list_entry *entry2, | |
+ unsigned long index2) | |
+{ | |
+ struct rmap_list_entry tmp; | |
+ | |
+ /* swapping two new entries is meaningless */ | |
+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); | |
+ | |
+ tmp = *entry1; | |
+ *entry1 = *entry2; | |
+ *entry2 = tmp; | |
+ | |
+ if (entry_has_rmap(entry1)) | |
+ entry1->item->entry_index = index1; | |
+ | |
+ if (entry_has_rmap(entry2)) | |
+ entry2->item->entry_index = index2; | |
+ | |
+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { | |
+ inc_rmap_list_pool_count(entry1->item->slot, index1); | |
+ dec_rmap_list_pool_count(entry1->item->slot, index2); | |
+ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { | |
+ inc_rmap_list_pool_count(entry2->item->slot, index2); | |
+ dec_rmap_list_pool_count(entry2->item->slot, index1); | |
+ } | |
+} | |
+ | |
+static inline void free_entry_item(struct rmap_list_entry *entry) | |
+{ | |
+ unsigned long index; | |
+ struct rmap_item *item; | |
+ | |
+ if (!is_addr(entry->addr)) { | |
+ BUG_ON(!entry->item); | |
+ item = entry->item; | |
+ entry->addr = get_rmap_addr(item); | |
+ set_is_addr(entry->addr); | |
+ index = item->entry_index; | |
+ remove_rmap_item_from_tree(item); | |
+ dec_rmap_list_pool_count(item->slot, index); | |
+ free_rmap_item(item); | |
+ } | |
+} | |
+ | |
+static inline int pool_entry_boundary(unsigned long index) | |
+{ | |
+ unsigned long linear_addr; | |
+ | |
+ linear_addr = sizeof(struct rmap_list_entry *) * index; | |
+ return index && !offset_in_page(linear_addr); | |
+} | |
+ | |
+static inline void try_free_last_pool(struct vma_slot *slot, | |
+ unsigned long index) | |
+{ | |
+ unsigned long pool_index; | |
+ | |
+ pool_index = get_pool_index(slot, index); | |
+ if (slot->rmap_list_pool[pool_index] && | |
+ !slot->pool_counts[pool_index]) { | |
+ __free_page(slot->rmap_list_pool[pool_index]); | |
+ slot->rmap_list_pool[pool_index] = NULL; | |
+ slot->flags |= UKSM_SLOT_NEED_SORT; | |
+ } | |
+ | |
+} | |
+ | |
+static inline unsigned long vma_item_index(struct vm_area_struct *vma, | |
+ struct rmap_item *item) | |
+{ | |
+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; | |
+} | |
+ | |
+static int within_same_pool(struct vma_slot *slot, | |
+ unsigned long i, unsigned long j) | |
+{ | |
+ unsigned long pool_i, pool_j; | |
+ | |
+ pool_i = get_pool_index(slot, i); | |
+ pool_j = get_pool_index(slot, j); | |
+ | |
+ return (pool_i == pool_j); | |
+} | |
+ | |
+static void sort_rmap_entry_list(struct vma_slot *slot) | |
+{ | |
+ unsigned long i, j; | |
+ struct rmap_list_entry *entry, *swap_entry; | |
+ | |
+ entry = get_rmap_list_entry(slot, 0, 0); | |
+ for (i = 0; i < slot->pages; ) { | |
+ | |
+ if (!entry) | |
+ goto skip_whole_pool; | |
+ | |
+ if (entry_is_new(entry)) | |
+ goto next_entry; | |
+ | |
+ if (is_addr(entry->addr)) { | |
+ entry->addr = 0; | |
+ goto next_entry; | |
+ } | |
+ | |
+ j = vma_item_index(slot->vma, entry->item); | |
+ if (j == i) | |
+ goto next_entry; | |
+ | |
+ if (within_same_pool(slot, i, j)) | |
+ swap_entry = entry + j - i; | |
+ else | |
+ swap_entry = get_rmap_list_entry(slot, j, 1); | |
+ | |
+ swap_entries(entry, i, swap_entry, j); | |
+ if (!within_same_pool(slot, i, j)) | |
+ put_rmap_list_entry(slot, j); | |
+ continue; | |
+ | |
+skip_whole_pool: | |
+ i += PAGE_SIZE / sizeof(*entry); | |
+ if (i < slot->pages) | |
+ entry = get_rmap_list_entry(slot, i, 0); | |
+ continue; | |
+ | |
+next_entry: | |
+ if (i >= slot->pages - 1 || | |
+ !within_same_pool(slot, i, i + 1)) { | |
+ put_rmap_list_entry(slot, i); | |
+ if (i + 1 < slot->pages) | |
+ entry = get_rmap_list_entry(slot, i + 1, 0); | |
+ } else | |
+ entry++; | |
+ i++; | |
+ continue; | |
+ } | |
+ | |
+ /* free empty pool entries which contain no rmap_item */ | |
+ /* CAN be simplified to rely only on pool_counts once known bug-free !!!!! */ | |
+ for (i = 0; i < slot->pool_size; i++) { | |
+ unsigned char has_rmap; | |
+ void *addr; | |
+ | |
+ if (!slot->rmap_list_pool[i]) | |
+ continue; | |
+ | |
+ has_rmap = 0; | |
+ addr = kmap(slot->rmap_list_pool[i]); | |
+ BUG_ON(!addr); | |
+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { | |
+ entry = (struct rmap_list_entry *)addr + j; | |
+ if (is_addr(entry->addr)) | |
+ continue; | |
+ if (!entry->item) | |
+ continue; | |
+ has_rmap = 1; | |
+ } | |
+ kunmap(slot->rmap_list_pool[i]); | |
+ if (!has_rmap) { | |
+ BUG_ON(slot->pool_counts[i]); | |
+ __free_page(slot->rmap_list_pool[i]); | |
+ slot->rmap_list_pool[i] = NULL; | |
+ } | |
+ } | |
+ | |
+ slot->flags &= ~UKSM_SLOT_NEED_SORT; | |
+} | |
+ | |
+/* | |
+ * vma_fully_scanned() - if all the pages in this slot have been scanned. | |
+ */ | |
+static inline int vma_fully_scanned(struct vma_slot *slot) | |
+{ | |
+ return slot->pages_scanned == slot->pages; | |
+} | |
+ | |
+/** | |
+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to | |
+ * its random permutation. The random-permutation index management code | |
+ * is embedded in this function. | |
+ */ | |
+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) | |
+{ | |
+ unsigned long rand_range, addr, swap_index, scan_index; | |
+ struct rmap_item *item = NULL; | |
+ struct rmap_list_entry *scan_entry, *swap_entry = NULL; | |
+ struct page *page; | |
+ | |
+ scan_index = swap_index = slot->pages_scanned % slot->pages; | |
+ | |
+ if (pool_entry_boundary(scan_index)) | |
+ try_free_last_pool(slot, scan_index - 1); | |
+ | |
+ if (vma_fully_scanned(slot)) { | |
+ if (slot->flags & UKSM_SLOT_NEED_SORT) | |
+ slot->flags |= UKSM_SLOT_NEED_RERAND; | |
+ else | |
+ slot->flags &= ~UKSM_SLOT_NEED_RERAND; | |
+ if (slot->flags & UKSM_SLOT_NEED_SORT) | |
+ sort_rmap_entry_list(slot); | |
+ } | |
+ | |
+ scan_entry = get_rmap_list_entry(slot, scan_index, 1); | |
+ if (!scan_entry) | |
+ return NULL; | |
+ | |
+ if (entry_is_new(scan_entry)) { | |
+ scan_entry->addr = get_index_orig_addr(slot, scan_index); | |
+ set_is_addr(scan_entry->addr); | |
+ } | |
+ | |
+ if (slot->flags & UKSM_SLOT_NEED_RERAND) { | |
+ rand_range = slot->pages - scan_index; | |
+ BUG_ON(!rand_range); | |
+ swap_index = scan_index + (prandom_u32() % rand_range); | |
+ } | |
+ | |
+ if (swap_index != scan_index) { | |
+ swap_entry = get_rmap_list_entry(slot, swap_index, 1); | |
+ if (entry_is_new(swap_entry)) { | |
+ swap_entry->addr = get_index_orig_addr(slot, | |
+ swap_index); | |
+ set_is_addr(swap_entry->addr); | |
+ } | |
+ swap_entries(scan_entry, scan_index, swap_entry, swap_index); | |
+ } | |
+ | |
+ addr = get_entry_address(scan_entry); | |
+ item = get_entry_item(scan_entry); | |
+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); | |
+ | |
+ page = follow_page(slot->vma, addr, FOLL_GET); | |
+ if (IS_ERR_OR_NULL(page)) | |
+ goto nopage; | |
+ | |
+ if (!PageAnon(page) && !page_trans_compound_anon(page)) | |
+ goto putpage; | |
+ | |
+ /* check whether it is the zero_page pfn or uksm_zero_page */ | |
+ if ((page_to_pfn(page) == zero_pfn) | |
+ || (page_to_pfn(page) == uksm_zero_pfn)) | |
+ goto putpage; | |
+ | |
+ flush_anon_page(slot->vma, page, addr); | |
+ flush_dcache_page(page); | |
+ | |
+ | |
+ *hash = page_hash(page, hash_strength, 1); | |
+ inc_uksm_pages_scanned(); | |
+ /* if the page content is all zero, re-map it to the zero page */ | |
+ if (find_zero_page_hash(hash_strength, *hash)) { | |
+ if (!cmp_and_merge_zero_page(slot->vma, page)) { | |
+ slot->pages_merged++; | |
+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); | |
+ dec_mm_counter(slot->mm, MM_ANONPAGES); | |
+ | |
+ /* For full-zero pages, no need to create rmap item */ | |
+ goto putpage; | |
+ } else { | |
+ inc_rshash_neg(memcmp_cost / 2); | |
+ } | |
+ } | |
+ | |
+ if (!item) { | |
+ item = alloc_rmap_item(); | |
+ if (item) { | |
+ /* It has already been zeroed */ | |
+ item->slot = slot; | |
+ item->address = addr; | |
+ item->entry_index = scan_index; | |
+ scan_entry->item = item; | |
+ inc_rmap_list_pool_count(slot, scan_index); | |
+ } else | |
+ goto putpage; | |
+ } | |
+ | |
+ BUG_ON(item->slot != slot); | |
+ /* the page may have changed */ | |
+ item->page = page; | |
+ put_rmap_list_entry(slot, scan_index); | |
+ if (swap_entry) | |
+ put_rmap_list_entry(slot, swap_index); | |
+ return item; | |
+ | |
+putpage: | |
+ put_page(page); | |
+ page = NULL; | |
+nopage: | |
+ /* no page, store addr back and free rmap_item if possible */ | |
+ free_entry_item(scan_entry); | |
+ put_rmap_list_entry(slot, scan_index); | |
+ if (swap_entry) | |
+ put_rmap_list_entry(slot, swap_index); | |
+ return NULL; | |
+} | |
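+ | |
+/* | |
+ * Note, for illustration: when UKSM_SLOT_NEED_RERAND is set, the random | |
+ * swap in get_next_rmap_item() above (picking swap_index uniformly from | |
+ * [scan_index, pages) and swapping the two entries) is one step of a | |
+ * Fisher-Yates shuffle, so the entries are visited in a random | |
+ * permutation as the slot is scanned. | |
+ */ | |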
+ | |
+static inline int in_stable_tree(struct rmap_item *rmap_item) | |
+{ | |
+ return rmap_item->address & STABLE_FLAG; | |
+} | |
+ | |
+/** | |
+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with | |
+ * mmap_sem locked. | |
+ */ | |
+static noinline void scan_vma_one_page(struct vma_slot *slot) | |
+{ | |
+ u32 hash; | |
+ struct mm_struct *mm; | |
+ struct rmap_item *rmap_item = NULL; | |
+ struct vm_area_struct *vma = slot->vma; | |
+ | |
+ mm = vma->vm_mm; | |
+ BUG_ON(!mm); | |
+ BUG_ON(!slot); | |
+ | |
+ rmap_item = get_next_rmap_item(slot, &hash); | |
+ if (!rmap_item) | |
+ goto out1; | |
+ | |
+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) | |
+ goto out2; | |
+ | |
+ cmp_and_merge_page(rmap_item, hash); | |
+out2: | |
+ put_page(rmap_item->page); | |
+out1: | |
+ slot->pages_scanned++; | |
+ if (slot->fully_scanned_round != fully_scanned_round) | |
+ scanned_virtual_pages++; | |
+ | |
+ if (vma_fully_scanned(slot)) | |
+ slot->fully_scanned_round = fully_scanned_round; | |
+} | |
+ | |
+static inline unsigned long rung_get_pages(struct scan_rung *rung) | |
+{ | |
+ struct slot_tree_node *node; | |
+ | |
+ if (!rung->vma_root.rnode) | |
+ return 0; | |
+ | |
+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); | |
+ | |
+ return node->size; | |
+} | |
+ | |
+#define RUNG_SAMPLED_MIN 3 | |
+ | |
+static inline | |
+void uksm_calc_rung_step(struct scan_rung *rung, | |
+ unsigned long page_time, unsigned long ratio) | |
+{ | |
+ unsigned long sampled, pages; | |
+ | |
+ /* will be fully scanned ? */ | |
+ if (!rung->cover_msecs) { | |
+ rung->step = 1; | |
+ return; | |
+ } | |
+ | |
+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) | |
+ * ratio / page_time; | |
+ | |
+ /* | |
+ * Before we finish a scan round and its expensive per-round jobs, | |
+ * we need a chance to estimate the per-page time. So the | |
+ * sampled number cannot be too small. | |
+ */ | |
+ if (sampled < RUNG_SAMPLED_MIN) | |
+ sampled = RUNG_SAMPLED_MIN; | |
+ | |
+ pages = rung_get_pages(rung); | |
+ if (likely(pages > sampled)) | |
+ rung->step = pages / sampled; | |
+ else | |
+ rung->step = 1; | |
+} | |
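+ | |
+/* | |
+ * For illustration: the step is chosen so that one pass over a rung's | |
+ * sampled pages takes about cover_msecs of scan time: | |
+ *   sampled ~= (scan-time budget per cover period) / page_time | |
+ *   step    ~= pages / sampled | |
+ * so a larger rung, or a slower per-page time, gives a larger step | |
+ * (sparser sampling) at the same cpu ratio. | |
+ */ | |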
+ | |
+static inline int step_need_recalc(struct scan_rung *rung) | |
+{ | |
+ unsigned long pages, stepmax; | |
+ | |
+ pages = rung_get_pages(rung); | |
+ stepmax = pages / RUNG_SAMPLED_MIN; | |
+ | |
+ return pages && (rung->step > pages || | |
+ (stepmax && rung->step > stepmax)); | |
+} | |
+ | |
+static inline | |
+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) | |
+{ | |
+ struct vma_slot *slot; | |
+ | |
+ if (finished) | |
+ rung->flags |= UKSM_RUNG_ROUND_FINISHED; | |
+ | |
+ if (step_recalc || step_need_recalc(rung)) { | |
+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); | |
+ BUG_ON(step_need_recalc(rung)); | |
+ } | |
+ | |
+ slot_iter_index = prandom_u32() % rung->step; | |
+ BUG_ON(!rung->vma_root.rnode); | |
+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); | |
+ BUG_ON(!slot); | |
+ | |
+ rung->current_scan = slot; | |
+ rung->current_offset = slot_iter_index; | |
+} | |
+ | |
+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) | |
+{ | |
+ return &slot->rung->vma_root; | |
+} | |
+ | |
+/* | |
+ * Return 1 if the scan position was reset, 0 otherwise. | |
+ */ | |
+static int advance_current_scan(struct scan_rung *rung) | |
+{ | |
+ unsigned short n; | |
+ struct vma_slot *slot, *next = NULL; | |
+ | |
+ BUG_ON(!rung->vma_root.num); | |
+ | |
+ slot = rung->current_scan; | |
+ n = (slot->pages - rung->current_offset) % rung->step; | |
+ slot_iter_index = rung->step - n; | |
+ next = sradix_tree_next(&rung->vma_root, slot->snode, | |
+ slot->sindex, slot_iter); | |
+ | |
+ if (next) { | |
+ rung->current_offset = slot_iter_index; | |
+ rung->current_scan = next; | |
+ return 0; | |
+ } else { | |
+ reset_current_scan(rung, 1, 0); | |
+ return 1; | |
+ } | |
+} | |
+ | |
+static inline void rung_rm_slot(struct vma_slot *slot) | |
+{ | |
+ struct scan_rung *rung = slot->rung; | |
+ struct sradix_tree_root *root; | |
+ | |
+ if (rung->current_scan == slot) | |
+ advance_current_scan(rung); | |
+ | |
+ root = slot_get_root(slot); | |
+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); | |
+ slot->snode = NULL; | |
+ if (step_need_recalc(rung)) { | |
+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); | |
+ BUG_ON(step_need_recalc(rung)); | |
+ } | |
+ | |
+ /* In case advance_current_scan() looped back to this slot again */ | |
+ if (rung->vma_root.num && rung->current_scan == slot) | |
+ reset_current_scan(slot->rung, 1, 0); | |
+} | |
+ | |
+static inline void rung_add_new_slots(struct scan_rung *rung, | |
+ struct vma_slot **slots, unsigned long num) | |
+{ | |
+ int err; | |
+ struct vma_slot *slot; | |
+ unsigned long i; | |
+ struct sradix_tree_root *root = &rung->vma_root; | |
+ | |
+ err = sradix_tree_enter(root, (void **)slots, num); | |
+ BUG_ON(err); | |
+ | |
+ for (i = 0; i < num; i++) { | |
+ slot = slots[i]; | |
+ slot->rung = rung; | |
+ BUG_ON(vma_fully_scanned(slot)); | |
+ } | |
+ | |
+ if (rung->vma_root.num == num) | |
+ reset_current_scan(rung, 0, 1); | |
+} | |
+ | |
+static inline int rung_add_one_slot(struct scan_rung *rung, | |
+ struct vma_slot *slot) | |
+{ | |
+ int err; | |
+ | |
+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); | |
+ if (err) | |
+ return err; | |
+ | |
+ slot->rung = rung; | |
+ if (rung->vma_root.num == 1) | |
+ reset_current_scan(rung, 0, 1); | |
+ | |
+ return 0; | |
+} | |
+ | |
+/* | |
+ * Return true if the slot is deleted from its rung. | |
+ */ | |
+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) | |
+{ | |
+ struct scan_rung *old_rung = slot->rung; | |
+ int err; | |
+ | |
+ if (old_rung == rung) | |
+ return 0; | |
+ | |
+ rung_rm_slot(slot); | |
+ err = rung_add_one_slot(rung, slot); | |
+ if (err) { | |
+ err = rung_add_one_slot(old_rung, slot); | |
+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ | |
+ } | |
+ | |
+ return 1; | |
+} | |
+ | |
+static inline int vma_rung_up(struct vma_slot *slot) | |
+{ | |
+ struct scan_rung *rung; | |
+ | |
+ rung = slot->rung; | |
+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) | |
+ rung++; | |
+ | |
+ return vma_rung_enter(slot, rung); | |
+} | |
+ | |
+static inline int vma_rung_down(struct vma_slot *slot) | |
+{ | |
+ struct scan_rung *rung; | |
+ | |
+ rung = slot->rung; | |
+ if (slot->rung != &uksm_scan_ladder[0]) | |
+ rung--; | |
+ | |
+ return vma_rung_enter(slot, rung); | |
+} | |
+ | |
+/** | |
+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. | |
+ */ | |
+static unsigned long cal_dedup_ratio(struct vma_slot *slot) | |
+{ | |
+ unsigned long ret; | |
+ | |
+ BUG_ON(slot->pages_scanned == slot->last_scanned); | |
+ | |
+ ret = slot->pages_merged; | |
+ | |
+ /* Thrashing area filtering */ | |
+ if (ret && uksm_thrash_threshold) { | |
+ if (slot->pages_cowed * 100 / slot->pages_merged | |
+ > uksm_thrash_threshold) { | |
+ ret = 0; | |
+ } else { | |
+ ret = slot->pages_merged - slot->pages_cowed; | |
+ } | |
+ } | |
+ | |
+ return ret; | |
+} | |
+ | |
+/** | |
+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot. | |
+ */ | |
+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) | |
+{ | |
+ unsigned long ret; | |
+ unsigned long pages_scanned; | |
+ | |
+ pages_scanned = slot->pages_scanned; | |
+ if (!pages_scanned) { | |
+ if (uksm_thrash_threshold) | |
+ return 0; | |
+ else | |
+ pages_scanned = slot->pages_scanned; | |
+ } | |
+ | |
+ ret = slot->pages_bemerged * 100 / pages_scanned; | |
+ | |
+ /* Thrashing area filtering */ | |
+ if (ret && uksm_thrash_threshold) { | |
+ if (slot->pages_cowed * 100 / slot->pages_bemerged | |
+ > uksm_thrash_threshold) { | |
+ ret = 0; | |
+ } else { | |
+ ret = slot->pages_bemerged - slot->pages_cowed; | |
+ } | |
+ } | |
+ | |
+ return ret; | |
+} | |
+ | |
+/** | |
+ * stable_node_reinsert() - When the hash_strength has been adjusted, the | |
+ * stable tree needs to be restructured; this function re-inserts a | |
+ * stable node into the new tree. | |
+ */ | |
+static inline void stable_node_reinsert(struct stable_node *new_node, | |
+ struct page *page, | |
+ struct rb_root *root_treep, | |
+ struct list_head *tree_node_listp, | |
+ u32 hash) | |
+{ | |
+ struct rb_node **new = &root_treep->rb_node; | |
+ struct rb_node *parent = NULL; | |
+ struct stable_node *stable_node; | |
+ struct tree_node *tree_node; | |
+ struct page *tree_page; | |
+ int cmp; | |
+ | |
+ while (*new) { | |
+ int cmp; | |
+ | |
+ tree_node = rb_entry(*new, struct tree_node, node); | |
+ | |
+ cmp = hash_cmp(hash, tree_node->hash); | |
+ | |
+ if (cmp < 0) { | |
+ parent = *new; | |
+ new = &parent->rb_left; | |
+ } else if (cmp > 0) { | |
+ parent = *new; | |
+ new = &parent->rb_right; | |
+ } else | |
+ break; | |
+ } | |
+ | |
+ if (*new) { | |
+ /* find a stable tree node with same first level hash value */ | |
+ stable_node_hash_max(new_node, page, hash); | |
+ if (tree_node->count == 1) { | |
+ stable_node = rb_entry(tree_node->sub_root.rb_node, | |
+ struct stable_node, node); | |
+ tree_page = get_uksm_page(stable_node, 1, 0); | |
+ if (tree_page) { | |
+ stable_node_hash_max(stable_node, | |
+ tree_page, hash); | |
+ put_page(tree_page); | |
+ | |
+ /* prepare for stable node insertion */ | |
+ | |
+ cmp = hash_cmp(new_node->hash_max, | |
+ stable_node->hash_max); | |
+ parent = &stable_node->node; | |
+ if (cmp < 0) | |
+ new = &parent->rb_left; | |
+ else if (cmp > 0) | |
+ new = &parent->rb_right; | |
+ else | |
+ goto failed; | |
+ | |
+ goto add_node; | |
+ } else { | |
+ /* the only stable_node was deleted, but the | |
+ * tree node was not. | |
+ */ | |
+ goto tree_node_reuse; | |
+ } | |
+ } | |
+ | |
+ /* well, search the collision subtree */ | |
+ new = &tree_node->sub_root.rb_node; | |
+ parent = NULL; | |
+ BUG_ON(!*new); | |
+ while (*new) { | |
+ int cmp; | |
+ | |
+ stable_node = rb_entry(*new, struct stable_node, node); | |
+ | |
+ cmp = hash_cmp(new_node->hash_max, | |
+ stable_node->hash_max); | |
+ | |
+ if (cmp < 0) { | |
+ parent = *new; | |
+ new = &parent->rb_left; | |
+ } else if (cmp > 0) { | |
+ parent = *new; | |
+ new = &parent->rb_right; | |
+ } else { | |
+ /* oh, no, still a collision */ | |
+ goto failed; | |
+ } | |
+ } | |
+ | |
+ goto add_node; | |
+ } | |
+ | |
+ /* no tree node found */ | |
+ tree_node = alloc_tree_node(tree_node_listp); | |
+ if (!tree_node) { | |
+ printk(KERN_ERR "UKSM: memory allocation error!\n"); | |
+ goto failed; | |
+ } else { | |
+ tree_node->hash = hash; | |
+ rb_link_node(&tree_node->node, parent, new); | |
+ rb_insert_color(&tree_node->node, root_treep); | |
+ | |
+tree_node_reuse: | |
+ /* prepare for stable node insertion */ | |
+ parent = NULL; | |
+ new = &tree_node->sub_root.rb_node; | |
+ } | |
+ | |
+add_node: | |
+ rb_link_node(&new_node->node, parent, new); | |
+ rb_insert_color(&new_node->node, &tree_node->sub_root); | |
+ new_node->tree_node = tree_node; | |
+ tree_node->count++; | |
+ return; | |
+ | |
+failed: | |
+ /* This can only happen when two nodes have collided | |
+ * at both levels. | |
+ */ | |
+ new_node->tree_node = NULL; | |
+ return; | |
+} | |
+ | |
+static inline void free_all_tree_nodes(struct list_head *list) | |
+{ | |
+ struct tree_node *node, *tmp; | |
+ | |
+ list_for_each_entry_safe(node, tmp, list, all_list) { | |
+ free_tree_node(node); | |
+ } | |
+} | |
+ | |
+/** | |
+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash | |
+ * strength to the current hash_strength. It restructures the whole tree. | |
+ */ | |
+static inline void stable_tree_delta_hash(u32 prev_hash_strength) | |
+{ | |
+ struct stable_node *node, *tmp; | |
+ struct rb_root *root_new_treep; | |
+ struct list_head *new_tree_node_listp; | |
+ | |
+ stable_tree_index = (stable_tree_index + 1) % 2; | |
+ root_new_treep = &root_stable_tree[stable_tree_index]; | |
+ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; | |
+ *root_new_treep = RB_ROOT; | |
+ BUG_ON(!list_empty(new_tree_node_listp)); | |
+ | |
+ /* | |
+ * we need to be safe, the node could be removed by get_uksm_page() | |
+ */ | |
+ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { | |
+ void *addr; | |
+ struct page *node_page; | |
+ u32 hash; | |
+ | |
+ /* | |
+ * We are completely re-structuring the stable nodes to a new | |
+ * stable tree. We don't want to touch the old tree unlinks and | |
+ * old tree_nodes. The old tree_nodes will be freed at once. | |
+ */ | |
+ node_page = get_uksm_page(node, 0, 0); | |
+ if (!node_page) | |
+ continue; | |
+ | |
+ if (node->tree_node) { | |
+ hash = node->tree_node->hash; | |
+ | |
+ addr = kmap_atomic(node_page); | |
+ | |
+ hash = delta_hash(addr, prev_hash_strength, | |
+ hash_strength, hash); | |
+ kunmap_atomic(addr); | |
+ } else { | |
+ /* | |
+ * It was not inserted into the rbtree due to a collision in | |
+ * the last scan round. | |
+ */ | |
+ hash = page_hash(node_page, hash_strength, 0); | |
+ } | |
+ | |
+ stable_node_reinsert(node, node_page, root_new_treep, | |
+ new_tree_node_listp, hash); | |
+ put_page(node_page); | |
+ } | |
+ | |
+ root_stable_treep = root_new_treep; | |
+ free_all_tree_nodes(stable_tree_node_listp); | |
+ BUG_ON(!list_empty(stable_tree_node_listp)); | |
+ stable_tree_node_listp = new_tree_node_listp; | |
+} | |
+ | |
+static inline void inc_hash_strength(unsigned long delta) | |
+{ | |
+ hash_strength += 1 << delta; | |
+ if (hash_strength > HASH_STRENGTH_MAX) | |
+ hash_strength = HASH_STRENGTH_MAX; | |
+} | |
+ | |
+static inline void dec_hash_strength(unsigned long delta) | |
+{ | |
+ unsigned long change = 1 << delta; | |
+ | |
+ if (hash_strength <= change + 1) | |
+ hash_strength = 1; | |
+ else | |
+ hash_strength -= change; | |
+} | |
+ | |
+static inline void inc_hash_strength_delta(void) | |
+{ | |
+ hash_strength_delta++; | |
+ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) | |
+ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; | |
+} | |
+ | |
+/* | |
+static inline unsigned long get_current_neg_ratio(void) | |
+{ | |
+ if (!rshash_pos || rshash_neg > rshash_pos) | |
+ return 100; | |
+ | |
+ return div64_u64(100 * rshash_neg , rshash_pos); | |
+} | |
+*/ | |
+ | |
+static inline unsigned long get_current_neg_ratio(void) | |
+{ | |
+ u64 pos = benefit.pos; | |
+ u64 neg = benefit.neg; | |
+ | |
+ if (!neg) | |
+ return 0; | |
+ | |
+ if (!pos || neg > pos) | |
+ return 100; | |
+ | |
+ if (neg > div64_u64(U64_MAX, 100)) | |
+ pos = div64_u64(pos, 100); | |
+ else | |
+ neg *= 100; | |
+ | |
+ return div64_u64(neg, pos); | |
+} | |
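+ | |
+/* | |
+ * For illustration: the above computes neg * 100 / pos without overflowing | |
+ * u64 -- when neg is too large to be multiplied by 100, pos is divided by | |
+ * 100 instead, which yields the same ratio up to rounding. | |
+ */ | |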
+ | |
+static inline unsigned long get_current_benefit(void) | |
+{ | |
+ u64 pos = benefit.pos; | |
+ u64 neg = benefit.neg; | |
+ u64 scanned = benefit.scanned; | |
+ | |
+ if (neg > pos) | |
+ return 0; | |
+ | |
+ return div64_u64((pos - neg), scanned); | |
+} | |
+ | |
+static inline int judge_rshash_direction(void) | |
+{ | |
+ u64 current_neg_ratio, stable_benefit; | |
+ u64 current_benefit, delta = 0; | |
+ int ret = STILL; | |
+ | |
+ /* Try to probe a value after boot, and in case the system | |
+ stays still for a long time. */ | |
+ if ((fully_scanned_round & 0xFFULL) == 10) { | |
+ ret = OBSCURE; | |
+ goto out; | |
+ } | |
+ | |
+ current_neg_ratio = get_current_neg_ratio(); | |
+ | |
+ if (current_neg_ratio == 0) { | |
+ rshash_neg_cont_zero++; | |
+ if (rshash_neg_cont_zero > 2) | |
+ return GO_DOWN; | |
+ else | |
+ return STILL; | |
+ } | |
+ rshash_neg_cont_zero = 0; | |
+ | |
+ if (current_neg_ratio > 90) { | |
+ ret = GO_UP; | |
+ goto out; | |
+ } | |
+ | |
+ current_benefit = get_current_benefit(); | |
+ stable_benefit = rshash_state.stable_benefit; | |
+ | |
+ if (!stable_benefit) { | |
+ ret = OBSCURE; | |
+ goto out; | |
+ } | |
+ | |
+ if (current_benefit > stable_benefit) | |
+ delta = current_benefit - stable_benefit; | |
+ else if (current_benefit < stable_benefit) | |
+ delta = stable_benefit - current_benefit; | |
+ | |
+ delta = div64_u64(100 * delta , stable_benefit); | |
+ | |
+ if (delta > 50) { | |
+ rshash_cont_obscure++; | |
+ if (rshash_cont_obscure > 2) | |
+ return OBSCURE; | |
+ else | |
+ return STILL; | |
+ } | |
+ | |
+out: | |
+ rshash_cont_obscure = 0; | |
+ return ret; | |
+} | |
+ | |
+/** | |
+ * rshash_adjust() - The main function to control the random sampling state | |
+ * machine for adapting the hash strength. | |
+ * | |
+ * return true if hash_strength has changed. | |
+ */ | |
+static inline int rshash_adjust(void) | |
+{ | |
+ unsigned long prev_hash_strength = hash_strength; | |
+ | |
+ if (!encode_benefit()) | |
+ return 0; | |
+ | |
+ switch (rshash_state.state) { | |
+ case RSHASH_STILL: | |
+ switch (judge_rshash_direction()) { | |
+ case GO_UP: | |
+ if (rshash_state.pre_direct == GO_DOWN) | |
+ hash_strength_delta = 0; | |
+ | |
+ inc_hash_strength(hash_strength_delta); | |
+ inc_hash_strength_delta(); | |
+ rshash_state.stable_benefit = get_current_benefit(); | |
+ rshash_state.pre_direct = GO_UP; | |
+ break; | |
+ | |
+ case GO_DOWN: | |
+ if (rshash_state.pre_direct == GO_UP) | |
+ hash_strength_delta = 0; | |
+ | |
+ dec_hash_strength(hash_strength_delta); | |
+ inc_hash_strength_delta(); | |
+ rshash_state.stable_benefit = get_current_benefit(); | |
+ rshash_state.pre_direct = GO_DOWN; | |
+ break; | |
+ | |
+ case OBSCURE: | |
+ rshash_state.stable_point = hash_strength; | |
+ rshash_state.turn_point_down = hash_strength; | |
+ rshash_state.turn_point_up = hash_strength; | |
+ rshash_state.turn_benefit_down = get_current_benefit(); | |
+ rshash_state.turn_benefit_up = get_current_benefit(); | |
+ rshash_state.lookup_window_index = 0; | |
+ rshash_state.state = RSHASH_TRYDOWN; | |
+ dec_hash_strength(hash_strength_delta); | |
+ inc_hash_strength_delta(); | |
+ break; | |
+ | |
+ case STILL: | |
+ break; | |
+ default: | |
+ BUG(); | |
+ } | |
+ break; | |
+ | |
+ case RSHASH_TRYDOWN: | |
+ if (rshash_state.lookup_window_index++ % 5 == 0) | |
+ rshash_state.below_count = 0; | |
+ | |
+ if (get_current_benefit() < rshash_state.stable_benefit) | |
+ rshash_state.below_count++; | |
+ else if (get_current_benefit() > | |
+ rshash_state.turn_benefit_down) { | |
+ rshash_state.turn_point_down = hash_strength; | |
+ rshash_state.turn_benefit_down = get_current_benefit(); | |
+ } | |
+ | |
+ if (rshash_state.below_count >= 3 || | |
+ judge_rshash_direction() == GO_UP || | |
+ hash_strength == 1) { | |
+ hash_strength = rshash_state.stable_point; | |
+ hash_strength_delta = 0; | |
+ inc_hash_strength(hash_strength_delta); | |
+ inc_hash_strength_delta(); | |
+ rshash_state.lookup_window_index = 0; | |
+ rshash_state.state = RSHASH_TRYUP; | |
+ hash_strength_delta = 0; | |
+ } else { | |
+ dec_hash_strength(hash_strength_delta); | |
+ inc_hash_strength_delta(); | |
+ } | |
+ break; | |
+ | |
+ case RSHASH_TRYUP: | |
+ if (rshash_state.lookup_window_index++ % 5 == 0) | |
+ rshash_state.below_count = 0; | |
+ | |
+ if (get_current_benefit() < rshash_state.turn_benefit_down) | |
+ rshash_state.below_count++; | |
+ else if (get_current_benefit() > rshash_state.turn_benefit_up) { | |
+ rshash_state.turn_point_up = hash_strength; | |
+ rshash_state.turn_benefit_up = get_current_benefit(); | |
+ } | |
+ | |
+ if (rshash_state.below_count >= 3 || | |
+ judge_rshash_direction() == GO_DOWN || | |
+ hash_strength == HASH_STRENGTH_MAX) { | |
+ hash_strength = rshash_state.turn_benefit_up > | |
+ rshash_state.turn_benefit_down ? | |
+ rshash_state.turn_point_up : | |
+ rshash_state.turn_point_down; | |
+ | |
+ rshash_state.state = RSHASH_PRE_STILL; | |
+ } else { | |
+ inc_hash_strength(hash_strength_delta); | |
+ inc_hash_strength_delta(); | |
+ } | |
+ | |
+ break; | |
+ | |
+ case RSHASH_NEW: | |
+ case RSHASH_PRE_STILL: | |
+ rshash_state.stable_benefit = get_current_benefit(); | |
+ rshash_state.state = RSHASH_STILL; | |
+ hash_strength_delta = 0; | |
+ break; | |
+ default: | |
+ BUG(); | |
+ } | |
+ | |
+ /* rshash_neg = rshash_pos = 0; */ | |
+ reset_benefit(); | |
+ | |
+ if (prev_hash_strength != hash_strength) | |
+ stable_tree_delta_hash(prev_hash_strength); | |
+ | |
+ return prev_hash_strength != hash_strength; | |
+} | |
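+ | |
+/* | |
+ * For illustration: rshash_adjust() is a simple hill climb on | |
+ * hash_strength. From RSHASH_STILL it nudges the strength up or down, or, | |
+ * on an OBSCURE verdict, records the current point and probes downwards | |
+ * (RSHASH_TRYDOWN) and then upwards (RSHASH_TRYUP); it finally keeps the | |
+ * turning point with the larger benefit and settles back to RSHASH_STILL | |
+ * via RSHASH_PRE_STILL. | |
+ */ | |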
+ | |
+/** | |
+ * round_update_ladder() - The main function to do update of all the | |
+ * adjustments whenever a scan round is finished. | |
+ */ | |
+static noinline void round_update_ladder(void) | |
+{ | |
+ int i; | |
+ unsigned long dedup; | |
+ struct vma_slot *slot, *tmp_slot; | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; | |
+ } | |
+ | |
+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { | |
+ | |
+ /* the slot may have been removed by rung_rm_slot() when the mm exited */ | |
+ if (slot->snode) { | |
+ dedup = cal_dedup_ratio_old(slot); | |
+ if (dedup && dedup >= uksm_abundant_threshold) | |
+ vma_rung_up(slot); | |
+ } | |
+ | |
+ slot->pages_bemerged = 0; | |
+ slot->pages_cowed = 0; | |
+ | |
+ list_del_init(&slot->dedup_list); | |
+ } | |
+} | |
+ | |
+static void uksm_del_vma_slot(struct vma_slot *slot) | |
+{ | |
+ int i, j; | |
+ struct rmap_list_entry *entry; | |
+ | |
+ if (slot->snode) { | |
+ /* | |
+ * This is unnecessary if the slot just failed when entering | |
+ * the rung. | |
+ */ | |
+ rung_rm_slot(slot); | |
+ } | |
+ | |
+ if (!list_empty(&slot->dedup_list)) | |
+ list_del(&slot->dedup_list); | |
+ | |
+ if (!slot->rmap_list_pool || !slot->pool_counts) { | |
+ /* In case it OOMed in uksm_vma_enter() */ | |
+ goto out; | |
+ } | |
+ | |
+ for (i = 0; i < slot->pool_size; i++) { | |
+ void *addr; | |
+ | |
+ if (!slot->rmap_list_pool[i]) | |
+ continue; | |
+ | |
+ addr = kmap(slot->rmap_list_pool[i]); | |
+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { | |
+ entry = (struct rmap_list_entry *)addr + j; | |
+ if (is_addr(entry->addr)) | |
+ continue; | |
+ if (!entry->item) | |
+ continue; | |
+ | |
+ remove_rmap_item_from_tree(entry->item); | |
+ free_rmap_item(entry->item); | |
+ slot->pool_counts[i]--; | |
+ } | |
+ BUG_ON(slot->pool_counts[i]); | |
+ kunmap(slot->rmap_list_pool[i]); | |
+ __free_page(slot->rmap_list_pool[i]); | |
+ } | |
+ kfree(slot->rmap_list_pool); | |
+ kfree(slot->pool_counts); | |
+ | |
+out: | |
+ slot->rung = NULL; | |
+ if (slot->flags & UKSM_SLOT_IN_UKSM) { | |
+ BUG_ON(uksm_pages_total < slot->pages); | |
+ uksm_pages_total -= slot->pages; | |
+ } | |
+ | |
+ if (slot->fully_scanned_round == fully_scanned_round) | |
+ scanned_virtual_pages -= slot->pages; | |
+ else | |
+ scanned_virtual_pages -= slot->pages_scanned; | |
+ free_vma_slot(slot); | |
+} | |
+ | |
+ | |
+#define SPIN_LOCK_PERIOD 32 | |
+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; | |
+static inline void cleanup_vma_slots(void) | |
+{ | |
+ struct vma_slot *slot; | |
+ int i; | |
+ | |
+ i = 0; | |
+ spin_lock(&vma_slot_list_lock); | |
+ while (!list_empty(&vma_slot_del)) { | |
+ slot = list_entry(vma_slot_del.next, | |
+ struct vma_slot, slot_list); | |
+ list_del(&slot->slot_list); | |
+ cleanup_slots[i++] = slot; | |
+ if (i == SPIN_LOCK_PERIOD) { | |
+ spin_unlock(&vma_slot_list_lock); | |
+ while (--i >= 0) | |
+ uksm_del_vma_slot(cleanup_slots[i]); | |
+ i = 0; | |
+ spin_lock(&vma_slot_list_lock); | |
+ } | |
+ } | |
+ spin_unlock(&vma_slot_list_lock); | |
+ | |
+ while (--i >= 0) | |
+ uksm_del_vma_slot(cleanup_slots[i]); | |
+} | |
+ | |
+/* | |
+ * exponential moving average formula | |
+ */ | |
+static inline unsigned long ema(unsigned long curr, unsigned long last_ema) | |
+{ | |
+ /* | |
+ * For a very high burst, even the ema cannot work well, a false very | |
+ * high per-page time estimation can feed back into very high | |
+ * overhead of context switch and rung update -- this will then lead | |
+ * to an even higher per-page time, which may not converge. | |
+ * | |
+ * Instead, we try to approach this value in a binary manner. | |
+ */ | |
+ if (curr > last_ema * 10) | |
+ return last_ema * 2; | |
+ | |
+ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; | |
+} | |
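+ | |
+/* | |
+ * Worked example (with a hypothetical EMA_ALPHA of 20): | |
+ *   ema(150, 100)  = (20 * 150 + 80 * 100) / 100 = 110 | |
+ *   ema(5000, 100) = 200, since 5000 > 100 * 10 takes the binary path | |
+ *                    and returns last_ema * 2. | |
+ */ | |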
+ | |
+/* | |
+ * Convert a cpu ratio (in units of 1/TIME_RATIO_SCALE, configured by the | |
+ * user) to nanoseconds of scan time, based on the current uksm_sleep_jiffies. | |
+ */ | |
+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) | |
+{ | |
+ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / | |
+ (TIME_RATIO_SCALE - ratio) * ratio; | |
+} | |
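+ | |
+/* | |
+ * For illustration: if the scanner should take a fraction | |
+ * ratio / TIME_RATIO_SCALE of cpu time and sleeps uksm_sleep_jiffies | |
+ * between bursts, then the burst length must be | |
+ *   sleep_ns * ratio / (TIME_RATIO_SCALE - ratio) | |
+ * which is what the expression above evaluates (it divides by | |
+ * (TIME_RATIO_SCALE - ratio) before multiplying by ratio). | |
+ */ | |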
+ | |
+ | |
+static inline unsigned long rung_real_ratio(int cpu_time_ratio) | |
+{ | |
+ unsigned long ret; | |
+ | |
+ BUG_ON(!cpu_time_ratio); | |
+ | |
+ if (cpu_time_ratio > 0) | |
+ ret = cpu_time_ratio; | |
+ else | |
+ ret = (unsigned long)(-cpu_time_ratio) * | |
+ uksm_max_cpu_percentage / 100UL; | |
+ | |
+ return ret ? ret : 1; | |
+} | |
+ | |
+static noinline void uksm_calc_scan_pages(void) | |
+{ | |
+ struct scan_rung *ladder = uksm_scan_ladder; | |
+ unsigned long sleep_usecs, nsecs; | |
+ unsigned long ratio; | |
+ int i; | |
+ unsigned long per_page; | |
+ | |
+ if (uksm_ema_page_time > 100000 || | |
+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) | |
+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; | |
+ | |
+ per_page = uksm_ema_page_time; | |
+ BUG_ON(!per_page); | |
+ | |
+ /* | |
+ * Every 8 eval rounds, we try to re-probe the uksm_sleep_jiffies value | |
+ * based on the saved user input. | |
+ */ | |
+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) | |
+ uksm_sleep_jiffies = uksm_sleep_saved; | |
+ | |
+ /* We require each rung to scan at least 1 page per period. */ | |
+ nsecs = per_page; | |
+ ratio = rung_real_ratio(ladder[0].cpu_ratio); | |
+ if (cpu_ratio_to_nsec(ratio) < nsecs) { | |
+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio | |
+ / NSEC_PER_USEC; | |
+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; | |
+ } | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ ratio = rung_real_ratio(ladder[i].cpu_ratio); | |
+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / | |
+ per_page; | |
+ BUG_ON(!ladder[i].pages_to_scan); | |
+ uksm_calc_rung_step(&ladder[i], per_page, ratio); | |
+ } | |
+} | |
+ | |
+/* | |
+ * Convert the scan time of this round (ns) to the next expected minimum | |
+ * sleep time (ms), being careful of possible overflows. ratio is taken | |
+ * from rung_real_ratio(). | |
+ */ | |
+static inline | |
+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) | |
+{ | |
+ scan_time >>= 20; /* to msec level now */ | |
+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); | |
+ | |
+ return (unsigned int) ((unsigned long) scan_time * | |
+ (TIME_RATIO_SCALE - ratio) / ratio); | |
+} | |
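+ | |
+/* | |
+ * For illustration: this is the inverse of cpu_ratio_to_nsec() -- given the | |
+ * time actually spent scanning, the sleep needed to keep the cpu share at | |
+ * ratio / TIME_RATIO_SCALE is scan_time * (TIME_RATIO_SCALE - ratio) / ratio, | |
+ * after scan_time is reduced from ns to roughly ms (>> 20 approximates | |
+ * dividing by 10^6). | |
+ */ | |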
+ | |
+#define __round_mask(x, y) ((__typeof__(x))((y)-1)) | |
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) | |
+ | |
+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) | |
+{ | |
+ struct scan_rung *rung; | |
+ | |
+ rung = &uksm_scan_ladder[0]; | |
+ rung_add_new_slots(rung, slots, num); | |
+} | |
+ | |
+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; | |
+ | |
+static void uksm_enter_all_slots(void) | |
+{ | |
+ struct vma_slot *slot; | |
+ unsigned long index; | |
+ struct list_head empty_vma_list; | |
+ int i; | |
+ | |
+ i = 0; | |
+ index = 0; | |
+ INIT_LIST_HEAD(&empty_vma_list); | |
+ | |
+ spin_lock(&vma_slot_list_lock); | |
+ while (!list_empty(&vma_slot_new)) { | |
+ slot = list_entry(vma_slot_new.next, | |
+ struct vma_slot, slot_list); | |
+ | |
+ if (!slot->vma->anon_vma) { | |
+ list_move(&slot->slot_list, &empty_vma_list); | |
+ } else if (vma_can_enter(slot->vma)) { | |
+ batch_slots[index++] = slot; | |
+ list_del_init(&slot->slot_list); | |
+ } else { | |
+ list_move(&slot->slot_list, &vma_slot_noadd); | |
+ } | |
+ | |
+ if (++i == SPIN_LOCK_PERIOD || | |
+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { | |
+ spin_unlock(&vma_slot_list_lock); | |
+ | |
+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { | |
+ uksm_vma_enter(batch_slots, index); | |
+ index = 0; | |
+ } | |
+ i = 0; | |
+ cond_resched(); | |
+ spin_lock(&vma_slot_list_lock); | |
+ } | |
+ } | |
+ | |
+ list_splice(&empty_vma_list, &vma_slot_new); | |
+ | |
+ spin_unlock(&vma_slot_list_lock); | |
+ | |
+ if (index) | |
+ uksm_vma_enter(batch_slots, index); | |
+ | |
+} | |
+ | |
+static inline int rung_round_finished(struct scan_rung *rung) | |
+{ | |
+ return rung->flags & UKSM_RUNG_ROUND_FINISHED; | |
+} | |
+ | |
+static inline void judge_slot(struct vma_slot *slot) | |
+{ | |
+ struct scan_rung *rung = slot->rung; | |
+ unsigned long dedup; | |
+ int deleted; | |
+ | |
+ dedup = cal_dedup_ratio(slot); | |
+ if (vma_fully_scanned(slot) && uksm_thrash_threshold) | |
+ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); | |
+ else if (dedup && dedup >= uksm_abundant_threshold) | |
+ deleted = vma_rung_up(slot); | |
+ else | |
+ deleted = vma_rung_down(slot); | |
+ | |
+ slot->pages_merged = 0; | |
+ slot->pages_cowed = 0; | |
+ | |
+ if (vma_fully_scanned(slot)) | |
+ slot->pages_scanned = 0; | |
+ | |
+ slot->last_scanned = slot->pages_scanned; | |
+ | |
+ /* If it was deleted above, the rung has already been advanced. */ | |
+ if (!deleted) | |
+ advance_current_scan(rung); | |
+} | |
+ | |
+ | |
+static inline int hash_round_finished(void) | |
+{ | |
+ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { | |
+ scanned_virtual_pages = 0; | |
+ if (uksm_pages_scanned) | |
+ fully_scanned_round++; | |
+ | |
+ return 1; | |
+ } else { | |
+ return 0; | |
+ } | |
+} | |
+ | |
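+/* | |
+ * UKSM_MMSEM_BATCH: keep one mm's mmap_sem held across up to this many | |
+ * consecutive page scans of the same slot before dropping it. | |
+ * BUSY_RETRY: when a slot's mmap_sem is contended, advance through at | |
+ * most this many slots looking for a different mm before giving up on | |
+ * this rung for the round. | |
+ */ | |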
+#define UKSM_MMSEM_BATCH 5 | |
+#define BUSY_RETRY 100 | |
+ | |
+/** | |
+ * uksm_do_scan() - the main worker function. | |
+ */ | |
+static noinline void uksm_do_scan(void) | |
+{ | |
+ struct vma_slot *slot, *iter; | |
+ struct mm_struct *busy_mm; | |
+ unsigned char round_finished, all_rungs_empty; | |
+ int i, err, mmsem_batch; | |
+ unsigned long pcost; | |
+ long long delta_exec; | |
+ unsigned long vpages, max_cpu_ratio; | |
+ unsigned long long start_time, end_time, scan_time; | |
+ unsigned int expected_jiffies; | |
+ | |
+ might_sleep(); | |
+ | |
+ vpages = 0; | |
+ | |
+ start_time = task_sched_runtime(current); | |
+ max_cpu_ratio = 0; | |
+ mmsem_batch = 0; | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE;) { | |
+ struct scan_rung *rung = &uksm_scan_ladder[i]; | |
+ unsigned long ratio; | |
+ int busy_retry; | |
+ | |
+ if (!rung->pages_to_scan) { | |
+ i++; | |
+ continue; | |
+ } | |
+ | |
+ if (!rung->vma_root.num) { | |
+ rung->pages_to_scan = 0; | |
+ i++; | |
+ continue; | |
+ } | |
+ | |
+ ratio = rung_real_ratio(rung->cpu_ratio); | |
+ if (ratio > max_cpu_ratio) | |
+ max_cpu_ratio = ratio; | |
+ | |
+ busy_retry = BUSY_RETRY; | |
+ /* | |
+ * Do not consider rung_round_finished() here; just use up the | |
+ * rung->pages_to_scan quota. | |
+ */ | |
+ while (rung->pages_to_scan && rung->vma_root.num && | |
+ likely(!freezing(current))) { | |
+ int reset = 0; | |
+ | |
+ slot = rung->current_scan; | |
+ | |
+ BUG_ON(vma_fully_scanned(slot)); | |
+ | |
+ if (mmsem_batch) { | |
+ err = 0; | |
+ } else { | |
+ err = try_down_read_slot_mmap_sem(slot); | |
+ } | |
+ | |
+ if (err == -ENOENT) { | |
+rm_slot: | |
+ rung_rm_slot(slot); | |
+ continue; | |
+ } | |
+ | |
+ busy_mm = slot->mm; | |
+ | |
+ if (err == -EBUSY) { | |
+ /* skip other vmas on the same mm */ | |
+ do { | |
+ reset = advance_current_scan(rung); | |
+ iter = rung->current_scan; | |
+ busy_retry--; | |
+ if (iter->vma->vm_mm != busy_mm || | |
+ !busy_retry || reset) | |
+ break; | |
+ } while (1); | |
+ | |
+ if (iter->vma->vm_mm != busy_mm) { | |
+ continue; | |
+ } else { | |
+ /* scan round finished */ | |
+ break; | |
+ } | |
+ } | |
+ | |
+ BUG_ON(!vma_can_enter(slot->vma)); | |
+ if (uksm_test_exit(slot->vma->vm_mm)) { | |
+ mmsem_batch = 0; | |
+ up_read(&slot->vma->vm_mm->mmap_sem); | |
+ goto rm_slot; | |
+ } | |
+ | |
+ if (mmsem_batch) | |
+ mmsem_batch--; | |
+ else | |
+ mmsem_batch = UKSM_MMSEM_BATCH; | |
+ | |
+ /* OK, we have taken the mmap_sem, ready to scan */ | |
+ scan_vma_one_page(slot); | |
+ rung->pages_to_scan--; | |
+ vpages++; | |
+ | |
+ if (rung->current_offset + rung->step > slot->pages - 1 | |
+ || vma_fully_scanned(slot)) { | |
+ up_read(&slot->vma->vm_mm->mmap_sem); | |
+ judge_slot(slot); | |
+ mmsem_batch = 0; | |
+ } else { | |
+ rung->current_offset += rung->step; | |
+ if (!mmsem_batch) | |
+ up_read(&slot->vma->vm_mm->mmap_sem); | |
+ } | |
+ | |
+ busy_retry = BUSY_RETRY; | |
+ cond_resched(); | |
+ } | |
+ | |
+ if (mmsem_batch) { | |
+ up_read(&slot->vma->vm_mm->mmap_sem); | |
+ mmsem_batch = 0; | |
+ } | |
+ | |
+ if (freezing(current)) | |
+ break; | |
+ | |
+ cond_resched(); | |
+ } | |
+ end_time = task_sched_runtime(current); | |
+ delta_exec = end_time - start_time; | |
+ | |
+ if (freezing(current)) | |
+ return; | |
+ | |
+ cleanup_vma_slots(); | |
+ uksm_enter_all_slots(); | |
+ | |
+ round_finished = 1; | |
+ all_rungs_empty = 1; | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ struct scan_rung *rung = &uksm_scan_ladder[i]; | |
+ | |
+ if (rung->vma_root.num) { | |
+ all_rungs_empty = 0; | |
+ if (!rung_round_finished(rung)) | |
+ round_finished = 0; | |
+ } | |
+ } | |
+ | |
+ if (all_rungs_empty) | |
+ round_finished = 0; | |
+ | |
+ if (round_finished) { | |
+ round_update_ladder(); | |
+ uksm_eval_round++; | |
+ | |
+ if (hash_round_finished() && rshash_adjust()) { | |
+ /* Reset the unstable root iff hash strength changed */ | |
+ uksm_hash_round++; | |
+ root_unstable_tree = RB_ROOT; | |
+ free_all_tree_nodes(&unstable_tree_node_list); | |
+ } | |
+ | |
+ /* | |
+ * A number of pages can hang around indefinitely on per-cpu | |
+ * pagevecs, raised page count preventing write_protect_page | |
+ * from merging them. Though it doesn't really matter much, | |
+ * it is puzzling to see some stuck in pages_volatile until | |
+ * other activity jostles them out, and they also prevented | |
+ * LTP's KSM test from succeeding deterministically; so drain | |
+ * them here (here rather than on entry to uksm_do_scan(), | |
+ * so we don't IPI too often when pages_to_scan is set low). | |
+ */ | |
+ lru_add_drain_all(); | |
+ } | |
+ | |
+ | |
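+ /* | |
+ * Exponential moving average of the per-page scan cost (task runtime | |
+ * in ns per scanned virtual page); uksm_calc_scan_pages() below is | |
+ * expected to use it as the per-page cost when sizing page quotas. | |
+ */ | |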
+ if (vpages && delta_exec > 0) { | |
+ pcost = (unsigned long) delta_exec / vpages; | |
+ if (likely(uksm_ema_page_time)) | |
+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); | |
+ else | |
+ uksm_ema_page_time = pcost; | |
+ } | |
+ | |
+ uksm_calc_scan_pages(); | |
+ uksm_sleep_real = uksm_sleep_jiffies; | |
+ /* in case of radical cpu bursts, apply the upper bound */ | |
+ end_time = task_sched_runtime(current); | |
+ if (max_cpu_ratio && end_time > start_time) { | |
+ scan_time = end_time - start_time; | |
+ expected_jiffies = msecs_to_jiffies( | |
+ scan_time_to_sleep(scan_time, max_cpu_ratio)); | |
+ | |
+ if (expected_jiffies > uksm_sleep_real) | |
+ uksm_sleep_real = expected_jiffies; | |
+ | |
+ /* We have a 1 second upper bound for responsiveness. */ | |
+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) | |
+ uksm_sleep_real = msecs_to_jiffies(1000); | |
+ } | |
+ | |
+ return; | |
+} | |
+ | |
+static int ksmd_should_run(void) | |
+{ | |
+ return uksm_run & UKSM_RUN_MERGE; | |
+} | |
+ | |
+static int uksm_scan_thread(void *nothing) | |
+{ | |
+ set_freezable(); | |
+ set_user_nice(current, 5); | |
+ | |
+ while (!kthread_should_stop()) { | |
+ mutex_lock(&uksm_thread_mutex); | |
+ if (ksmd_should_run()) { | |
+ uksm_do_scan(); | |
+ } | |
+ mutex_unlock(&uksm_thread_mutex); | |
+ | |
+ try_to_freeze(); | |
+ | |
+ if (ksmd_should_run()) { | |
+ schedule_timeout_interruptible(uksm_sleep_real); | |
+ uksm_sleep_times++; | |
+ } else { | |
+ wait_event_freezable(uksm_thread_wait, | |
+ ksmd_should_run() || kthread_should_stop()); | |
+ } | |
+ } | |
+ return 0; | |
+} | |
+ | |
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) | |
+{ | |
+ struct stable_node *stable_node; | |
+ struct node_vma *node_vma; | |
+ struct rmap_item *rmap_item; | |
+ int ret = SWAP_AGAIN; | |
+ int search_new_forks = 0; | |
+ unsigned long address; | |
+ | |
+ VM_BUG_ON_PAGE(!PageKsm(page), page); | |
+ VM_BUG_ON_PAGE(!PageLocked(page), page); | |
+ | |
+ stable_node = page_stable_node(page); | |
+ if (!stable_node) | |
+ return ret; | |
+again: | |
+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { | |
+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { | |
+ struct anon_vma *anon_vma = rmap_item->anon_vma; | |
+ struct anon_vma_chain *vmac; | |
+ struct vm_area_struct *vma; | |
+ | |
+ anon_vma_lock_read(anon_vma); | |
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | |
+ 0, ULONG_MAX) { | |
+ vma = vmac->vma; | |
+ address = get_rmap_addr(rmap_item); | |
+ | |
+ if (address < vma->vm_start || | |
+ address >= vma->vm_end) | |
+ continue; | |
+ | |
+ if ((rmap_item->slot->vma == vma) == | |
+ search_new_forks) | |
+ continue; | |
+ | |
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | |
+ continue; | |
+ | |
+ ret = rwc->rmap_one(page, vma, address, rwc->arg); | |
+ if (ret != SWAP_AGAIN) { | |
+ anon_vma_unlock_read(anon_vma); | |
+ goto out; | |
+ } | |
+ | |
+ if (rwc->done && rwc->done(page)) { | |
+ anon_vma_unlock_read(anon_vma); | |
+ goto out; | |
+ } | |
+ } | |
+ anon_vma_unlock_read(anon_vma); | |
+ } | |
+ } | |
+ if (!search_new_forks++) | |
+ goto again; | |
+out: | |
+ return ret; | |
+} | |
+ | |
+#ifdef CONFIG_MIGRATION | |
+/* Common ksm interface but may be specific to uksm */ | |
+void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |
+{ | |
+ struct stable_node *stable_node; | |
+ | |
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | |
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | |
+ VM_BUG_ON(newpage->mapping != oldpage->mapping); | |
+ | |
+ stable_node = page_stable_node(newpage); | |
+ if (stable_node) { | |
+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | |
+ stable_node->kpfn = page_to_pfn(newpage); | |
+ } | |
+} | |
+#endif /* CONFIG_MIGRATION */ | |
+ | |
+#ifdef CONFIG_MEMORY_HOTREMOVE | |
+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, | |
+ unsigned long end_pfn) | |
+{ | |
+ struct rb_node *node; | |
+ | |
+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { | |
+ struct stable_node *stable_node; | |
+ | |
+ stable_node = rb_entry(node, struct stable_node, node); | |
+ if (stable_node->kpfn >= start_pfn && | |
+ stable_node->kpfn < end_pfn) | |
+ return stable_node; | |
+ } | |
+ return NULL; | |
+} | |
+ | |
+static int uksm_memory_callback(struct notifier_block *self, | |
+ unsigned long action, void *arg) | |
+{ | |
+ struct memory_notify *mn = arg; | |
+ struct stable_node *stable_node; | |
+ | |
+ switch (action) { | |
+ case MEM_GOING_OFFLINE: | |
+ /* | |
+ * Keep it very simple for now: just lock out ksmd and | |
+ * MADV_UNMERGEABLE while any memory is going offline. | |
+ * mutex_lock_nested() is necessary because lockdep was alarmed | |
+ * that here we take uksm_thread_mutex inside notifier chain | |
+ * mutex, and later take notifier chain mutex inside | |
+ * uksm_thread_mutex to unlock it. But that's safe because both | |
+ * are inside mem_hotplug_mutex. | |
+ */ | |
+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); | |
+ break; | |
+ | |
+ case MEM_OFFLINE: | |
+ /* | |
+ * Most of the work is done by page migration; but there might | |
+ * be a few stable_nodes left over, still pointing to struct | |
+ * pages which have been offlined: prune those from the tree. | |
+ */ | |
+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, | |
+ mn->start_pfn + mn->nr_pages)) != NULL) | |
+ remove_node_from_stable_tree(stable_node, 1, 1); | |
+ /* fallthrough */ | |
+ | |
+ case MEM_CANCEL_OFFLINE: | |
+ mutex_unlock(&uksm_thread_mutex); | |
+ break; | |
+ } | |
+ return NOTIFY_OK; | |
+} | |
+#endif /* CONFIG_MEMORY_HOTREMOVE */ | |
+ | |
+#ifdef CONFIG_SYSFS | |
+/* | |
+ * This all compiles without CONFIG_SYSFS, but is a waste of space. | |
+ */ | |
+ | |
+#define UKSM_ATTR_RO(_name) \ | |
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |
+#define UKSM_ATTR(_name) \ | |
+ static struct kobj_attribute _name##_attr = \ | |
+ __ATTR(_name, 0644, _name##_show, _name##_store) | |
+ | |
+static ssize_t max_cpu_percentage_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); | |
+} | |
+ | |
+static ssize_t max_cpu_percentage_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ unsigned long max_cpu_percentage; | |
+ int err; | |
+ | |
+ err = kstrtoul(buf, 10, &max_cpu_percentage); | |
+ if (err || max_cpu_percentage > 100) | |
+ return -EINVAL; | |
+ | |
+ if (max_cpu_percentage == 100) | |
+ max_cpu_percentage = 99; | |
+ else if (max_cpu_percentage < 10) | |
+ max_cpu_percentage = 10; | |
+ | |
+ uksm_max_cpu_percentage = max_cpu_percentage; | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(max_cpu_percentage); | |
+ | |
+static ssize_t sleep_millisecs_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); | |
+} | |
+ | |
+static ssize_t sleep_millisecs_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ unsigned long msecs; | |
+ int err; | |
+ | |
+ err = kstrtoul(buf, 10, &msecs); | |
+ if (err || msecs > MSEC_PER_SEC) | |
+ return -EINVAL; | |
+ | |
+ uksm_sleep_jiffies = msecs_to_jiffies(msecs); | |
+ uksm_sleep_saved = uksm_sleep_jiffies; | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(sleep_millisecs); | |
+ | |
+ | |
+static ssize_t cpu_governor_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); | |
+ int i; | |
+ | |
+ buf[0] = '\0'; | |
+ for (i = 0; i < n ; i++) { | |
+ if (uksm_cpu_governor == i) | |
+ strcat(buf, "["); | |
+ | |
+ strcat(buf, uksm_cpu_governor_str[i]); | |
+ | |
+ if (uksm_cpu_governor == i) | |
+ strcat(buf, "]"); | |
+ | |
+ strcat(buf, " "); | |
+ } | |
+ strcat(buf, "\n"); | |
+ | |
+ return strlen(buf); | |
+} | |
+ | |
+static inline void init_performance_values(void) | |
+{ | |
+ int i; | |
+ struct scan_rung *rung; | |
+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; | |
+ | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ rung = uksm_scan_ladder + i; | |
+ rung->cpu_ratio = preset->cpu_ratio[i]; | |
+ rung->cover_msecs = preset->cover_msecs[i]; | |
+ } | |
+ | |
+ uksm_max_cpu_percentage = preset->max_cpu; | |
+} | |
+ | |
+static ssize_t cpu_governor_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); | |
+ | |
+ for (n--; n >=0 ; n--) { | |
+ if (!strncmp(buf, uksm_cpu_governor_str[n], | |
+ strlen(uksm_cpu_governor_str[n]))) | |
+ break; | |
+ } | |
+ | |
+ if (n < 0) | |
+ return -EINVAL; | |
+ else | |
+ uksm_cpu_governor = n; | |
+ | |
+ init_performance_values(); | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(cpu_governor); | |
+ | |
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | |
+ char *buf) | |
+{ | |
+ return sprintf(buf, "%u\n", uksm_run); | |
+} | |
+ | |
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ int err; | |
+ unsigned long flags; | |
+ | |
+ err = kstrtoul(buf, 10, &flags); | |
+ if (err || flags > UINT_MAX) | |
+ return -EINVAL; | |
+ if (flags > UKSM_RUN_MERGE) | |
+ return -EINVAL; | |
+ | |
+ mutex_lock(&uksm_thread_mutex); | |
+ if (uksm_run != flags) { | |
+ uksm_run = flags; | |
+ } | |
+ mutex_unlock(&uksm_thread_mutex); | |
+ | |
+ if (flags & UKSM_RUN_MERGE) | |
+ wake_up_interruptible(&uksm_thread_wait); | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(run); | |
+ | |
+static ssize_t abundant_threshold_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%u\n", uksm_abundant_threshold); | |
+} | |
+ | |
+static ssize_t abundant_threshold_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ int err; | |
+ unsigned long flags; | |
+ | |
+ err = kstrtoul(buf, 10, &flags); | |
+ if (err || flags > 99) | |
+ return -EINVAL; | |
+ | |
+ uksm_abundant_threshold = flags; | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(abundant_threshold); | |
+ | |
+static ssize_t thrash_threshold_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%u\n", uksm_thrash_threshold); | |
+} | |
+ | |
+static ssize_t thrash_threshold_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ int err; | |
+ unsigned long flags; | |
+ | |
+ err = kstrtoul(buf, 10, &flags); | |
+ if (err || flags > 99) | |
+ return -EINVAL; | |
+ | |
+ uksm_thrash_threshold = flags; | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(thrash_threshold); | |
+ | |
+static ssize_t cpu_ratios_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ int i, size; | |
+ struct scan_rung *rung; | |
+ char *p = buf; | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ rung = &uksm_scan_ladder[i]; | |
+ | |
+ if (rung->cpu_ratio > 0) | |
+ size = sprintf(p, "%d ", rung->cpu_ratio); | |
+ else | |
+ size = sprintf(p, "MAX/%d ", | |
+ TIME_RATIO_SCALE / -rung->cpu_ratio); | |
+ | |
+ p += size; | |
+ } | |
+ | |
+ *p++ = '\n'; | |
+ *p = '\0'; | |
+ | |
+ return p - buf; | |
+} | |
+ | |
+static ssize_t cpu_ratios_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ int i, cpuratios[SCAN_LADDER_SIZE], err; | |
+ unsigned long value; | |
+ struct scan_rung *rung; | |
+ char *p, *mem, *end = NULL; | |
+ | |
+ /* NUL-terminated scratch copy we can modify; freed on all paths */ | |
+ mem = p = kzalloc(count + 1, GFP_KERNEL); | |
+ if (!p) | |
+ return -ENOMEM; | |
+ | |
+ memcpy(p, buf, count); | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ if (i != SCAN_LADDER_SIZE - 1) { | |
+ end = strchr(p, ' '); | |
+ if (!end) { | |
+ kfree(mem); | |
+ return -EINVAL; | |
+ } | |
+ | |
+ *end = '\0'; | |
+ } | |
+ | |
+ if (strstr(p, "MAX/")) { | |
+ p = strchr(p, '/') + 1; | |
+ err = kstrtoul(p, 10, &value); | |
+ if (err || value > TIME_RATIO_SCALE || !value) { | |
+ kfree(mem); | |
+ return -EINVAL; | |
+ } | |
+ | |
+ cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); | |
+ } else { | |
+ err = kstrtoul(p, 10, &value); | |
+ if (err || value > TIME_RATIO_SCALE || !value) { | |
+ kfree(mem); | |
+ return -EINVAL; | |
+ } | |
+ | |
+ cpuratios[i] = value; | |
+ } | |
+ | |
+ p = end + 1; | |
+ } | |
+ | |
+ kfree(mem); | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ rung = &uksm_scan_ladder[i]; | |
+ | |
+ rung->cpu_ratio = cpuratios[i]; | |
+ } | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(cpu_ratios); | |
+ | |
+static ssize_t eval_intervals_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ int i, size; | |
+ struct scan_rung *rung; | |
+ char *p = buf; | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ rung = &uksm_scan_ladder[i]; | |
+ size = sprintf(p, "%u ", rung->cover_msecs); | |
+ p += size; | |
+ } | |
+ | |
+ *p++ = '\n'; | |
+ *p = '\0'; | |
+ | |
+ return p - buf; | |
+} | |
+ | |
+static ssize_t eval_intervals_store(struct kobject *kobj, | |
+ struct kobj_attribute *attr, | |
+ const char *buf, size_t count) | |
+{ | |
+ int i, err; | |
+ unsigned long values[SCAN_LADDER_SIZE]; | |
+ struct scan_rung *rung; | |
+ char *p, *mem, *end = NULL; | |
+ | |
+ /* NUL-terminated scratch copy we can modify; freed on all paths */ | |
+ mem = p = kzalloc(count + 1, GFP_KERNEL); | |
+ if (!p) | |
+ return -ENOMEM; | |
+ | |
+ memcpy(p, buf, count); | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ if (i != SCAN_LADDER_SIZE - 1) { | |
+ end = strchr(p, ' '); | |
+ if (!end) { | |
+ kfree(mem); | |
+ return -EINVAL; | |
+ } | |
+ | |
+ *end = '\0'; | |
+ } | |
+ | |
+ err = kstrtoul(p, 10, &values[i]); | |
+ if (err) { | |
+ kfree(mem); | |
+ return -EINVAL; | |
+ } | |
+ | |
+ p = end + 1; | |
+ } | |
+ | |
+ kfree(mem); | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ rung = &uksm_scan_ladder[i]; | |
+ | |
+ rung->cover_msecs = values[i]; | |
+ } | |
+ | |
+ return count; | |
+} | |
+UKSM_ATTR(eval_intervals); | |
+ | |
+static ssize_t ema_per_page_time_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%lu\n", uksm_ema_page_time); | |
+} | |
+UKSM_ATTR_RO(ema_per_page_time); | |
+ | |
+static ssize_t pages_shared_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%lu\n", uksm_pages_shared); | |
+} | |
+UKSM_ATTR_RO(pages_shared); | |
+ | |
+static ssize_t pages_sharing_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%lu\n", uksm_pages_sharing); | |
+} | |
+UKSM_ATTR_RO(pages_sharing); | |
+ | |
+static ssize_t pages_unshared_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%lu\n", uksm_pages_unshared); | |
+} | |
+UKSM_ATTR_RO(pages_unshared); | |
+ | |
+static ssize_t full_scans_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%llu\n", fully_scanned_round); | |
+} | |
+UKSM_ATTR_RO(full_scans); | |
+ | |
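+/* | |
+ * pages_scanned is reported as "<mantissa> * 2^<base>" once the 64-bit | |
+ * counter can no longer be represented exactly in an unsigned long; | |
+ * e.g. "3 * 2^33" means roughly 3 * 2^33 pages scanned so far. | |
+ */ | |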
+static ssize_t pages_scanned_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ unsigned long base = 0; | |
+ u64 delta, ret; | |
+ | |
+ if (pages_scanned_stored) { | |
+ base = pages_scanned_base; | |
+ ret = pages_scanned_stored; | |
+ delta = uksm_pages_scanned >> base; | |
+ if (CAN_OVERFLOW_U64(ret, delta)) { | |
+ ret >>= 1; | |
+ delta >>= 1; | |
+ base++; | |
+ ret += delta; | |
+ } | |
+ } else { | |
+ ret = uksm_pages_scanned; | |
+ } | |
+ | |
+ while (ret > ULONG_MAX) { | |
+ ret >>= 1; | |
+ base++; | |
+ } | |
+ | |
+ if (base) | |
+ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); | |
+ else | |
+ return sprintf(buf, "%lu\n", (unsigned long)ret); | |
+} | |
+UKSM_ATTR_RO(pages_scanned); | |
+ | |
+static ssize_t hash_strength_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%lu\n", hash_strength); | |
+} | |
+UKSM_ATTR_RO(hash_strength); | |
+ | |
+static ssize_t sleep_times_show(struct kobject *kobj, | |
+ struct kobj_attribute *attr, char *buf) | |
+{ | |
+ return sprintf(buf, "%llu\n", uksm_sleep_times); | |
+} | |
+UKSM_ATTR_RO(sleep_times); | |
+ | |
+ | |
+static struct attribute *uksm_attrs[] = { | |
+ &max_cpu_percentage_attr.attr, | |
+ &sleep_millisecs_attr.attr, | |
+ &cpu_governor_attr.attr, | |
+ &run_attr.attr, | |
+ &ema_per_page_time_attr.attr, | |
+ &pages_shared_attr.attr, | |
+ &pages_sharing_attr.attr, | |
+ &pages_unshared_attr.attr, | |
+ &full_scans_attr.attr, | |
+ &pages_scanned_attr.attr, | |
+ &hash_strength_attr.attr, | |
+ &sleep_times_attr.attr, | |
+ &thrash_threshold_attr.attr, | |
+ &abundant_threshold_attr.attr, | |
+ &cpu_ratios_attr.attr, | |
+ &eval_intervals_attr.attr, | |
+ NULL, | |
+}; | |
+ | |
+static struct attribute_group uksm_attr_group = { | |
+ .attrs = uksm_attrs, | |
+ .name = "uksm", | |
+}; | |
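+ | |
+/* | |
+ * Example usage from user space (a sketch; the group is registered on | |
+ * mm_kobj with .name = "uksm", so the files should appear under | |
+ * /sys/kernel/mm/uksm/, and "echo 1" assumes UKSM_RUN_MERGE == 1): | |
+ * | |
+ *   echo 20 > /sys/kernel/mm/uksm/sleep_millisecs | |
+ *   echo 1 > /sys/kernel/mm/uksm/run | |
+ *   cat /sys/kernel/mm/uksm/pages_sharing | |
+ */ | |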
+#endif /* CONFIG_SYSFS */ | |
+ | |
+static inline void init_scan_ladder(void) | |
+{ | |
+ int i; | |
+ struct scan_rung *rung; | |
+ | |
+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { | |
+ rung = uksm_scan_ladder + i; | |
+ slot_tree_init_root(&rung->vma_root); | |
+ } | |
+ | |
+ init_performance_values(); | |
+ uksm_calc_scan_pages(); | |
+} | |
+ | |
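+/* | |
+ * Benchmark the relative cost of a full-page memcmp() versus hashing: | |
+ * memcmp_cost ends up as HASH_STRENGTH_FULL * t_memcmp / t_hash, i.e. | |
+ * one page comparison expressed in single-strength hash steps. | |
+ */ | |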
+static inline int cal_positive_negative_costs(void) | |
+{ | |
+ struct page *p1, *p2; | |
+ unsigned char *addr1, *addr2; | |
+ unsigned long i, time_start, hash_cost; | |
+ unsigned long loopnum = 0; | |
+ | |
+ /* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */ | |
+ volatile u32 hash; | |
+ volatile int ret; | |
+ | |
+ p1 = alloc_page(GFP_KERNEL); | |
+ if (!p1) | |
+ return -ENOMEM; | |
+ | |
+ p2 = alloc_page(GFP_KERNEL); | |
+ if (!p2) { | |
+ __free_page(p1); | |
+ return -ENOMEM; | |
+ } | |
+ | |
+ addr1 = kmap_atomic(p1); | |
+ addr2 = kmap_atomic(p2); | |
+ memset(addr1, prandom_u32(), PAGE_SIZE); | |
+ memcpy(addr2, addr1, PAGE_SIZE); | |
+ | |
+ /* make sure that the two pages differ in the last byte */ | |
+ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; | |
+ kunmap_atomic(addr2); | |
+ kunmap_atomic(addr1); | |
+ | |
+ time_start = jiffies; | |
+ while (jiffies - time_start < 100) { | |
+ for (i = 0; i < 100; i++) | |
+ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); | |
+ loopnum += 100; | |
+ } | |
+ hash_cost = (jiffies - time_start); | |
+ | |
+ time_start = jiffies; | |
+ for (i = 0; i < loopnum; i++) | |
+ ret = pages_identical(p1, p2); | |
+ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); | |
+ memcmp_cost /= hash_cost; | |
+ printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " | |
+ "hash=%u cmp_ret=%d.\n", | |
+ memcmp_cost, hash, ret); | |
+ | |
+ __free_page(p1); | |
+ __free_page(p2); | |
+ return 0; | |
+} | |
+ | |
+static int init_zeropage_hash_table(void) | |
+{ | |
+ struct page *page; | |
+ char *addr; | |
+ int i; | |
+ | |
+ page = alloc_page(GFP_KERNEL); | |
+ if (!page) | |
+ return -ENOMEM; | |
+ | |
+ addr = kmap_atomic(page); | |
+ memset(addr, 0, PAGE_SIZE); | |
+ kunmap_atomic(addr); | |
+ | |
+ zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), | |
+ GFP_KERNEL); | |
+ if (!zero_hash_table) | |
+ return -ENOMEM; | |
+ | |
+ for (i = 0; i < HASH_STRENGTH_MAX; i++) | |
+ zero_hash_table[i] = page_hash(page, i, 0); | |
+ | |
+ __free_page(page); | |
+ | |
+ return 0; | |
+} | |
+ | |
+static inline int init_random_sampling(void) | |
+{ | |
+ unsigned long i; | |
+ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); | |
+ if (!random_nums) | |
+ return -ENOMEM; | |
+ | |
+ for (i = 0; i < HASH_STRENGTH_FULL; i++) | |
+ random_nums[i] = i; | |
+ | |
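+ /* Fisher-Yates shuffle of the sampling order used by page_hash(). */ | |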
+ for (i = 0; i < HASH_STRENGTH_FULL; i++) { | |
+ unsigned long rand_range, swap_index, tmp; | |
+ | |
+ rand_range = HASH_STRENGTH_FULL - i; | |
+ swap_index = i + prandom_u32() % rand_range; | |
+ tmp = random_nums[i]; | |
+ random_nums[i] = random_nums[swap_index]; | |
+ random_nums[swap_index] = tmp; | |
+ } | |
+ | |
+ rshash_state.state = RSHASH_NEW; | |
+ rshash_state.below_count = 0; | |
+ rshash_state.lookup_window_index = 0; | |
+ | |
+ return cal_positive_negative_costs(); | |
+} | |
+ | |
+static int __init uksm_slab_init(void) | |
+{ | |
+ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); | |
+ if (!rmap_item_cache) | |
+ goto out; | |
+ | |
+ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); | |
+ if (!stable_node_cache) | |
+ goto out_free1; | |
+ | |
+ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); | |
+ if (!node_vma_cache) | |
+ goto out_free2; | |
+ | |
+ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); | |
+ if (!vma_slot_cache) | |
+ goto out_free3; | |
+ | |
+ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); | |
+ if (!tree_node_cache) | |
+ goto out_free4; | |
+ | |
+ return 0; | |
+ | |
+out_free4: | |
+ kmem_cache_destroy(vma_slot_cache); | |
+out_free3: | |
+ kmem_cache_destroy(node_vma_cache); | |
+out_free2: | |
+ kmem_cache_destroy(stable_node_cache); | |
+out_free1: | |
+ kmem_cache_destroy(rmap_item_cache); | |
+out: | |
+ return -ENOMEM; | |
+} | |
+ | |
+static void __init uksm_slab_free(void) | |
+{ | |
+ kmem_cache_destroy(stable_node_cache); | |
+ kmem_cache_destroy(rmap_item_cache); | |
+ kmem_cache_destroy(node_vma_cache); | |
+ kmem_cache_destroy(vma_slot_cache); | |
+ kmem_cache_destroy(tree_node_cache); | |
+} | |
+ | |
+/* Interface shared with ksm, but the behavior here differs. */ | |
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |
+ unsigned long end, int advice, unsigned long *vm_flags) | |
+{ | |
+ int err; | |
+ | |
+ switch (advice) { | |
+ case MADV_MERGEABLE: | |
+ return 0; /* just ignore the advice */ | |
+ | |
+ case MADV_UNMERGEABLE: | |
+ if (!(*vm_flags & VM_MERGEABLE)) | |
+ return 0; /* just ignore the advice */ | |
+ | |
+ if (vma->anon_vma) { | |
+ err = unmerge_uksm_pages(vma, start, end); | |
+ if (err) | |
+ return err; | |
+ } | |
+ | |
+ uksm_remove_vma(vma); | |
+ *vm_flags &= ~VM_MERGEABLE; | |
+ break; | |
+ } | |
+ | |
+ return 0; | |
+} | |
+ | |
+/* Common interface to ksm, actually the same. */ | |
+struct page *ksm_might_need_to_copy(struct page *page, | |
+ struct vm_area_struct *vma, unsigned long address) | |
+{ | |
+ struct anon_vma *anon_vma = page_anon_vma(page); | |
+ struct page *new_page; | |
+ | |
+ if (PageKsm(page)) { | |
+ if (page_stable_node(page)) | |
+ return page; /* no need to copy it */ | |
+ } else if (!anon_vma) { | |
+ return page; /* no need to copy it */ | |
+ } else if (anon_vma->root == vma->anon_vma->root && | |
+ page->index == linear_page_index(vma, address)) { | |
+ return page; /* still no need to copy it */ | |
+ } | |
+ if (!PageUptodate(page)) | |
+ return page; /* let do_swap_page report the error */ | |
+ | |
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | |
+ if (new_page) { | |
+ copy_user_highpage(new_page, page, address, vma); | |
+ | |
+ SetPageDirty(new_page); | |
+ __SetPageUptodate(new_page); | |
+ __SetPageLocked(new_page); | |
+ } | |
+ | |
+ return new_page; | |
+} | |
+ | |
+static int __init uksm_init(void) | |
+{ | |
+ struct task_struct *uksm_thread; | |
+ int err; | |
+ | |
+ uksm_sleep_jiffies = msecs_to_jiffies(100); | |
+ uksm_sleep_saved = uksm_sleep_jiffies; | |
+ | |
+ slot_tree_init(); | |
+ init_scan_ladder(); | |
+ | |
+ | |
+ err = init_random_sampling(); | |
+ if (err) | |
+ goto out_free2; | |
+ | |
+ err = uksm_slab_init(); | |
+ if (err) | |
+ goto out_free1; | |
+ | |
+ err = init_zeropage_hash_table(); | |
+ if (err) | |
+ goto out_free0; | |
+ | |
+ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); | |
+ if (IS_ERR(uksm_thread)) { | |
+ printk(KERN_ERR "uksm: creating kthread failed\n"); | |
+ err = PTR_ERR(uksm_thread); | |
+ goto out_free; | |
+ } | |
+ | |
+#ifdef CONFIG_SYSFS | |
+ err = sysfs_create_group(mm_kobj, &uksm_attr_group); | |
+ if (err) { | |
+ printk(KERN_ERR "uksm: register sysfs failed\n"); | |
+ kthread_stop(uksm_thread); | |
+ goto out_free; | |
+ } | |
+#else | |
+ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ | |
+ | |
+#endif /* CONFIG_SYSFS */ | |
+ | |
+#ifdef CONFIG_MEMORY_HOTREMOVE | |
+ /* | |
+ * Choose a high priority since the callback takes uksm_thread_mutex: | |
+ * later callbacks could only be taking locks which nest within that. | |
+ */ | |
+ hotplug_memory_notifier(uksm_memory_callback, 100); | |
+#endif | |
+ return 0; | |
+ | |
+out_free: | |
+ kfree(zero_hash_table); | |
+out_free0: | |
+ uksm_slab_free(); | |
+out_free1: | |
+ kfree(random_nums); | |
+out_free2: | |
+ kfree(uksm_scan_ladder); | |
+ return err; | |
+} | |
+ | |
+#ifdef MODULE | |
+subsys_initcall(uksm_init); | |
+#else | |
+late_initcall(uksm_init); | |
+#endif | |
+ | |
diff --git a/mm/vmstat.c b/mm/vmstat.c | |
index 084c672..c301bd6 100644 | |
--- a/mm/vmstat.c | |
+++ b/mm/vmstat.c | |
@@ -764,6 +764,9 @@ const char * const vmstat_text[] = { | |
"nr_anon_transparent_hugepages", | |
"nr_free_cma", | |
+#ifdef CONFIG_UKSM | |
+ "nr_uksm_zero_pages", | |
+#endif | |
/* enum writeback_stat_item counters */ | |
"nr_dirty_threshold", | |
"nr_dirty_background_threshold", | |
-- | |
2.8.1 |