xen0n · June 6, 2016 08:29
diff --git a/0001-add-uksm-patch-ported-for-v4.5.patch b/0001-add-uksm-patch-ported-for-v4.5.patch
 From 3e9afc3a63c3126ee55df341547319ee4a8ac498 Mon Sep 17 00:00:00 2001
 From: Wang Xuerui <idontknw.wang@gmail.com>
 Date: Mon, 6 Jun 2016 16:20:23 +0800
 Subject: [PATCH] add uksm patch ported for v4.5

 ---
 Documentation/vm/00-INDEX     |    2 +
 Documentation/vm/uksm.txt     |   59 +
 fs/exec.c                     |    4 +-
 fs/proc/meminfo.c             |    6 +
 include/asm-generic/pgtable.h |   17 +-
 include/linux/ksm.h           |   44 +-
 include/linux/mm_types.h      |    3 +
 include/linux/mmzone.h        |    5 +-
 include/linux/sradix-tree.h   |   77 +
 include/linux/uksm.h          |  146 ++
 kernel/fork.c                 |    4 +-
 lib/Makefile                  |    2 +-
 lib/sradix-tree.c             |  476 ++++
 mm/Kconfig                    |   26 +
 mm/Makefile                   |    3 +-
 mm/memory.c                   |   39 +-
 mm/mmap.c                     |   44 +-
 mm/rmap.c                     |    4 +-
 mm/uksm.c                     | 5525 +++++++++++++++++++++++++++++++++++++++++
 mm/vmstat.c                   |    3 +
 20 files changed, 6459 insertions(+), 30 deletions(-)
 create mode 100644 Documentation/vm/uksm.txt
 create mode 100644 include/linux/sradix-tree.h
 create mode 100644 include/linux/uksm.h
 create mode 100644 lib/sradix-tree.c
 create mode 100644 mm/uksm.c

 diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
 index 6a5e2a1..09eaa9a1 100644
 --- a/Documentation/vm/00-INDEX
 +++ b/Documentation/vm/00-INDEX
 @@ -18,6 +18,8 @@ idle_page_tracking.txt
 	- description of the idle page tracking feature.
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
 +uksm.txt
 +	- Introduction to Ultra KSM
 numa
 	- information about NUMA specific code in the Linux vm.
 numa_memory_policy.txt
 diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
 new file mode 100644
 index 0000000..387d96a
 --- /dev/null
 +++ b/Documentation/vm/uksm.txt
 @@ -0,0 +1,59 @@
 +The Ultra Kernel Samepage Merging feature
 +----------------------------------------------
 +/*
 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
 + *
 + * This is an improvement upon KSM. Some basic data structures and routines
 + * are borrowed from ksm.c .
 + *
 + * Its new features:
 + * 1. Full system scan:
 + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
 + *      interaction to submit a memory area to KSM is no longer needed.
 + *
 + * 2. Rich area detection:
 + *      It automatically detects rich areas containing abundant duplicated
 + *      pages based. Rich areas are given a full scan speed. Poor areas are
 + *      sampled at a reasonable speed with very low CPU consumption.
 + *
 + * 3. Ultra Per-page scan speed improvement:
 + *      A new hash algorithm is proposed. As a result, on a machine with
 + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
 + *      can scan memory areas that does not contain duplicated pages at speed of
 + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
 + *      477MB/sec ~ 923MB/sec.
 + *
 + * 4. Thrashing area avoidance:
 + *      Thrashing area(an VMA that has frequent Ksm page break-out) can be
 + *      filtered out. My benchmark shows it's more efficient than KSM's per-page
 + *      hash value based volatile page detection.
 + *
 + *
 + * 5. Misc changes upon KSM:
 + *      * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
 + *        comparison. It's much faster than default C version on x86.
 + *      * rmap_item now has an struct *page member to loosely cache a
 + *        address-->page mapping, which reduces too much time-costly
 + *        follow_page().
 + *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
 + *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
 + *        ksm is needed for this case.
 + *
 + * 6. Full Zero Page consideration(contributed by Figo Zhang)
 + *    Now uksmd consider full zero pages as special pages and merge them to an
 + *    special unswappable uksm zero page.
 + */
 +
 +ChangeLog:
 +
 +2012-05-05 The creation of this Doc
 +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
 +2012-05-28 UKSM 0.1.1.2 bug fix release
 +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
 +2012-07-2  UKSM 0.1.2-beta2
 +2012-07-10 UKSM 0.1.2-beta3
 +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
 +2012-10-13 UKSM 0.1.2.1 Bug fixes.
 +2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
 +2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
 +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings.
 diff --git a/fs/exec.c b/fs/exec.c
 index dcd4ac7..80032f8 100644
 --- a/fs/exec.c
 +++ b/fs/exec.c
 @@ -19,7 +19,7 @@
  * current->executable is only used by the procfs.  This allows a dispatch
  * table to check for several different types  of binary formats.  We keep
  * trying until we recognize the file or we run out of supported binary
 - * formats. 
 + * formats.
  */
 
 #include <linux/slab.h>
 @@ -56,6 +56,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
 +#include <linux/ksm.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 @@ -1162,6 +1163,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	current->self_exec_id++;
 +
 	flush_signal_handlers(current, 0);
 	do_close_on_exec(current->files);
 }
 diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
 index df4661a..70b9f95 100644
 --- a/fs/proc/meminfo.c
 +++ b/fs/proc/meminfo.c
 @@ -118,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"SUnreclaim:     %8lu kB\n"
 		"KernelStack:    %8lu kB\n"
 		"PageTables:     %8lu kB\n"
 +#ifdef CONFIG_UKSM
 +		"KsmZeroPages:   %8lu kB\n"
 +#endif
 #ifdef CONFIG_QUICKLIST
 		"Quicklists:     %8lu kB\n"
 #endif
 @@ -176,6 +179,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
 		K(global_page_state(NR_PAGETABLE)),
 +#ifdef CONFIG_UKSM
 +		K(global_page_state(NR_UKSM_ZERO_PAGES)),
 +#endif
 #ifdef CONFIG_QUICKLIST
 		K(quicklist_total_size()),
 #endif
 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
 index c370b26..f387ed1 100644
 --- a/include/asm-generic/pgtable.h
 +++ b/include/asm-generic/pgtable.h
 @@ -601,12 +601,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
 +#ifdef CONFIG_UKSM
 +static inline int is_uksm_zero_pfn(unsigned long pfn)
 +{
 +	extern unsigned long uksm_zero_pfn;
 +        return pfn == uksm_zero_pfn;
 +}
 +#else
 +static inline int is_uksm_zero_pfn(unsigned long pfn)
 +{
 +        return 0;
 +}
 +#endif
 +
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
 	extern unsigned long zero_pfn;
 	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
 -	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
 +	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
 }
 
 #define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
 @@ -615,7 +628,7 @@ static inline int is_zero_pfn(unsigned long pfn)
 static inline int is_zero_pfn(unsigned long pfn)
 {
 	extern unsigned long zero_pfn;
 -	return pfn == zero_pfn;
 +	return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
 }
 
 static inline unsigned long my_zero_pfn(unsigned long addr)
 diff --git a/include/linux/ksm.h b/include/linux/ksm.h
 index 7ae216a..06861d8 100644
 --- a/include/linux/ksm.h
 +++ b/include/linux/ksm.h
 @@ -19,21 +19,6 @@ struct mem_cgroup;
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 -int __ksm_enter(struct mm_struct *mm);
 -void __ksm_exit(struct mm_struct *mm);
 -
 -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 -{
 -	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
 -		return __ksm_enter(mm);
 -	return 0;
 -}
 -
 -static inline void ksm_exit(struct mm_struct *mm)
 -{
 -	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
 -		__ksm_exit(mm);
 -}
 
 static inline struct stable_node *page_stable_node(struct page *page)
 {
 @@ -64,6 +49,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
 int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 
 +#ifdef CONFIG_KSM_LEGACY
 +int __ksm_enter(struct mm_struct *mm);
 +void __ksm_exit(struct mm_struct *mm);
 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 +{
 +	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
 +		return __ksm_enter(mm);
 +	return 0;
 +}
 +
 +static inline void ksm_exit(struct mm_struct *mm)
 +{
 +	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
 +		__ksm_exit(mm);
 +}
 +
 +#elif defined(CONFIG_UKSM)
 +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 +{
 +	return 0;
 +}
 +
 +static inline void ksm_exit(struct mm_struct *mm)
 +{
 +}
 +#endif /* !CONFIG_UKSM */
 +
 #else  /* !CONFIG_KSM */
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 @@ -106,4 +118,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 #endif /* CONFIG_MMU */
 #endif /* !CONFIG_KSM */
 
 +#include <linux/uksm.h>
 +
 #endif /* __LINUX_KSM_H */
 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
 index 624b78b..f48da4a 100644
 --- a/include/linux/mm_types.h
 +++ b/include/linux/mm_types.h
 @@ -351,6 +351,9 @@ struct vm_area_struct {
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 +#ifdef CONFIG_UKSM
 +	struct vma_slot *uksm_vma_slot;
 +#endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 };
 
 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 index 7b6c2cf..0386513 100644
 --- a/include/linux/mmzone.h
 +++ b/include/linux/mmzone.h
 @@ -157,6 +157,9 @@ enum zone_stat_item {
 	WORKINGSET_NODERECLAIM,
 	NR_ANON_TRANSPARENT_HUGEPAGES,
 	NR_FREE_CMA_PAGES,
 +#ifdef CONFIG_UKSM
 +	NR_UKSM_ZERO_PAGES,
 +#endif
 	NR_VM_ZONE_STAT_ITEMS };
 
 /*
 @@ -808,7 +811,7 @@ static inline int is_highmem_idx(enum zone_type idx)
 }
 
 /**
 - * is_highmem - helper function to quickly check if a struct zone is a 
 + * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
  *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
  * @zone - pointer to struct zone variable
 diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
 new file mode 100644
 index 0000000..6780fdb
 --- /dev/null
 +++ b/include/linux/sradix-tree.h
 @@ -0,0 +1,77 @@
 +#ifndef _LINUX_SRADIX_TREE_H
 +#define _LINUX_SRADIX_TREE_H
 +
 +
 +#define INIT_SRADIX_TREE(root, mask)					\
 +do {									\
 +	(root)->height = 0;						\
 +	(root)->gfp_mask = (mask);					\
 +	(root)->rnode = NULL;						\
 +} while (0)
 +
 +#define ULONG_BITS	(sizeof(unsigned long) * 8)
 +#define SRADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
 +//#define SRADIX_TREE_MAP_SHIFT	6
 +//#define SRADIX_TREE_MAP_SIZE	(1UL << SRADIX_TREE_MAP_SHIFT)
 +//#define SRADIX_TREE_MAP_MASK	(SRADIX_TREE_MAP_SIZE-1)
 +
 +struct sradix_tree_node {
 +	unsigned int	height;		/* Height from the bottom */
 +	unsigned int	count;		
 +	unsigned int	fulls;		/* Number of full sublevel trees */ 
 +	struct sradix_tree_node *parent;
 +	void *stores[0];
 +};
 +
 +/* A simple radix tree implementation */
 +struct sradix_tree_root {
 +        unsigned int            height;
 +        struct sradix_tree_node *rnode;
 +
 +	/* Where found to have available empty stores in its sublevels */
 +        struct sradix_tree_node *enter_node;
 +	unsigned int shift;
 +	unsigned int stores_size;
 +	unsigned int mask;
 +	unsigned long min;	/* The first hole index */
 +	unsigned long num;
 +	//unsigned long *height_to_maxindex;
 +
 +	/* How the node is allocated and freed. */
 +	struct sradix_tree_node *(*alloc)(void); 
 +	void (*free)(struct sradix_tree_node *node);
 +
 +	/* When a new node is added and removed */
 +	void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
 +	void (*assign)(struct sradix_tree_node *node, unsigned index, void *item);
 +	void (*rm)(struct sradix_tree_node *node, unsigned offset);
 +};
 +
 +struct sradix_tree_path {
 +	struct sradix_tree_node *node;
 +	int offset;
 +};
 +
 +static inline 
 +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
 +{
 +	root->height = 0;
 +	root->rnode = NULL;
 +	root->shift = shift;
 +	root->stores_size = 1UL << shift;
 +	root->mask = root->stores_size - 1;
 +}
 +
 +
 +extern void *sradix_tree_next(struct sradix_tree_root *root,
 +		       struct sradix_tree_node *node, unsigned long index,
 +		       int (*iter)(void *, unsigned long));
 +
 +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
 +
 +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 
 +			struct sradix_tree_node *node, unsigned long index);
 +
 +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
 +
 +#endif /* _LINUX_SRADIX_TREE_H */
 diff --git a/include/linux/uksm.h b/include/linux/uksm.h
 new file mode 100644
 index 0000000..206f109
 --- /dev/null
 +++ b/include/linux/uksm.h
 @@ -0,0 +1,146 @@
 +#ifndef __LINUX_UKSM_H
 +#define __LINUX_UKSM_H
 +/*
 + * Memory merging support.
 + *
 + * This code enables dynamic sharing of identical pages found in different
 + * memory areas, even if they are not shared by fork().
 + */
 +
 +/* if !CONFIG_UKSM this file should not be compiled at all. */
 +#ifdef CONFIG_UKSM
 +
 +#include <linux/bitops.h>
 +#include <linux/mm.h>
 +#include <linux/pagemap.h>
 +#include <linux/rmap.h>
 +#include <linux/sched.h>
 +
 +extern unsigned long zero_pfn __read_mostly;
 +extern unsigned long uksm_zero_pfn __read_mostly;
 +extern struct page *empty_uksm_zero_page;
 +
 +/* must be done before linked to mm */
 +extern void uksm_vma_add_new(struct vm_area_struct *vma);
 +extern void uksm_remove_vma(struct vm_area_struct *vma);
 +
 +#define UKSM_SLOT_NEED_SORT	(1 << 0)
 +#define UKSM_SLOT_NEED_RERAND 	(1 << 1)
 +#define UKSM_SLOT_SCANNED     	(1 << 2) /* It's scanned in this round */
 +#define UKSM_SLOT_FUL_SCANNED 	(1 << 3)
 +#define UKSM_SLOT_IN_UKSM 	(1 << 4)
 +
 +struct vma_slot {
 +	struct sradix_tree_node *snode;
 +	unsigned long sindex;
 +
 +	struct list_head slot_list;
 +	unsigned long fully_scanned_round;
 +	unsigned long dedup_num;
 +	unsigned long pages_scanned;
 +	unsigned long last_scanned;
 +	unsigned long pages_to_scan;
 +	struct scan_rung *rung;
 +	struct page **rmap_list_pool;
 +	unsigned int *pool_counts;
 +	unsigned long pool_size;
 +	struct vm_area_struct *vma;
 +	struct mm_struct *mm;
 +	unsigned long ctime_j;
 +	unsigned long pages;
 +	unsigned long flags;
 +	unsigned long pages_cowed; /* pages cowed this round */
 +	unsigned long pages_merged; /* pages merged this round */
 +	unsigned long pages_bemerged;
 +
 +	/* when it has page merged in this eval round */
 +	struct list_head dedup_list;
 +};
 +
 +static inline void uksm_unmap_zero_page(pte_t pte)
 +{
 +	if (pte_pfn(pte) == uksm_zero_pfn)
 +		__dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
 +}
 +
 +static inline void uksm_map_zero_page(pte_t pte)
 +{
 +	if (pte_pfn(pte) == uksm_zero_pfn)
 +		__inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
 +}
 +
 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
 +{
 +	if (vma->uksm_vma_slot && PageKsm(page))
 +		vma->uksm_vma_slot->pages_cowed++;
 +}
 +
 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
 +{
 +	if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
 +		vma->uksm_vma_slot->pages_cowed++;
 +}
 +
 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
 +{
 +#ifndef VM_SAO
 +#define VM_SAO 0
 +#endif
 +	return !(vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
 +			     VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
 +			     | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO));
 +}
 +
 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
 +{
 +	if (uksm_flags_can_scan(*vm_flags_p))
 +		*vm_flags_p |= VM_MERGEABLE;
 +}
 +
 +/*
 + * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will
 + * be removed when uksm zero page patch is stable enough.
 + */
 +static inline void uksm_bugon_zeropage(pte_t pte)
 +{
 +	BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
 +}
 +#else
 +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
 +{
 +}
 +
 +static inline void uksm_remove_vma(struct vm_area_struct *vma)
 +{
 +}
 +
 +static inline void uksm_unmap_zero_page(pte_t pte)
 +{
 +}
 +
 +static inline void uksm_map_zero_page(pte_t pte)
 +{
 +}
 +
 +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
 +{
 +}
 +
 +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
 +{
 +}
 +
 +static inline int uksm_flags_can_scan(unsigned long vm_flags)
 +{
 +	return 0;
 +}
 +
 +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
 +{
 +}
 +
 +static inline void uksm_bugon_zeropage(pte_t pte)
 +{
 +}
 +#endif /* !CONFIG_UKSM */
 +#endif /* __LINUX_UKSM_H */
 diff --git a/kernel/fork.c b/kernel/fork.c
 index 2e391c7..f917401 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
 @@ -444,7 +444,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
 -		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 +		tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
 @@ -497,7 +497,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		__vma_link_rb(mm, tmp, rb_link, rb_parent);
 		rb_link = &tmp->vm_rb.rb_right;
 		rb_parent = &tmp->vm_rb;
 -
 +		uksm_vma_add_new(tmp);
 		mm->map_count++;
 		retval = copy_page_range(mm, oldmm, mpnt);
 
 diff --git a/lib/Makefile b/lib/Makefile
 index a7c26a4..1bf34ea 100644
 --- a/lib/Makefile
 +++ b/lib/Makefile
 @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
 endif
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 -	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 +	 rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
 	 proportions.o flex_proportions.o ratelimit.o show_mem.o \
 diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
 new file mode 100644
 index 0000000..8d06329
 --- /dev/null
 +++ b/lib/sradix-tree.c
 @@ -0,0 +1,476 @@
 +#include <linux/errno.h>
 +#include <linux/mm.h>
 +#include <linux/mman.h>
 +#include <linux/spinlock.h>
 +#include <linux/slab.h>
 +#include <linux/gcd.h>
 +#include <linux/sradix-tree.h>
 +
 +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
 +{
 +	return node->fulls == root->stores_size || 
 +		(node->height == 1 && node->count == root->stores_size);
 +}
 +
 +/*
 + *	Extend a sradix tree so it can store key @index.
 + */
 +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
 +{
 +	struct sradix_tree_node *node;
 +	unsigned int height;
 +
 +	if (unlikely(root->rnode == NULL)) {
 +		if (!(node = root->alloc()))
 +			return -ENOMEM;
 +
 +		node->height = 1;
 +		root->rnode = node;
 +		root->height = 1;
 +	}
 +
 +	/* Figure out what the height should be.  */
 +	height = root->height;
 +	index >>= root->shift * height;
 +
 +	while (index) {
 +		index >>= root->shift;
 +		height++;
 +	}
 +
 +	while (height > root->height) {
 +		unsigned int newheight;
 +		if (!(node = root->alloc()))
 +			return -ENOMEM;
 +
 +		/* Increase the height.  */
 +		node->stores[0] = root->rnode;
 +		root->rnode->parent = node;
 +		if (root->extend)
 +			root->extend(node, root->rnode);
 +
 +		newheight = root->height + 1;
 +		node->height = newheight;
 +		node->count = 1;
 +		if (sradix_node_full(root, root->rnode))
 +			node->fulls = 1;
 +
 +		root->rnode = node;
 +		root->height = newheight;
 +	}
 +
 +	return 0;
 +}
 +
 +/*
 + * Search the next item from the current node, that is not NULL
 + * and can satify root->iter().
 + */
 +void *sradix_tree_next(struct sradix_tree_root *root,
 +		       struct sradix_tree_node *node, unsigned long index,
 +		       int (*iter)(void *item, unsigned long height))
 +{
 +	unsigned long offset;
 +	void *item;
 +
 +	if (unlikely(node == NULL)) {
 +		node = root->rnode;
 +		for (offset = 0; offset < root->stores_size; offset++) {
 +			item = node->stores[offset];
 +			if (item && (!iter || iter(item, node->height)))
 +				break;
 +		}
 +
 +		if (unlikely(offset >= root->stores_size))
 +			return NULL;
 +
 +		if (node->height == 1)
 +			return item;
 +		else
 +			goto go_down;
 +	}
 +
 +	while (node) {
 +		offset = (index & root->mask) + 1;					
 +		for (;offset < root->stores_size; offset++) {
 +			item = node->stores[offset];
 +			if (item && (!iter || iter(item, node->height)))
 +				break;
 +		}
 +
 +		if (offset < root->stores_size)
 +			break;
 +
 +		node = node->parent;
 +		index >>= root->shift;
 +	}
 +
 +	if (!node)
 +		return NULL;
 +
 +	while (node->height > 1) {
 +go_down:
 +		node = item;
 +		for (offset = 0; offset < root->stores_size; offset++) {
 +			item = node->stores[offset];
 +			if (item && (!iter || iter(item, node->height)))
 +				break;
 +		}
 +
 +		if (unlikely(offset >= root->stores_size))
 +			return NULL;
 +	}
 +
 +	BUG_ON(offset > root->stores_size);
 +
 +	return item;
 +}
 +
 +/*
 + * Blindly insert the item to the tree. Typically, we reuse the
 + * first empty store item.
 + */
 +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
 +{
 +	unsigned long index;
 +	unsigned int height;
 +	struct sradix_tree_node *node, *tmp = NULL;
 +	int offset, offset_saved;
 +	void **store = NULL;
 +	int error, i, j, shift;
 +
 +go_on:
 +	index = root->min;
 +
 +	if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
 +		node = root->enter_node;
 +		BUG_ON((index >> (root->shift * root->height)));
 +	} else {
 +		node = root->rnode;
 +		if (node == NULL || (index >> (root->shift * root->height))
 +		    || sradix_node_full(root, node)) {
 +			error = sradix_tree_extend(root, index);
 +			if (error)
 +				return error;
 +
 +			node = root->rnode;
 +		}
 +	}
 +
 +
 +	height = node->height;
 +	shift = (height - 1) * root->shift;
 +	offset = (index >> shift) & root->mask;
 +	while (shift > 0) {
 +		offset_saved = offset;
 +		for (; offset < root->stores_size; offset++) {
 +			store = &node->stores[offset];
 +			tmp = *store;
 +
 +			if (!tmp || !sradix_node_full(root, tmp))
 +				break;
 +		}
 +		BUG_ON(offset >= root->stores_size);
 +
 +		if (offset != offset_saved) {
 +			index += (offset - offset_saved) << shift;
 +			index &= ~((1UL << shift) - 1);
 +		}
 +
 +		if (!tmp) {
 +			if (!(tmp = root->alloc()))
 +				return -ENOMEM;
 +
 +			tmp->height = shift / root->shift;
 +			*store = tmp;
 +			tmp->parent = node;
 +			node->count++;
 +//			if (root->extend)
 +//				root->extend(node, tmp);
 +		}
 +
 +		node = tmp;
 +		shift -= root->shift;
 +		offset = (index >> shift) & root->mask;
 +	}
 +
 +	BUG_ON(node->height != 1);
 +
 +
 +	store = &node->stores[offset];
 +	for (i = 0, j = 0;
 +	      j < root->stores_size - node->count && 
 +	      i < root->stores_size - offset && j < num; i++) {
 +		if (!store[i]) {
 +			store[i] = item[j];
 +			if (root->assign)
 +				root->assign(node, index + i, item[j]);
 +			j++;
 +		}
 +	}
 +
 +	node->count += j;
 +	root->num += j;
 +	num -= j;
 +
 +	while (sradix_node_full(root, node)) {
 +		node = node->parent;
 +		if (!node)
 +			break;
 +
 +		node->fulls++;
 +	}
 +
 +	if (unlikely(!node)) {
 +		/* All nodes are full */
 +		root->min = 1 << (root->height * root->shift);
 +		root->enter_node = NULL;
 +	} else {
 +		root->min = index + i - 1;
 +		root->min |= (1UL << (node->height - 1)) - 1;
 +		root->min++;
 +		root->enter_node = node;
 +	}
 +
 +	if (num) {
 +		item += j;
 +		goto go_on;
 +	}
 +
 +	return 0;
 +}
 +
 +
 +/**
 + *	sradix_tree_shrink    -    shrink height of a sradix tree to minimal
 + *      @root		sradix tree root
 + *  
 + */
 +static inline void sradix_tree_shrink(struct sradix_tree_root *root)
 +{
 +	/* try to shrink tree height */
 +	while (root->height > 1) {
 +		struct sradix_tree_node *to_free = root->rnode;
 +
 +		/*
 +		 * The candidate node has more than one child, or its child
 +		 * is not at the leftmost store, we cannot shrink.
 +		 */
 +		if (to_free->count != 1 || !to_free->stores[0])
 +			break;
 +
 +		root->rnode = to_free->stores[0];
 +		root->rnode->parent = NULL;
 +		root->height--;
 +		if (unlikely(root->enter_node == to_free)) {
 +			root->enter_node = NULL;
 +		}
 +		root->free(to_free);
 +	}
 +}
 +
 +/*
 + * Del the item on the known leaf node and index
 + */
 +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 
 +				  struct sradix_tree_node *node, unsigned long index)
 +{
 +	unsigned int offset;
 +	struct sradix_tree_node *start, *end;
 +
 +	BUG_ON(node->height != 1);
 +
 +	start = node;
 +	while (node && !(--node->count))
 +		node = node->parent;
 +
 +	end = node;
 +	if (!node) {
 +		root->rnode = NULL;
 +		root->height = 0;
 +		root->min = 0;
 +		root->num = 0;
 +		root->enter_node = NULL;
 +	} else {
 +		offset = (index >> (root->shift * (node->height - 1))) & root->mask;
 +		if (root->rm)
 +			root->rm(node, offset);
 +		node->stores[offset] = NULL;
 +		root->num--;
 +		if (root->min > index) {
 +			root->min = index;
 +			root->enter_node = node;
 +		}
 +	}
 +
 +	if (start != end) {
 +		do {
 +			node = start;
 +			start = start->parent;
 +			if (unlikely(root->enter_node == node))
 +				root->enter_node = end;
 +			root->free(node);
 +		} while (start != end);
 +
 +		/*
 +		 * Note that shrink may free "end", so enter_node still need to
 +		 * be checked inside.
 +		 */
 +		sradix_tree_shrink(root);
 +	} else if (node->count == root->stores_size - 1) {
 +		/* It WAS a full leaf node. Update the ancestors */
 +		node = node->parent;
 +		while (node) {
 +			node->fulls--;
 +			if (node->fulls != root->stores_size - 1)
 +				break;
 +
 +			node = node->parent;
 +		}
 +	}
 +}
 +
 +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
 +{
 +	unsigned int height, offset;
 +	struct sradix_tree_node *node;
 +	int shift;
 +
 +	node = root->rnode;
 +	if (node == NULL || (index >> (root->shift * root->height)))
 +		return NULL;
 +
 +	height = root->height;
 +	shift = (height - 1) * root->shift;
 +
 +	do {
 +		offset = (index >> shift) & root->mask;
 +		node = node->stores[offset];
 +		if (!node)
 +			return NULL;
 +
 +		shift -= root->shift;
 +	} while (shift >= 0);
 +
 +	return node;
 +}
 +
 +/*
 + * Return the item if it exists, otherwise create it in place
 + * and return the created item.
 + */
 +void *sradix_tree_lookup_create(struct sradix_tree_root *root, 
 +			unsigned long index, void *(*item_alloc)(void))
 +{
 +	unsigned int height, offset;
 +	struct sradix_tree_node *node, *tmp;
 +	void *item;
 +	int shift, error;
 +
 +	if (root->rnode == NULL || (index >> (root->shift * root->height))) {
 +		if (item_alloc) {
 +			error = sradix_tree_extend(root, index);
 +			if (error)
 +				return NULL;
 +		} else {
 +			return NULL;
 +		}
 +	}
 +
 +	node = root->rnode;
 +	height = root->height;
 +	shift = (height - 1) * root->shift;
 +
 +	do {
 +		offset = (index >> shift) & root->mask;
 +		if (!node->stores[offset]) {
 +			if (!(tmp = root->alloc()))
 +				return NULL;
 +
 +			tmp->height = shift / root->shift;
 +			node->stores[offset] = tmp;
 +			tmp->parent = node;
 +			node->count++;
 +			node = tmp;
 +		} else {
 +			node = node->stores[offset];
 +		}
 +
 +		shift -= root->shift;
 +	} while (shift > 0);
 +
 +	BUG_ON(node->height != 1);
 +	offset = index & root->mask;
 +	if (node->stores[offset]) {
 +		return node->stores[offset];
 +	} else if (item_alloc) {
 +		if (!(item = item_alloc()))
 +			return NULL;
 +
 +		node->stores[offset] = item;
 +
 +		/*
 +		 * NOTE: we do NOT call root->assign here, since this item is
 +		 * newly created by us having no meaning. Caller can call this
 +		 * if it's necessary to do so.
 +		 */
 +
 +		node->count++;
 +		root->num++;
 +
 +		while (sradix_node_full(root, node)) {
 +			node = node->parent;
 +			if (!node)
 +				break;
 +
 +			node->fulls++;
 +		}
 +
 +		if (unlikely(!node)) {
 +			/* All nodes are full */
 +			root->min = 1 << (root->height * root->shift);
 +		} else {
 +			if (root->min == index) {
 +				root->min |= (1UL << (node->height - 1)) - 1;
 +				root->min++;
 +				root->enter_node = node;
 +			}
 +		}
 +
 +		return item;
 +	} else {
 +		return NULL;
 +	}
 +
 +}
 +
 +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
 +{
 +	unsigned int height, offset;
 +	struct sradix_tree_node *node;
 +	int shift;
 +
 +	node = root->rnode;
 +	if (node == NULL || (index >> (root->shift * root->height)))
 +		return -ENOENT;
 +
 +	height = root->height;
 +	shift = (height - 1) * root->shift;
 +
 +	do {
 +		offset = (index >> shift) & root->mask;
 +		node = node->stores[offset];
 +		if (!node)
 +			return -ENOENT;
 +
 +		shift -= root->shift;
 +	} while (shift > 0);
 +
 +	offset = index & root->mask;
 +	if (!node->stores[offset])
 +		return -ENOENT;
 +
 +	sradix_tree_delete_from_leaf(root, node, index);
 +
 +	return 0;
 +}
 diff --git a/mm/Kconfig b/mm/Kconfig
 index 03cbfa0..1b1b681 100644
 --- a/mm/Kconfig
 +++ b/mm/Kconfig
 @@ -322,6 +322,32 @@ config KSM
 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 +choice
 +	prompt "Choose UKSM/KSM strategy"
 +	default UKSM
 +	depends on KSM
 +	help
 +	  This option allows to select a UKSM/KSM stragety.
 +
 +config UKSM
 +	bool "Ultra-KSM for page merging"
 +	depends on KSM
 +	help
 +	UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same
 +	page Merging), but with a fundamentally rewritten core algorithm. With
 +	an advanced algorithm, UKSM now can transparently scans all anonymously
 +	mapped user space applications with an significantly improved scan speed
 +	and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
 +	UKSM. Now UKSM has its first stable release and first real world enterprise user.
 +	For more information, please goto its project page.
 +	(www.kerneldedup.org)
 +
 +config KSM_LEGACY
 +	bool "Legacy KSM implementation"
 +	depends on KSM
 +	help
 +	The legacy KSM implementation from Redhat.
 +endchoice
 
 config DEFAULT_MMAP_MIN_ADDR
         int "Low address space to protect from user allocation"
 diff --git a/mm/Makefile b/mm/Makefile
 index 2ed4319..90fd58a 100644
 --- a/mm/Makefile
 +++ b/mm/Makefile
 @@ -47,7 +47,8 @@ obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 -obj-$(CONFIG_KSM) += ksm.o
 +obj-$(CONFIG_KSM_LEGACY) += ksm.o
 +obj-$(CONFIG_UKSM) += uksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 diff --git a/mm/memory.c b/mm/memory.c
 index 8132787..b6175a5 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
 @@ -122,6 +122,28 @@ unsigned long highest_memmap_pfn __read_mostly;
 
 EXPORT_SYMBOL(zero_pfn);
 
 +#ifdef CONFIG_UKSM
 +unsigned long uksm_zero_pfn __read_mostly;
 +EXPORT_SYMBOL_GPL(uksm_zero_pfn);
 +struct page *empty_uksm_zero_page;
 +
 +static int __init setup_uksm_zero_page(void)
 +{
 +	unsigned long addr;
 +	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
 +	if (!addr)
 +		panic("Oh boy, that early out of memory?");
 +
 +	empty_uksm_zero_page = virt_to_page((void *) addr);
 +	SetPageReserved(empty_uksm_zero_page);
 +
 +	uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
 +
 +	return 0;
 +}
 +core_initcall(setup_uksm_zero_page);
 +#endif
 +
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  */
 @@ -133,6 +155,7 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
 +
 #if defined(SPLIT_RSS_COUNTING)
 
 void sync_mm_rss(struct mm_struct *mm)
 @@ -867,6 +890,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		get_page(page);
 		page_dup_rmap(page, false);
 		rss[mm_counter(page)]++;
 +
 +		/* Should return NULL in vm_normal_page() */
 +		uksm_bugon_zeropage(pte);
 +	} else {
 +		uksm_map_zero_page(pte);
 	}
 
 out_set_pte:
 @@ -1100,8 +1128,10 @@ again:
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 -			if (unlikely(!page))
 +			if (unlikely(!page)) {
 +				uksm_unmap_zero_page(ptent);
 				continue;
 +			}
 
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
 @@ -1937,8 +1967,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 			clear_page(kaddr);
 		kunmap_atomic(kaddr);
 		flush_dcache_page(dst);
 -	} else
 +	} else {
 		copy_user_highpage(dst, src, va, vma);
 +		uksm_cow_page(vma, src);
 +	}
 }
 
 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
 @@ -2083,6 +2115,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = alloc_zeroed_user_highpage_movable(vma, address);
 		if (!new_page)
 			goto oom;
 +		uksm_cow_pte(vma, orig_pte);
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!new_page)
 @@ -2108,7 +2141,9 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 						mm_counter_file(old_page));
 				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
 +			uksm_bugon_zeropage(orig_pte);
 		} else {
 +			uksm_unmap_zero_page(orig_pte);
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		}
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 diff --git a/mm/mmap.c b/mm/mmap.c
 index 76d1ec2..60b6347 100644
 --- a/mm/mmap.c
 +++ b/mm/mmap.c
 @@ -41,6 +41,7 @@
 #include <linux/notifier.h>
 #include <linux/memory.h>
 #include <linux/printk.h>
 +#include <linux/ksm.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/moduleparam.h>
 
 @@ -292,6 +293,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
 +	uksm_remove_vma(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
 }
 @@ -756,9 +758,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	long adjust_next = 0;
 	int remove_next = 0;
 
 +/*
 + * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is
 + * acquired
 + */
 +	uksm_remove_vma(vma);
 +
 	if (next && !insert) {
 		struct vm_area_struct *exporter = NULL;
 
 +		uksm_remove_vma(next);
 		if (end >= next->vm_end) {
 			/*
 			 * vma expands, overlapping all the next, and
 @@ -852,6 +861,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		end_changed = true;
 	}
 	vma->vm_pgoff = pgoff;
 +
 	if (adjust_next) {
 		next->vm_start += adjust_next << PAGE_SHIFT;
 		next->vm_pgoff += adjust_next;
 @@ -922,16 +932,22 @@ again:			remove_next = 1 + (end > next->vm_end);
 		 * up the code too much to do both in one go.
 		 */
 		next = vma->vm_next;
 -		if (remove_next == 2)
 +		if (remove_next == 2) {
 +			uksm_remove_vma(next);
 			goto again;
 -		else if (next)
 +		} else if (next) {
 			vma_gap_update(next);
 -		else
 +		} else {
 			mm->highest_vm_end = end;
 +		}
 +	} else {
 +		if (next && !insert)
 +			uksm_vma_add_new(next);
 	}
 	if (insert && file)
 		uprobe_mmap(insert);
 
 +	uksm_vma_add_new(vma);
 	validate_mm(mm);
 
 	return 0;
 @@ -1316,6 +1332,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
 +	/* If uksm is enabled, we add VM_MERGABLE to new VMAs. */
 +	uksm_vm_flags_mod(&vm_flags);
 +
 	if (flags & MAP_LOCKED)
 		if (!can_do_mlock())
 			return -EPERM;
 @@ -1654,6 +1673,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 			allow_write_access(file);
 	}
 	file = vma->vm_file;
 +	uksm_vma_add_new(vma);
 out:
 	perf_event_mmap(vma);
 
 @@ -1695,6 +1715,7 @@ allow_write_and_free_vma:
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
 +	uksm_remove_vma(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
 	if (charged)
 @@ -2490,6 +2511,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	else
 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
 +	uksm_vma_add_new(new);
 +
 	/* Success. */
 	if (!err)
 		return 0;
 @@ -2750,6 +2773,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 		return addr;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 +	uksm_vm_flags_mod(&flags);
 
 	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
 	if (offset_in_page(error))
 @@ -2807,6 +2831,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	vma->vm_flags = flags;
 	vma->vm_page_prot = vm_get_page_prot(flags);
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 +	uksm_vma_add_new(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
 @@ -2843,6 +2868,12 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
 +	/*
 +	 * Taking write lock on mmap_sem does not harm others,
 +	 * but it's crucial for uksm to avoid races.
 +	 */
 +	down_write(&mm->mmap_sem);
 +
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
 @@ -2878,6 +2909,11 @@ void exit_mmap(struct mm_struct *mm)
 		vma = remove_vma(vma);
 	}
 	vm_unacct_memory(nr_accounted);
 +
 +	mm->mmap = NULL;
 +	mm->mm_rb = RB_ROOT;
 +	vmacache_invalidate(mm);
 +	up_write(&mm->mmap_sem);
 }
 
 /* Insert vm structure into process list sorted by address
 @@ -2987,6 +3023,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			new_vma->vm_ops->open(new_vma);
 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
 		*need_rmap_locks = false;
 +		uksm_vma_add_new(new_vma);
 	}
 	return new_vma;
 
 @@ -3116,6 +3153,7 @@ static struct vm_area_struct *__install_special_mapping(
 	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
 
 	perf_event_mmap(vma);
 +	uksm_vma_add_new(vma);
 
 	return vma;
 
 diff --git a/mm/rmap.c b/mm/rmap.c
 index 79f3bf0..ea35ca3 100644
 --- a/mm/rmap.c
 +++ b/mm/rmap.c
 @@ -1125,9 +1125,9 @@ void page_move_anon_rmap(struct page *page,
 
 /**
  * __page_set_anon_rmap - set up new anonymous rmap
 - * @page:	Page to add to rmap	
 + * @page:	Page to add to rmap
  * @vma:	VM area to add page to.
 - * @address:	User virtual address of the mapping	
 + * @address:	User virtual address of the mapping
  * @exclusive:	the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,
 diff --git a/mm/uksm.c b/mm/uksm.c
 new file mode 100644
 index 0000000..bfa97c9
 --- /dev/null
 +++ b/mm/uksm.c
 @@ -0,0 +1,5525 @@
 +/*
 + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
 + *
 + * This is an improvement upon KSM. Some basic data structures and routines
 + * are borrowed from ksm.c .
 + *
 + * Its new features:
 + * 1. Full system scan:
 + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
 + *      interaction to submit a memory area to KSM is no longer needed.
 + *
 + * 2. Rich area detection:
 + *      It automatically detects rich areas containing abundant duplicated
 + *      pages based. Rich areas are given a full scan speed. Poor areas are
 + *      sampled at a reasonable speed with very low CPU consumption.
 + *
 + * 3. Ultra Per-page scan speed improvement:
 + *      A new hash algorithm is proposed. As a result, on a machine with
 + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
 + *      can scan memory areas that does not contain duplicated pages at speed of
 + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
 + *      477MB/sec ~ 923MB/sec.
 + *
 + * 4. Thrashing area avoidance:
 + *      Thrashing area(an VMA that has frequent Ksm page break-out) can be
 + *      filtered out. My benchmark shows it's more efficient than KSM's per-page
 + *      hash value based volatile page detection.
 + *
 + *
 + * 5. Misc changes upon KSM:
 + *      * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
 + *        comparison. It's much faster than default C version on x86.
 + *      * rmap_item now has an struct *page member to loosely cache a
 + *        address-->page mapping, which reduces too much time-costly
 + *        follow_page().
 + *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
 + *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
 + *        ksm is needed for this case.
 + *
 + * 6. Full Zero Page consideration(contributed by Figo Zhang)
 + *    Now uksmd consider full zero pages as special pages and merge them to an
 + *    special unswappable uksm zero page.
 + */
 +
 +#include <linux/errno.h>
 +#include <linux/mm.h>
 +#include <linux/fs.h>
 +#include <linux/mman.h>
 +#include <linux/sched.h>
 +#include <linux/rwsem.h>
 +#include <linux/pagemap.h>
 +#include <linux/rmap.h>
 +#include <linux/spinlock.h>
 +#include <linux/jhash.h>
 +#include <linux/delay.h>
 +#include <linux/kthread.h>
 +#include <linux/wait.h>
 +#include <linux/slab.h>
 +#include <linux/rbtree.h>
 +#include <linux/memory.h>
 +#include <linux/mmu_notifier.h>
 +#include <linux/swap.h>
 +#include <linux/ksm.h>
 +#include <linux/crypto.h>
 +#include <linux/scatterlist.h>
 +#include <crypto/hash.h>
 +#include <linux/random.h>
 +#include <linux/math64.h>
 +#include <linux/gcd.h>
 +#include <linux/freezer.h>
 +#include <linux/sradix-tree.h>
 +
 +#include <asm/tlbflush.h>
 +#include "internal.h"
 +
 +#ifdef CONFIG_X86
 +#undef memcmp
 +
 +#ifdef CONFIG_X86_32
 +#define memcmp memcmpx86_32
 +/*
 + * Compare 4-byte-aligned address s1 and s2, with length n
 + */
 +int memcmpx86_32(void *s1, void *s2, size_t n)
 +{
 +	size_t num = n / 4;
 +	register int res;
 +
 +	__asm__ __volatile__
 +	(
 +	 "testl %3,%3\n\t"
 +	 "repe; cmpsd\n\t"
 +	 "je        1f\n\t"
 +	 "sbbl      %0,%0\n\t"
 +	 "orl       $1,%0\n"
 +	 "1:"
 +	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
 +	 : "0" (0)
 +	 : "cc");
 +
 +	return res;
 +}
 +
 +/*
 + * Check the page is all zero ?
 + */
 +static int is_full_zero(const void *s1, size_t len)
 +{
 +	unsigned char same;
 +
 +	len /= 4;
 +
 +	__asm__ __volatile__
 +	("repe; scasl;"
 +	 "sete %0"
 +	 : "=qm" (same), "+D" (s1), "+c" (len)
 +	 : "a" (0)
 +	 : "cc");
 +
 +	return same;
 +}
 +
 +
 +#elif defined(CONFIG_X86_64)
 +#define memcmp memcmpx86_64
 +/*
 + * Compare 8-byte-aligned address s1 and s2, with length n
 + */
 +int memcmpx86_64(void *s1, void *s2, size_t n)
 +{
 +	size_t num = n / 8;
 +	register int res;
 +
 +	__asm__ __volatile__
 +	(
 +	 "testq %q3,%q3\n\t"
 +	 "repe; cmpsq\n\t"
 +	 "je        1f\n\t"
 +	 "sbbq      %q0,%q0\n\t"
 +	 "orq       $1,%q0\n"
 +	 "1:"
 +	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
 +	 : "0" (0)
 +	 : "cc");
 +
 +	return res;
 +}
 +
 +static int is_full_zero(const void *s1, size_t len)
 +{
 +	unsigned char same;
 +
 +	len /= 8;
 +
 +	__asm__ __volatile__
 +	("repe; scasq;"
 +	 "sete %0"
 +	 : "=qm" (same), "+D" (s1), "+c" (len)
 +	 : "a" (0)
 +	 : "cc");
 +
 +	return same;
 +}
 +
 +#endif
 +#else
 +static int is_full_zero(const void *s1, size_t len)
 +{
 +	unsigned long *src = s1;
 +	int i;
 +
 +	len /= sizeof(*src);
 +
 +	for (i = 0; i < len; i++) {
 +		if (src[i])
 +			return 0;
 +	}
 +
 +	return 1;
 +}
 +#endif
 +
 +#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
 +#define TIME_RATIO_SCALE	10000
 +
 +#define SLOT_TREE_NODE_SHIFT	8
 +#define SLOT_TREE_NODE_STORE_SIZE	(1UL << SLOT_TREE_NODE_SHIFT)
 +struct slot_tree_node {
 +	unsigned long size;
 +	struct sradix_tree_node snode;
 +	void *stores[SLOT_TREE_NODE_STORE_SIZE];
 +};
 +
 +static struct kmem_cache *slot_tree_node_cachep;
 +
 +static struct sradix_tree_node *slot_tree_node_alloc(void)
 +{
 +	struct slot_tree_node *p;
 +	p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
 +	if (!p)
 +		return NULL;
 +
 +	return &p->snode;
 +}
 +
 +static void slot_tree_node_free(struct sradix_tree_node *node)
 +{
 +	struct slot_tree_node *p;
 +
 +	p = container_of(node, struct slot_tree_node, snode);
 +	kmem_cache_free(slot_tree_node_cachep, p);
 +}
 +
 +static void slot_tree_node_extend(struct sradix_tree_node *parent,
 +				  struct sradix_tree_node *child)
 +{
 +	struct slot_tree_node *p, *c;
 +
 +	p = container_of(parent, struct slot_tree_node, snode);
 +	c = container_of(child, struct slot_tree_node, snode);
 +
 +	p->size += c->size;
 +}
 +
 +void slot_tree_node_assign(struct sradix_tree_node *node,
 +			   unsigned index, void *item)
 +{
 +	struct vma_slot *slot = item;
 +	struct slot_tree_node *cur;
 +
 +	slot->snode = node;
 +	slot->sindex = index;
 +
 +	while (node) {
 +		cur = container_of(node, struct slot_tree_node, snode);
 +		cur->size += slot->pages;
 +		node = node->parent;
 +	}
 +}
 +
 +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
 +{
 +	struct vma_slot *slot;
 +	struct slot_tree_node *cur;
 +	unsigned long pages;
 +
 +	if (node->height == 1) {
 +		slot = node->stores[offset];
 +		pages = slot->pages;
 +	} else {
 +		cur = container_of(node->stores[offset],
 +				   struct slot_tree_node, snode);
 +		pages = cur->size;
 +	}
 +
 +	while (node) {
 +		cur = container_of(node, struct slot_tree_node, snode);
 +		cur->size -= pages;
 +		node = node->parent;
 +	}
 +}
 +
 +unsigned long slot_iter_index;
 +int slot_iter(void *item,  unsigned long height)
 +{
 +	struct slot_tree_node *node;
 +	struct vma_slot *slot;
 +
 +	if (height == 1) {
 +		slot = item;
 +		if (slot_iter_index < slot->pages) {
 +			/*in this one*/
 +			return 1;
 +		} else {
 +			slot_iter_index -= slot->pages;
 +			return 0;
 +		}
 +
 +	} else {
 +		node = container_of(item, struct slot_tree_node, snode);
 +		if (slot_iter_index < node->size) {
 +			/*in this one*/
 +			return 1;
 +		} else {
 +			slot_iter_index -= node->size;
 +			return 0;
 +		}
 +	}
 +}
 +
 +
 +static inline void slot_tree_init_root(struct sradix_tree_root *root)
 +{
 +	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
 +	root->alloc = slot_tree_node_alloc;
 +	root->free = slot_tree_node_free;
 +	root->extend = slot_tree_node_extend;
 +	root->assign = slot_tree_node_assign;
 +	root->rm = slot_tree_node_rm;
 +}
 +
 +void slot_tree_init(void)
 +{
 +	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
 +				sizeof(struct slot_tree_node), 0,
 +				SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
 +				NULL);
 +}
 +
 +
 +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
 +struct scan_rung {
 +	//struct list_head scanned_list;
 +	struct sradix_tree_root vma_root;
 +	struct sradix_tree_root vma_root2;
 +
 +	struct vma_slot *current_scan;
 +	unsigned long current_offset;
 +
 +	/*
 +	 * The initial value for current_offset, it should loop over
 +	 * [0~ step - 1] to let all slot have its chance to be scanned.
 +	 */
 +	unsigned long offset_init;
 +	unsigned long step; /* dynamic step for current_offset */
 +	unsigned int flags;
 +	unsigned long pages_to_scan;
 +	//unsigned long fully_scanned_slots;
 +	/*
 +	 * a little bit tricky - if cpu_time_ratio > 0, then the value is the
 +	 * the cpu time ratio it can spend in rung_i for every scan
 +	 * period. if < 0, then it is the cpu time ratio relative to the
 +	 * max cpu percentage user specified. Both in unit of
 +	 * 1/TIME_RATIO_SCALE
 +	 */
 +	int cpu_ratio;
 +
 +	/*
 +	 * How long it will take for all slots in this rung to be fully
 +	 * scanned? If it's zero, we don't care about the cover time:
 +	 * it's fully scanned.
 +	 */
 +	unsigned int cover_msecs;
 +	//unsigned long vma_num;
 +	//unsigned long pages; /* Sum of all slot's pages in rung */
 +};
 +
 +/**
 + * node of either the stable or unstale rbtree
 + *
 + */
 +struct tree_node {
 +	struct rb_node node; /* link in the main (un)stable rbtree */
 +	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
 +	u32 hash;
 +	unsigned long count; /* TODO: merged with sub_root */
 +	struct list_head all_list; /* all tree nodes in stable/unstable tree */
 +};
 +
 +/**
 + * struct stable_node - node of the stable rbtree
 + * @node: rb node of this ksm page in the stable tree
 + * @hlist: hlist head of rmap_items using this ksm page
 + * @kpfn: page frame number of this ksm page
 + */
 +struct stable_node {
 +	struct rb_node node; /* link in sub-rbtree */
 +	struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
 +	struct hlist_head hlist;
 +	unsigned long kpfn;
 +	u32 hash_max; /* if ==0 then it's not been calculated yet */
 +	struct list_head all_list; /* in a list for all stable nodes */
 +};
 +
 +/**
 + * struct node_vma - group rmap_items linked in a same stable
 + * node together.
 + */
 +struct node_vma {
 +	union {
 +		struct vma_slot *slot;
 +		unsigned long key;  /* slot is used as key sorted on hlist */
 +	};
 +	struct hlist_node hlist;
 +	struct hlist_head rmap_hlist;
 +	struct stable_node *head;
 +};
 +
 +/**
 + * struct rmap_item - reverse mapping item for virtual addresses
 + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 + * @mm: the memory structure this rmap_item is pointing into
 + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 + * @node: rb node of this rmap_item in the unstable tree
 + * @head: pointer to stable_node heading this list in the stable tree
 + * @hlist: link into hlist of rmap_items hanging off that stable_node
 + */
 +struct rmap_item {
 +	struct vma_slot *slot;
 +	struct page *page;
 +	unsigned long address;	/* + low bits used for flags below */
 +	unsigned long hash_round;
 +	unsigned long entry_index;
 +	union {
 +		struct {/* when in unstable tree */
 +			struct rb_node node;
 +			struct tree_node *tree_node;
 +			u32 hash_max;
 +		};
 +		struct { /* when in stable tree */
 +			struct node_vma *head;
 +			struct hlist_node hlist;
 +			struct anon_vma *anon_vma;
 +		};
 +	};
 +} __attribute__((aligned(4)));
 +
 +struct rmap_list_entry {
 +	union {
 +		struct rmap_item *item;
 +		unsigned long addr;
 +	};
 +	/* lowest bit is used for is_addr tag */
 +} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/
 +
 +
 +/* Basic data structure definition ends */
 +
 +
 +/*
 + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
 + * The flags use the low bits of rmap_item.address
 + */
 +#define UNSTABLE_FLAG	0x1
 +#define STABLE_FLAG	0x2
 +#define get_rmap_addr(x)	((x)->address & PAGE_MASK)
 +
 +/*
 + * rmap_list_entry helpers
 + */
 +#define IS_ADDR_FLAG	1
 +#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
 +#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
 +#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
 +
 +
 +/*
 + * High speed caches for frequently allocated and freed structs
 + */
 +static struct kmem_cache *rmap_item_cache;
 +static struct kmem_cache *stable_node_cache;
 +static struct kmem_cache *node_vma_cache;
 +static struct kmem_cache *vma_slot_cache;
 +static struct kmem_cache *tree_node_cache;
 +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
 +		sizeof(struct __struct), __alignof__(struct __struct),\
 +		(__flags), NULL)
 +
 +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
 +#define SCAN_LADDER_SIZE 4
 +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
 +
 +/* The evaluation rounds uksmd has finished */
 +static unsigned long long uksm_eval_round = 1;
 +
 +/*
 + * we add 1 to this var when we consider we should rebuild the whole
 + * unstable tree.
 + */
 +static unsigned long uksm_hash_round = 1;
 +
 +/*
 + * How many times the whole memory is scanned.
 + */
 +static unsigned long long fully_scanned_round = 1;
 +
 +/* The total number of virtual pages of all vma slots */
 +static u64 uksm_pages_total;
 +
 +/* The number of pages has been scanned since the start up */
 +static u64 uksm_pages_scanned;
 +
 +static u64 scanned_virtual_pages;
 +
 +/* The number of pages has been scanned since last encode_benefit call */
 +static u64 uksm_pages_scanned_last;
 +
 +/* If the scanned number is tooo large, we encode it here */
 +static u64 pages_scanned_stored;
 +
 +static unsigned long pages_scanned_base;
 +
 +/* The number of nodes in the stable tree */
 +static unsigned long uksm_pages_shared;
 +
 +/* The number of page slots additionally sharing those nodes */
 +static unsigned long uksm_pages_sharing;
 +
 +/* The number of nodes in the unstable tree */
 +static unsigned long uksm_pages_unshared;
 +
 +/*
 + * Milliseconds ksmd should sleep between scans,
 + * >= 100ms to be consistent with
 + * scan_time_to_sleep_msec()
 + */
 +static unsigned int uksm_sleep_jiffies;
 +
 +/* The real value for the uksmd next sleep */
 +static unsigned int uksm_sleep_real;
 +
 +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
 +static unsigned int uksm_sleep_saved;
 +
 +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
 +static unsigned int uksm_max_cpu_percentage;
 +
 +static int uksm_cpu_governor;
 +
 +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
 +
 +struct uksm_cpu_preset_s {
 +	int cpu_ratio[SCAN_LADDER_SIZE];
 +	unsigned int cover_msecs[SCAN_LADDER_SIZE];
 +	unsigned int max_cpu; /* percentage */
 +};
 +
 +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
 +	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
 +	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
 +	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
 +	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
 +};
 +
 +/* The default value for uksm_ema_page_time if it's not initialized */
 +#define UKSM_PAGE_TIME_DEFAULT	500
 +
 +/*cost to scan one page by expotional moving average in nsecs */
 +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
 +
 +/* The expotional moving average alpha weight, in percentage. */
 +#define EMA_ALPHA	20
 +
 +/*
 + * The threshold used to filter out thrashing areas,
 + * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
 + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
 + * will be considered as having a zero duplication ratio.
 + */
 +static unsigned int uksm_thrash_threshold = 50;
 +
 +/* How much dedup ratio is considered to be abundant*/
 +static unsigned int uksm_abundant_threshold = 10;
 +
 +/* All slots having merged pages in this eval round. */
 +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
 +
 +/* How many times the ksmd has slept since startup */
 +static unsigned long long uksm_sleep_times;
 +
 +#define UKSM_RUN_STOP	0
 +#define UKSM_RUN_MERGE	1
 +static unsigned int uksm_run = 1;
 +
 +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
 +static DEFINE_MUTEX(uksm_thread_mutex);
 +
 +/*
 + * List vma_slot_new is for newly created vma_slot waiting to be added by
 + * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to
 + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
 + * VMA has been removed/freed.
 + */
 +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
 +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
 +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
 +static DEFINE_SPINLOCK(vma_slot_list_lock);
 +
 +/* The unstable tree heads */
 +static struct rb_root root_unstable_tree = RB_ROOT;
 +
 +/*
 + * All tree_nodes are in a list to be freed at once when unstable tree is
 + * freed after each scan round.
 + */
 +static struct list_head unstable_tree_node_list =
 +				LIST_HEAD_INIT(unstable_tree_node_list);
 +
 +/* List contains all stable nodes */
 +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
 +
 +/*
 + * When the hash strength is changed, the stable tree must be delta_hashed and
 + * re-structured. We use two set of below structs to speed up the
 + * re-structuring of stable tree.
 + */
 +static struct list_head
 +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
 +			    LIST_HEAD_INIT(stable_tree_node_list[1])};
 +
 +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
 +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
 +static struct rb_root *root_stable_treep = &root_stable_tree[0];
 +static unsigned long stable_tree_index;
 +
 +/* The hash strength needed to hash a full page */
 +#define HASH_STRENGTH_FULL		(PAGE_SIZE / sizeof(u32))
 +
 +/* The hash strength needed for loop-back hashing */
 +#define HASH_STRENGTH_MAX		(HASH_STRENGTH_FULL + 10)
 +
 +/* The random offsets in a page */
 +static u32 *random_nums;
 +
 +/* The hash strength */
 +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
 +
 +/* The delta value each time the hash strength increases or decreases */
 +static unsigned long hash_strength_delta;
 +#define HASH_STRENGTH_DELTA_MAX	5
 +
 +/* The time we have saved due to random_sample_hash */
 +static u64 rshash_pos;
 +
 +/* The time we have wasted due to hash collision */
 +static u64 rshash_neg;
 +
 +struct uksm_benefit {
 +	u64 pos;
 +	u64 neg;
 +	u64 scanned;
 +	unsigned long base;
 +} benefit;
 +
 +/*
 + * The relative cost of memcmp, compared to 1 time unit of random sample
 + * hash, this value is tested when ksm module is initialized
 + */
 +static unsigned long memcmp_cost;
 +
 +static unsigned long  rshash_neg_cont_zero;
 +static unsigned long  rshash_cont_obscure;
 +
 +/* The possible states of hash strength adjustment heuristic */
 +enum rshash_states {
 +		RSHASH_STILL,
 +		RSHASH_TRYUP,
 +		RSHASH_TRYDOWN,
 +		RSHASH_NEW,
 +		RSHASH_PRE_STILL,
 +};
 +
 +/* The possible direction we are about to adjust hash strength */
 +enum rshash_direct {
 +	GO_UP,
 +	GO_DOWN,
 +	OBSCURE,
 +	STILL,
 +};
 +
 +/* random sampling hash state machine */
 +static struct {
 +	enum rshash_states state;
 +	enum rshash_direct pre_direct;
 +	u8 below_count;
 +	/* Keep a lookup window of size 5, iff above_count/below_count > 3
 +	 * in this window we stop trying.
 +	 */
 +	u8 lookup_window_index;
 +	u64 stable_benefit;
 +	unsigned long turn_point_down;
 +	unsigned long turn_benefit_down;
 +	unsigned long turn_point_up;
 +	unsigned long turn_benefit_up;
 +	unsigned long stable_point;
 +} rshash_state;
 +
 +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
 +static u32 *zero_hash_table;
 +
 +static inline struct node_vma *alloc_node_vma(void)
 +{
 +	struct node_vma *node_vma;
 +	node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
 +	if (node_vma) {
 +		INIT_HLIST_HEAD(&node_vma->rmap_hlist);
 +		INIT_HLIST_NODE(&node_vma->hlist);
 +	}
 +	return node_vma;
 +}
 +
 +static inline void free_node_vma(struct node_vma *node_vma)
 +{
 +	kmem_cache_free(node_vma_cache, node_vma);
 +}
 +
 +
 +static inline struct vma_slot *alloc_vma_slot(void)
 +{
 +	struct vma_slot *slot;
 +
 +	/*
 +	 * In case ksm is not initialized by now.
 +	 * Oops, we need to consider the call site of uksm_init() in the future.
 +	 */
 +	if (!vma_slot_cache)
 +		return NULL;
 +
 +	slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
 +	if (slot) {
 +		INIT_LIST_HEAD(&slot->slot_list);
 +		INIT_LIST_HEAD(&slot->dedup_list);
 +		slot->flags |= UKSM_SLOT_NEED_RERAND;
 +	}
 +	return slot;
 +}
 +
 +static inline void free_vma_slot(struct vma_slot *vma_slot)
 +{
 +	kmem_cache_free(vma_slot_cache, vma_slot);
 +}
 +
 +
 +
 +static inline struct rmap_item *alloc_rmap_item(void)
 +{
 +	struct rmap_item *rmap_item;
 +
 +	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
 +	if (rmap_item) {
 +		/* bug on lowest bit is not clear for flag use */
 +		BUG_ON(is_addr(rmap_item));
 +	}
 +	return rmap_item;
 +}
 +
 +static inline void free_rmap_item(struct rmap_item *rmap_item)
 +{
 +	rmap_item->slot = NULL;	/* debug safety */
 +	kmem_cache_free(rmap_item_cache, rmap_item);
 +}
 +
 +static inline struct stable_node *alloc_stable_node(void)
 +{
 +	struct stable_node *node;
 +	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
 +	if (!node)
 +		return NULL;
 +
 +	INIT_HLIST_HEAD(&node->hlist);
 +	list_add(&node->all_list, &stable_node_list);
 +	return node;
 +}
 +
 +static inline void free_stable_node(struct stable_node *stable_node)
 +{
 +	list_del(&stable_node->all_list);
 +	kmem_cache_free(stable_node_cache, stable_node);
 +}
 +
 +static inline struct tree_node *alloc_tree_node(struct list_head *list)
 +{
 +	struct tree_node *node;
 +	node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
 +	if (!node)
 +		return NULL;
 +
 +	list_add(&node->all_list, list);
 +	return node;
 +}
 +
 +static inline void free_tree_node(struct tree_node *node)
 +{
 +	list_del(&node->all_list);
 +	kmem_cache_free(tree_node_cache, node);
 +}
 +
 +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
 +{
 +	struct anon_vma *anon_vma = rmap_item->anon_vma;
 +
 +	put_anon_vma(anon_vma);
 +}
 +
 +
 +/**
 + * Remove a stable node from stable_tree, may unlink from its tree_node and
 + * may remove its parent tree_node if no other stable node is pending.
 + *
 + * @stable_node 	The node need to be removed
 + * @unlink_rb 		Will this node be unlinked from the rbtree?
 + * @remove_tree_	node Will its tree_node be removed if empty?
 + */
 +static void remove_node_from_stable_tree(struct stable_node *stable_node,
 +					 int unlink_rb,  int remove_tree_node)
 +{
 +	struct node_vma *node_vma;
 +	struct rmap_item *rmap_item;
 +	struct hlist_node *n;
 +
 +	if (!hlist_empty(&stable_node->hlist)) {
 +		hlist_for_each_entry_safe(node_vma, n,
 +					  &stable_node->hlist, hlist) {
 +			hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
 +				uksm_pages_sharing--;
 +
 +				uksm_drop_anon_vma(rmap_item);
 +				rmap_item->address &= PAGE_MASK;
 +			}
 +			free_node_vma(node_vma);
 +			cond_resched();
 +		}
 +
 +		/* the last one is counted as shared */
 +		uksm_pages_shared--;
 +		uksm_pages_sharing++;
 +	}
 +
 +	if (stable_node->tree_node && unlink_rb) {
 +		rb_erase(&stable_node->node,
 +			 &stable_node->tree_node->sub_root);
 +
 +		if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
 +		    remove_tree_node) {
 +			rb_erase(&stable_node->tree_node->node,
 +				 root_stable_treep);
 +			free_tree_node(stable_node->tree_node);
 +		} else {
 +			stable_node->tree_node->count--;
 +		}
 +	}
 +
 +	free_stable_node(stable_node);
 +}
 +
 +
 +/*
 + * get_uksm_page: checks if the page indicated by the stable node
 + * is still its ksm page, despite having held no reference to it.
 + * In which case we can trust the content of the page, and it
 + * returns the gotten page; but if the page has now been zapped,
 + * remove the stale node from the stable tree and return NULL.
 + *
 + * You would expect the stable_node to hold a reference to the ksm page.
 + * But if it increments the page's count, swapping out has to wait for
 + * ksmd to come around again before it can free the page, which may take
 + * seconds or even minutes: much too unresponsive.  So instead we use a
 + * "keyhole reference": access to the ksm page from the stable node peeps
 + * out through its keyhole to see if that page still holds the right key,
 + * pointing back to this stable node.  This relies on freeing a PageAnon
 + * page to reset its page->mapping to NULL, and relies on no other use of
 + * a page to put something that might look like our key in page->mapping.
 + *
 + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
 + * but this is different - made simpler by uksm_thread_mutex being held, but
 + * interesting for assuming that no other use of the struct page could ever
 + * put our expected_mapping into page->mapping (or a field of the union which
 + * coincides with page->mapping).  The RCU calls are not for KSM at all, but
 + * to keep the page_count protocol described with page_cache_get_speculative.
 + *
 + * Note: it is possible that get_uksm_page() will return NULL one moment,
 + * then page the next, if the page is in between page_freeze_refs() and
 + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
 + * is on its way to being freed; but it is an anomaly to bear in mind.
 + *
 + * @unlink_rb: 		if the removal of this node will firstly unlink from
 + * its rbtree. stable_node_reinsert will prevent this when restructuring the
 + * node from its old tree.
 + *
 + * @remove_tree_node:	if this is the last one of its tree_node, will the
 + * tree_node be freed ? If we are inserting stable node, this tree_node may
 + * be reused, so don't free it.
 + */
 +static struct page *get_uksm_page(struct stable_node *stable_node,
 +				 int unlink_rb, int remove_tree_node)
 +{
 +	struct page *page;
 +	void *expected_mapping;
 +
 +	page = pfn_to_page(stable_node->kpfn);
 +	expected_mapping = (void *)stable_node +
 +				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 +	rcu_read_lock();
 +	if (page->mapping != expected_mapping)
 +		goto stale;
 +	if (!get_page_unless_zero(page))
 +		goto stale;
 +	if (page->mapping != expected_mapping) {
 +		put_page(page);
 +		goto stale;
 +	}
 +	rcu_read_unlock();
 +	return page;
 +stale:
 +	rcu_read_unlock();
 +	remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
 +
 +	return NULL;
 +}
 +
 +/*
 + * Removing rmap_item from stable or unstable tree.
 + * This function will clean the information from the stable/unstable tree.
 + */
 +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 +{
 +	if (rmap_item->address & STABLE_FLAG) {
 +		struct stable_node *stable_node;
 +		struct node_vma *node_vma;
 +		struct page *page;
 +
 +		node_vma = rmap_item->head;
 +		stable_node = node_vma->head;
 +		page = get_uksm_page(stable_node, 1, 1);
 +		if (!page)
 +			goto out;
 +
 +		/*
 +		 * page lock is needed because it's racing with
 +		 * try_to_unmap_ksm(), etc.
 +		 */
 +		lock_page(page);
 +		hlist_del(&rmap_item->hlist);
 +
 +		if (hlist_empty(&node_vma->rmap_hlist)) {
 +			hlist_del(&node_vma->hlist);
 +			free_node_vma(node_vma);
 +		}
 +		unlock_page(page);
 +
 +		put_page(page);
 +		if (hlist_empty(&stable_node->hlist)) {
 +			/* do NOT call remove_node_from_stable_tree() here,
 +			 * it's possible for a forked rmap_item not in
 +			 * stable tree while the in-tree rmap_items were
 +			 * deleted.
 +			 */
 +			uksm_pages_shared--;
 +		} else
 +			uksm_pages_sharing--;
 +
 +
 +		uksm_drop_anon_vma(rmap_item);
 +	} else if (rmap_item->address & UNSTABLE_FLAG) {
 +		if (rmap_item->hash_round == uksm_hash_round) {
 +
 +			rb_erase(&rmap_item->node,
 +				 &rmap_item->tree_node->sub_root);
 +			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
 +				rb_erase(&rmap_item->tree_node->node,
 +					 &root_unstable_tree);
 +
 +				free_tree_node(rmap_item->tree_node);
 +			} else
 +				rmap_item->tree_node->count--;
 +		}
 +		uksm_pages_unshared--;
 +	}
 +
 +	rmap_item->address &= PAGE_MASK;
 +	rmap_item->hash_max = 0;
 +
 +out:
 +	cond_resched();		/* we're called from many long loops */
 +}
 +
 +static inline int slot_in_uksm(struct vma_slot *slot)
 +{
 +	return list_empty(&slot->slot_list);
 +}
 +
 +/*
 + * Test if the mm is exiting
 + */
 +static inline bool uksm_test_exit(struct mm_struct *mm)
 +{
 +	return atomic_read(&mm->mm_users) == 0;
 +}
 +
 +static inline unsigned long vma_pool_size(struct vma_slot *slot)
 +{
 +	return round_up(sizeof(struct rmap_list_entry) * slot->pages,
 +			PAGE_SIZE) >> PAGE_SHIFT;
 +}
 +
 +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
 +
 +/* must be done with sem locked */
 +static int slot_pool_alloc(struct vma_slot *slot)
 +{
 +	unsigned long pool_size;
 +
 +	if (slot->rmap_list_pool)
 +		return 0;
 +
 +	pool_size = vma_pool_size(slot);
 +	slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
 +				       pool_size, GFP_KERNEL);
 +	if (!slot->rmap_list_pool)
 +		return -ENOMEM;
 +
 +	slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
 +				    GFP_KERNEL);
 +	if (!slot->pool_counts) {
 +		kfree(slot->rmap_list_pool);
 +		return -ENOMEM;
 +	}
 +
 +	slot->pool_size = pool_size;
 +	BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
 +	slot->flags |= UKSM_SLOT_IN_UKSM;
 +	uksm_pages_total += slot->pages;
 +
 +	return 0;
 +}
 +
 +/*
 + * Called after vma is unlinked from its mm
 + */
 +void uksm_remove_vma(struct vm_area_struct *vma)
 +{
 +	struct vma_slot *slot;
 +
 +	if (!vma->uksm_vma_slot)
 +		return;
 +
 +	spin_lock(&vma_slot_list_lock);
 +	slot = vma->uksm_vma_slot;
 +	if (!slot)
 +		goto out;
 +
 +	if (slot_in_uksm(slot)) {
 +		/**
 +		 * This slot has been added by ksmd, so move to the del list
 +		 * waiting ksmd to free it.
 +		 */
 +		list_add_tail(&slot->slot_list, &vma_slot_del);
 +	} else {
 +		/**
 +		 * It's still on new list. It's ok to free slot directly.
 +		 */
 +		list_del(&slot->slot_list);
 +		free_vma_slot(slot);
 +	}
 +out:
 +	vma->uksm_vma_slot = NULL;
 +	spin_unlock(&vma_slot_list_lock);
 +}
 +
 +/**
 + * Need to do two things:
 + * 1. check if slot was moved to del list
 + * 2. make sure the mmap_sem is manipulated under valid vma.
 + *
 + * My concern here is that in some cases, this may make
 + * vma_slot_list_lock() waiters to serialized further by some
 + * sem->wait_lock, can this really be expensive?
 + *
 + *
 + * @return
 + * 0: if successfully locked mmap_sem
 + * -ENOENT: this slot was moved to del list
 + * -EBUSY: vma lock failed
 + */
 +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
 +{
 +	struct vm_area_struct *vma;
 +	struct mm_struct *mm;
 +	struct rw_semaphore *sem;
 +
 +	spin_lock(&vma_slot_list_lock);
 +
 +	/* the slot_list was removed and inited from new list, when it enters
 +	 * uksm_list. If now it's not empty, then it must be moved to del list
 +	 */
 +	if (!slot_in_uksm(slot)) {
 +		spin_unlock(&vma_slot_list_lock);
 +		return -ENOENT;
 +	}
 +
 +	BUG_ON(slot->pages != vma_pages(slot->vma));
 +	/* Ok, vma still valid */
 +	vma = slot->vma;
 +	mm = vma->vm_mm;
 +	sem = &mm->mmap_sem;
 +
 +	if (uksm_test_exit(mm)) {
 +		spin_unlock(&vma_slot_list_lock);
 +		return -ENOENT;
 +	}
 +
 +	if (down_read_trylock(sem)) {
 +		spin_unlock(&vma_slot_list_lock);
 +		if (slot_pool_alloc(slot)) {
 +			uksm_remove_vma(vma);
 +			up_read(sem);
 +			return -ENOENT;
 +		}
 +		return 0;
 +	}
 +
 +	spin_unlock(&vma_slot_list_lock);
 +	return -EBUSY;
 +}
 +
 +static inline unsigned long
 +vma_page_address(struct page *page, struct vm_area_struct *vma)
 +{
 +	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 +	unsigned long address;
 +
 +	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 +	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 +		/* page should be within @vma mapping range */
 +		return -EFAULT;
 +	}
 +	return address;
 +}
 +
 +
 +/* return 0 on success with the item's mmap_sem locked */
 +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
 +{
 +	struct mm_struct *mm;
 +	struct vma_slot *slot = item->slot;
 +	int err = -EINVAL;
 +
 +	struct page *page;
 +
 +	/*
 +	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
 +	 * has been removed by uksm_remove_vma().
 +	 */
 +	if (try_down_read_slot_mmap_sem(slot))
 +		return -EBUSY;
 +
 +	mm = slot->vma->vm_mm;
 +
 +	if (uksm_test_exit(mm))
 +		goto failout_up;
 +
 +	page = item->page;
 +	rcu_read_lock();
 +	if (!get_page_unless_zero(page)) {
 +		rcu_read_unlock();
 +		goto failout_up;
 +	}
 +
 +	/* No need to consider huge page here. */
 +	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
 +	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
 +		/*
 +		 * TODO:
 +		 * should we release this item becase of its stale page
 +		 * mapping?
 +		 */
 +		put_page(page);
 +		rcu_read_unlock();
 +		goto failout_up;
 +	}
 +	rcu_read_unlock();
 +	return 0;
 +
 +failout_up:
 +	up_read(&mm->mmap_sem);
 +	return err;
 +}
 +
 +/*
 + * What kind of VMA is considered ?
 + */
 +static inline int vma_can_enter(struct vm_area_struct *vma)
 +{
 +	return uksm_flags_can_scan(vma->vm_flags);
 +}
 +
 +/*
 + * Called whenever a fresh new vma is created A new vma_slot.
 + * is created and inserted into a global list Must be called.
 + * after vma is inserted to its mm      		    .
 + */
 +void uksm_vma_add_new(struct vm_area_struct *vma)
 +{
 +	struct vma_slot *slot;
 +
 +	if (!vma_can_enter(vma)) {
 +		vma->uksm_vma_slot = NULL;
 +		return;
 +	}
 +
 +	slot = alloc_vma_slot();
 +	if (!slot) {
 +		vma->uksm_vma_slot = NULL;
 +		return;
 +	}
 +
 +	vma->uksm_vma_slot = slot;
 +	vma->vm_flags |= VM_MERGEABLE;
 +	slot->vma = vma;
 +	slot->mm = vma->vm_mm;
 +	slot->ctime_j = jiffies;
 +	slot->pages = vma_pages(vma);
 +	spin_lock(&vma_slot_list_lock);
 +	list_add_tail(&slot->slot_list, &vma_slot_new);
 +	spin_unlock(&vma_slot_list_lock);
 +}
 +
 +/*   32/3 < they < 32/2 */
 +#define shiftl	8
 +#define shiftr	12
 +
 +#define HASH_FROM_TO(from, to) 				\
 +for (index = from; index < to; index++) {		\
 +	pos = random_nums[index];			\
 +	hash += key[pos];				\
 +	hash += (hash << shiftl);			\
 +	hash ^= (hash >> shiftr);			\
 +}
 +
 +
 +#define HASH_FROM_DOWN_TO(from, to) 			\
 +for (index = from - 1; index >= to; index--) {		\
 +	hash ^= (hash >> shiftr);			\
 +	hash ^= (hash >> (shiftr*2));			\
 +	hash -= (hash << shiftl);			\
 +	hash += (hash << (shiftl*2));			\
 +	pos = random_nums[index];			\
 +	hash -= key[pos];				\
 +}
 +
 +/*
 + * The main random sample hash function.
 + */
 +static u32 random_sample_hash(void *addr, u32 hash_strength)
 +{
 +	u32 hash = 0xdeadbeef;
 +	int index, pos, loop = hash_strength;
 +	u32 *key = (u32 *)addr;
 +
 +	if (loop > HASH_STRENGTH_FULL)
 +		loop = HASH_STRENGTH_FULL;
 +
 +	HASH_FROM_TO(0, loop);
 +
 +	if (hash_strength > HASH_STRENGTH_FULL) {
 +		loop = hash_strength - HASH_STRENGTH_FULL;
 +		HASH_FROM_TO(0, loop);
 +	}
 +
 +	return hash;
 +}
 +
 +
 +/**
 + * It's used when hash strength is adjusted
 + *
 + * @addr The page's virtual address
 + * @from The original hash strength
 + * @to   The hash strength changed to
 + * @hash The hash value generated with "from" hash value
 + *
 + * return the hash value
 + */
 +static u32 delta_hash(void *addr, int from, int to, u32 hash)
 +{
 +	u32 *key = (u32 *)addr;
 +	int index, pos; /* make sure they are int type */
 +
 +	if (to > from) {
 +		if (from >= HASH_STRENGTH_FULL) {
 +			from -= HASH_STRENGTH_FULL;
 +			to -= HASH_STRENGTH_FULL;
 +			HASH_FROM_TO(from, to);
 +		} else if (to <= HASH_STRENGTH_FULL) {
 +			HASH_FROM_TO(from, to);
 +		} else {
 +			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
 +			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
 +		}
 +	} else {
 +		if (from <= HASH_STRENGTH_FULL) {
 +			HASH_FROM_DOWN_TO(from, to);
 +		} else if (to >= HASH_STRENGTH_FULL) {
 +			from -= HASH_STRENGTH_FULL;
 +			to -= HASH_STRENGTH_FULL;
 +			HASH_FROM_DOWN_TO(from, to);
 +		} else {
 +			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
 +			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
 +		}
 +	}
 +
 +	return hash;
 +}
 +
 +/**
 + *
 + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
 + * has finished.
 + *
 + * return 0 if no page has been scanned since last call, 1 otherwise.
 + */
 +static inline int encode_benefit(void)
 +{
 +	u64 scanned_delta, pos_delta, neg_delta;
 +	unsigned long base = benefit.base;
 +
 +	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
 +
 +	if (!scanned_delta)
 +		return 0;
 +
 +	scanned_delta >>= base;
 +	pos_delta = rshash_pos >> base;
 +	neg_delta = rshash_neg >> base;
 +
 +	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
 +	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
 +	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
 +		benefit.scanned >>= 1;
 +		benefit.neg >>= 1;
 +		benefit.pos >>= 1;
 +		benefit.base++;
 +		scanned_delta >>= 1;
 +		pos_delta >>= 1;
 +		neg_delta >>= 1;
 +	}
 +
 +	benefit.pos += pos_delta;
 +	benefit.neg += neg_delta;
 +	benefit.scanned += scanned_delta;
 +
 +	BUG_ON(!benefit.scanned);
 +
 +	rshash_pos = rshash_neg = 0;
 +	uksm_pages_scanned_last = uksm_pages_scanned;
 +
 +	return 1;
 +}
 +
 +static inline void reset_benefit(void)
 +{
 +	benefit.pos = 0;
 +	benefit.neg = 0;
 +	benefit.base = 0;
 +	benefit.scanned = 0;
 +}
 +
 +static inline void inc_rshash_pos(unsigned long delta)
 +{
 +	if (CAN_OVERFLOW_U64(rshash_pos, delta))
 +		encode_benefit();
 +
 +	rshash_pos += delta;
 +}
 +
 +static inline void inc_rshash_neg(unsigned long delta)
 +{
 +	if (CAN_OVERFLOW_U64(rshash_neg, delta))
 +		encode_benefit();
 +
 +	rshash_neg += delta;
 +}
 +
 +
 +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
 +			    int cost_accounting)
 +{
 +	u32 val;
 +	unsigned long delta;
 +
 +	void *addr = kmap_atomic(page);
 +
 +	val = random_sample_hash(addr, hash_strength);
 +	kunmap_atomic(addr);
 +
 +	if (cost_accounting) {
 +		if (HASH_STRENGTH_FULL > hash_strength)
 +			delta = HASH_STRENGTH_FULL - hash_strength;
 +		else
 +			delta = 0;
 +
 +		inc_rshash_pos(delta);
 +	}
 +
 +	return val;
 +}
 +
 +static int memcmp_pages(struct page *page1, struct page *page2,
 +			int cost_accounting)
 +{
 +	char *addr1, *addr2;
 +	int ret;
 +
 +	addr1 = kmap_atomic(page1);
 +	addr2 = kmap_atomic(page2);
 +	ret = memcmp(addr1, addr2, PAGE_SIZE);
 +	kunmap_atomic(addr2);
 +	kunmap_atomic(addr1);
 +
 +	if (cost_accounting)
 +		inc_rshash_neg(memcmp_cost);
 +
 +	return ret;
 +}
 +
 +static inline int pages_identical(struct page *page1, struct page *page2)
 +{
 +	return !memcmp_pages(page1, page2, 0);
 +}
 +
 +static inline int is_page_full_zero(struct page *page)
 +{
 +	char *addr;
 +	int ret;
 +
 +	addr = kmap_atomic(page);
 +	ret = is_full_zero(addr, PAGE_SIZE);
 +	kunmap_atomic(addr);
 +
 +	return ret;
 +}
 +
 +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 +			      pte_t *orig_pte, pte_t *old_pte)
 +{
 +	struct mm_struct *mm = vma->vm_mm;
 +	unsigned long addr;
 +	pte_t *ptep;
 +	spinlock_t *ptl;
 +	int swapped;
 +	int err = -EFAULT;
 +	unsigned long mmun_start;	/* For mmu_notifiers */
 +	unsigned long mmun_end;		/* For mmu_notifiers */
 +
 +	addr = page_address_in_vma(page, vma);
 +	if (addr == -EFAULT)
 +		goto out;
 +
 +	BUG_ON(PageTransCompound(page));
 +
 +	mmun_start = addr;
 +	mmun_end   = addr + PAGE_SIZE;
 +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 +
 +	ptep = page_check_address(page, mm, addr, &ptl, 0);
 +	if (!ptep)
 +		goto out_mn;
 +
 +	if (old_pte)
 +		*old_pte = *ptep;
 +
 +	if (pte_write(*ptep) || pte_dirty(*ptep)) {
 +		pte_t entry;
 +
 +		swapped = PageSwapCache(page);
 +		flush_cache_page(vma, addr, page_to_pfn(page));
 +		/*
 +		 * Ok this is tricky, when get_user_pages_fast() run it doesnt
 +		 * take any lock, therefore the check that we are going to make
 +		 * with the pagecount against the mapcount is racey and
 +		 * O_DIRECT can happen right after the check.
 +		 * So we clear the pte and flush the tlb before the check
 +		 * this assure us that no O_DIRECT can happen after the check
 +		 * or in the middle of the check.
 +		 */
 +		entry = ptep_clear_flush_notify(vma, addr, ptep);
 +		/*
 +		 * Check that no O_DIRECT or similar I/O is in progress on the
 +		 * page
 +		 */
 +		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
 +			set_pte_at(mm, addr, ptep, entry);
 +			goto out_unlock;
 +		}
 +		if (pte_dirty(entry))
 +			set_page_dirty(page);
 +		entry = pte_mkclean(pte_wrprotect(entry));
 +		set_pte_at_notify(mm, addr, ptep, entry);
 +	}
 +	*orig_pte = *ptep;
 +	err = 0;
 +
 +out_unlock:
 +	pte_unmap_unlock(ptep, ptl);
 +out_mn:
 +	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 +out:
 +	return err;
 +}
 +
 +#define MERGE_ERR_PGERR		1 /* the page is invalid cannot continue */
 +#define MERGE_ERR_COLLI		2 /* there is a collision */
 +#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
 +#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */
 +
 +
 +/**
 + * replace_page - replace page in vma by new ksm page
 + * @vma:      vma that holds the pte pointing to page
 + * @page:     the page we are replacing by kpage
 + * @kpage:    the ksm page we replace page by
 + * @orig_pte: the original value of the pte
 + *
 + * Returns 0 on success, MERGE_ERR_PGERR on failure.
 + */
 +static int replace_page(struct vm_area_struct *vma, struct page *page,
 +			struct page *kpage, pte_t orig_pte)
 +{
 +	struct mm_struct *mm = vma->vm_mm;
 +	pgd_t *pgd;
 +	pud_t *pud;
 +	pmd_t *pmd;
 +	pte_t *ptep;
 +	spinlock_t *ptl;
 +	pte_t entry;
 +
 +	unsigned long addr;
 +	int err = MERGE_ERR_PGERR;
 +	unsigned long mmun_start;	/* For mmu_notifiers */
 +	unsigned long mmun_end;		/* For mmu_notifiers */
 +
 +	addr = page_address_in_vma(page, vma);
 +	if (addr == -EFAULT)
 +		goto out;
 +
 +	pgd = pgd_offset(mm, addr);
 +	if (!pgd_present(*pgd))
 +		goto out;
 +
 +	pud = pud_offset(pgd, addr);
 +	if (!pud_present(*pud))
 +		goto out;
 +
 +	pmd = pmd_offset(pud, addr);
 +	BUG_ON(pmd_trans_huge(*pmd));
 +	if (!pmd_present(*pmd))
 +		goto out;
 +
 +	mmun_start = addr;
 +	mmun_end   = addr + PAGE_SIZE;
 +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 +
 +	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 +	if (!pte_same(*ptep, orig_pte)) {
 +		pte_unmap_unlock(ptep, ptl);
 +		goto out_mn;
 +	}
 +
 +	flush_cache_page(vma, addr, pte_pfn(*ptep));
 +	ptep_clear_flush_notify(vma, addr, ptep);
 +	entry = mk_pte(kpage, vma->vm_page_prot);
 +
 +	/* special treatment is needed for zero_page */
 +	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
 +				(page_to_pfn(kpage) == zero_pfn))
 +		entry = pte_mkspecial(entry);
 +	else {
 +		get_page(kpage);
 +		page_add_anon_rmap(kpage, vma, addr, false);
 +	}
 +
 +	set_pte_at_notify(mm, addr, ptep, entry);
 +
 +	page_remove_rmap(page, false);
 +	if (!page_mapped(page))
 +		try_to_free_swap(page);
 +	put_page(page);
 +
 +	pte_unmap_unlock(ptep, ptl);
 +	err = 0;
 +out_mn:
 +	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 +out:
 +	return err;
 +}
 +
 +
 +/**
 + *  Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The
 + *  zero hash value at HASH_STRENGTH_MAX is used to indicated that its
 + *  hash_max member has not been calculated.
 + *
 + * @page The page needs to be hashed
 + * @hash_old The hash value calculated with current hash strength
 + *
 + * return the new hash value calculated at HASH_STRENGTH_MAX
 + */
 +static inline u32 page_hash_max(struct page *page, u32 hash_old)
 +{
 +	u32 hash_max = 0;
 +	void *addr;
 +
 +	addr = kmap_atomic(page);
 +	hash_max = delta_hash(addr, hash_strength,
 +			      HASH_STRENGTH_MAX, hash_old);
 +
 +	kunmap_atomic(addr);
 +
 +	if (!hash_max)
 +		hash_max = 1;
 +
 +	inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
 +	return hash_max;
 +}
 +
 +/*
 + * We compare the hash again, to ensure that it is really a hash collision
 + * instead of being caused by page write.
 + */
 +static inline int check_collision(struct rmap_item *rmap_item,
 +				  u32 hash)
 +{
 +	int err;
 +	struct page *page = rmap_item->page;
 +
 +	/* if this rmap_item has already been hash_maxed, then the collision
 +	 * must appears in the second-level rbtree search. In this case we check
 +	 * if its hash_max value has been changed. Otherwise, the collision
 +	 * happens in the first-level rbtree search, so we check against it's
 +	 * current hash value.
 +	 */
 +	if (rmap_item->hash_max) {
 +		inc_rshash_neg(memcmp_cost);
 +		inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
 +
 +		if (rmap_item->hash_max == page_hash_max(page, hash))
 +			err = MERGE_ERR_COLLI;
 +		else
 +			err = MERGE_ERR_CHANGED;
 +	} else {
 +		inc_rshash_neg(memcmp_cost + hash_strength);
 +
 +		if (page_hash(page, hash_strength, 0) == hash)
 +			err = MERGE_ERR_COLLI;
 +		else
 +			err = MERGE_ERR_CHANGED;
 +	}
 +
 +	return err;
 +}
 +
 +static struct page *page_trans_compound_anon(struct page *page)
 +{
 +	if (PageTransCompound(page)) {
 +		struct page *head = compound_head(page);
 +		/*
 +		 * head may actually be splitted and freed from under
 +		 * us but it's ok here.
 +		 */
 +		if (PageAnon(head))
 +			return head;
 +	}
 +	return NULL;
 +}
 +
 +static int page_trans_compound_anon_split(struct page *page)
 +{
 +	int ret = 0;
 +	struct page *transhuge_head = page_trans_compound_anon(page);
 +	if (transhuge_head) {
 +		/* Get the reference on the head to split it. */
 +		if (get_page_unless_zero(transhuge_head)) {
 +			/*
 +			 * Recheck we got the reference while the head
 +			 * was still anonymous.
 +			 */
 +			if (PageAnon(transhuge_head))
 +				ret = split_huge_page(transhuge_head);
 +			else
 +				/*
 +				 * Retry later if split_huge_page run
 +				 * from under us.
 +				 */
 +				ret = 1;
 +			put_page(transhuge_head);
 +		} else
 +			/* Retry later if split_huge_page run from under us. */
 +			ret = 1;
 +	}
 +	return ret;
 +}
 +
 +/**
 + * Try to merge a rmap_item.page with a kpage in stable node. kpage must
 + * already be a ksm page.
 + *
 + * @return 0 if the pages were merged, -EFAULT otherwise.
 + */
 +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
 +				      struct page *kpage, u32 hash)
 +{
 +	struct vm_area_struct *vma = rmap_item->slot->vma;
 +	struct mm_struct *mm = vma->vm_mm;
 +	pte_t orig_pte = __pte(0);
 +	int err = MERGE_ERR_PGERR;
 +	struct page *page;
 +
 +	if (uksm_test_exit(mm))
 +		goto out;
 +
 +	page = rmap_item->page;
 +
 +	if (page == kpage) { /* ksm page forked */
 +		err = 0;
 +		goto out;
 +	}
 +
 +	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
 +		goto out;
 +	BUG_ON(PageTransCompound(page));
 +
 +	if (!PageAnon(page) || !PageKsm(kpage))
 +		goto out;
 +
 +	/*
 +	 * We need the page lock to read a stable PageSwapCache in
 +	 * write_protect_page().  We use trylock_page() instead of
 +	 * lock_page() because we don't want to wait here - we
 +	 * prefer to continue scanning and merging different pages,
 +	 * then come back to this page when it is unlocked.
 +	 */
 +	if (!trylock_page(page))
 +		goto out;
 +	/*
 +	 * If this anonymous page is mapped only here, its pte may need
 +	 * to be write-protected.  If it's mapped elsewhere, all of its
 +	 * ptes are necessarily already write-protected.  But in either
 +	 * case, we need to lock and check page_count is not raised.
 +	 */
 +	if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
 +		if (pages_identical(page, kpage))
 +			err = replace_page(vma, page, kpage, orig_pte);
 +		else
 +			err = check_collision(rmap_item, hash);
 +	}
 +
 +	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
 +		munlock_vma_page(page);
 +		if (!PageMlocked(kpage)) {
 +			unlock_page(page);
 +			lock_page(kpage);
 +			mlock_vma_page(kpage);
 +			page = kpage;		/* for final unlock */
 +		}
 +	}
 +
 +	unlock_page(page);
 +out:
 +	return err;
 +}
 +
 +
 +
 +/**
 + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
 + * to restore a page mapping that has been changed in try_to_merge_two_pages.
 + *
 + * @return 0 on success.
 + */
 +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
 +			     pte_t orig_pte, pte_t wprt_pte)
 +{
 +	struct mm_struct *mm = vma->vm_mm;
 +	pgd_t *pgd;
 +	pud_t *pud;
 +	pmd_t *pmd;
 +	pte_t *ptep;
 +	spinlock_t *ptl;
 +
 +	int err = -EFAULT;
 +
 +	pgd = pgd_offset(mm, addr);
 +	if (!pgd_present(*pgd))
 +		goto out;
 +
 +	pud = pud_offset(pgd, addr);
 +	if (!pud_present(*pud))
 +		goto out;
 +
 +	pmd = pmd_offset(pud, addr);
 +	if (!pmd_present(*pmd))
 +		goto out;
 +
 +	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 +	if (!pte_same(*ptep, wprt_pte)) {
 +		/* already copied, let it be */
 +		pte_unmap_unlock(ptep, ptl);
 +		goto out;
 +	}
 +
 +	/*
 +	 * Good boy, still here. When we still get the ksm page, it does not
 +	 * return to the free page pool, there is no way that a pte was changed
 +	 * to other page and gets back to this page. And remind that ksm page
 +	 * do not reuse in do_wp_page(). So it's safe to restore the original
 +	 * pte.
 +	 */
 +	flush_cache_page(vma, addr, pte_pfn(*ptep));
 +	ptep_clear_flush_notify(vma, addr, ptep);
 +	set_pte_at_notify(mm, addr, ptep, orig_pte);
 +
 +	pte_unmap_unlock(ptep, ptl);
 +	err = 0;
 +out:
 +	return err;
 +}
 +
 +/**
 + * try_to_merge_two_pages() - take two identical pages and prepare
 + * them to be merged into one page(rmap_item->page)
 + *
 + * @return 0 if we successfully merged two identical pages into
 + *         one ksm page. MERGE_ERR_COLLI if it's only a hash collision
 + *         search in rbtree. MERGE_ERR_CHANGED if rmap_item has been
 + *         changed since it's hashed. MERGE_ERR_PGERR otherwise.
 + *
 + */
 +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
 +				  struct rmap_item *tree_rmap_item,
 +				  u32 hash)
 +{
 +	pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
 +	pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
 +	struct vm_area_struct *vma1 = rmap_item->slot->vma;
 +	struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
 +	struct page *page = rmap_item->page;
 +	struct page *tree_page = tree_rmap_item->page;
 +	int err = MERGE_ERR_PGERR;
 +	struct address_space *saved_mapping;
 +
 +
 +	if (rmap_item->page == tree_rmap_item->page)
 +		goto out;
 +
 +	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
 +		goto out;
 +	BUG_ON(PageTransCompound(page));
 +
 +	if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
 +		goto out;
 +	BUG_ON(PageTransCompound(tree_page));
 +
 +	if (!PageAnon(page) || !PageAnon(tree_page))
 +		goto out;
 +
 +	if (!trylock_page(page))
 +		goto out;
 +
 +
 +	if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
 +		unlock_page(page);
 +		goto out;
 +	}
 +
 +	/*
 +	 * While we hold page lock, upgrade page from
 +	 * PageAnon+anon_vma to PageKsm+NULL stable_node:
 +	 * stable_tree_insert() will update stable_node.
 +	 */
 +	saved_mapping = page->mapping;
 +	set_page_stable_node(page, NULL);
 +	mark_page_accessed(page);
 +	unlock_page(page);
 +
 +	if (!trylock_page(tree_page))
 +		goto restore_out;
 +
 +	if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
 +		unlock_page(tree_page);
 +		goto restore_out;
 +	}
 +
 +	if (pages_identical(page, tree_page)) {
 +		err = replace_page(vma2, tree_page, page, wprt_pte2);
 +		if (err) {
 +			unlock_page(tree_page);
 +			goto restore_out;
 +		}
 +
 +		if ((vma2->vm_flags & VM_LOCKED)) {
 +			munlock_vma_page(tree_page);
 +			if (!PageMlocked(page)) {
 +				unlock_page(tree_page);
 +				lock_page(page);
 +				mlock_vma_page(page);
 +				tree_page = page; /* for final unlock */
 +			}
 +		}
 +
 +		unlock_page(tree_page);
 +
 +		goto out; /* success */
 +
 +	} else {
 +		if (tree_rmap_item->hash_max &&
 +		    tree_rmap_item->hash_max == rmap_item->hash_max) {
 +			err = MERGE_ERR_COLLI_MAX;
 +		} else if (page_hash(page, hash_strength, 0) ==
 +		    page_hash(tree_page, hash_strength, 0)) {
 +			inc_rshash_neg(memcmp_cost + hash_strength * 2);
 +			err = MERGE_ERR_COLLI;
 +		} else {
 +			err = MERGE_ERR_CHANGED;
 +		}
 +
 +		unlock_page(tree_page);
 +	}
 +
 +restore_out:
 +	lock_page(page);
 +	if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
 +				  orig_pte1, wprt_pte1))
 +		page->mapping = saved_mapping;
 +
 +	unlock_page(page);
 +out:
 +	return err;
 +}
 +
 +static inline int hash_cmp(u32 new_val, u32 node_val)
 +{
 +	if (new_val > node_val)
 +		return 1;
 +	else if (new_val < node_val)
 +		return -1;
 +	else
 +		return 0;
 +}
 +
 +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
 +{
 +	u32 hash_max = item->hash_max;
 +
 +	if (!hash_max) {
 +		hash_max = page_hash_max(item->page, hash);
 +
 +		item->hash_max = hash_max;
 +	}
 +
 +	return hash_max;
 +}
 +
 +
 +
 +/**
 + * stable_tree_search() - search the stable tree for a page
 + *
 + * @item: 	the rmap_item we are comparing with
 + * @hash: 	the hash value of this item->page already calculated
 + *
 + * @return 	the page we have found, NULL otherwise. The page returned has
 + *         	been gotten.
 + */
 +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
 +{
 +	struct rb_node *node = root_stable_treep->rb_node;
 +	struct tree_node *tree_node;
 +	unsigned long hash_max;
 +	struct page *page = item->page;
 +	struct stable_node *stable_node;
 +
 +	stable_node = page_stable_node(page);
 +	if (stable_node) {
 +		/* ksm page forked, that is
 +		 * if (PageKsm(page) && !in_stable_tree(rmap_item))
 +		 * it's actually gotten once outside.
 +		 */
 +		get_page(page);
 +		return page;
 +	}
 +
 +	while (node) {
 +		int cmp;
 +
 +		tree_node = rb_entry(node, struct tree_node, node);
 +
 +		cmp = hash_cmp(hash, tree_node->hash);
 +
 +		if (cmp < 0)
 +			node = node->rb_left;
 +		else if (cmp > 0)
 +			node = node->rb_right;
 +		else
 +			break;
 +	}
 +
 +	if (!node)
 +		return NULL;
 +
 +	if (tree_node->count == 1) {
 +		stable_node = rb_entry(tree_node->sub_root.rb_node,
 +				       struct stable_node, node);
 +		BUG_ON(!stable_node);
 +
 +		goto get_page_out;
 +	}
 +
 +	/*
 +	 * ok, we have to search the second
 +	 * level subtree, hash the page to a
 +	 * full strength.
 +	 */
 +	node = tree_node->sub_root.rb_node;
 +	BUG_ON(!node);
 +	hash_max = rmap_item_hash_max(item, hash);
 +
 +	while (node) {
 +		int cmp;
 +
 +		stable_node = rb_entry(node, struct stable_node, node);
 +
 +		cmp = hash_cmp(hash_max, stable_node->hash_max);
 +
 +		if (cmp < 0)
 +			node = node->rb_left;
 +		else if (cmp > 0)
 +			node = node->rb_right;
 +		else
 +			goto get_page_out;
 +	}
 +
 +	return NULL;
 +
 +get_page_out:
 +	page = get_uksm_page(stable_node, 1, 1);
 +	return page;
 +}
 +
 +static int try_merge_rmap_item(struct rmap_item *item,
 +			       struct page *kpage,
 +			       struct page *tree_page)
 +{
 +	spinlock_t *ptl;
 +	pte_t *ptep;
 +	unsigned long addr;
 +	struct vm_area_struct *vma = item->slot->vma;
 +
 +	addr = get_rmap_addr(item);
 +	ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
 +	if (!ptep)
 +		return 0;
 +
 +	if (pte_write(*ptep)) {
 +		/* has changed, abort! */
 +		pte_unmap_unlock(ptep, ptl);
 +		return 0;
 +	}
 +
 +	get_page(tree_page);
 +	page_add_anon_rmap(tree_page, vma, addr, false);
 +
 +	flush_cache_page(vma, addr, pte_pfn(*ptep));
 +	ptep_clear_flush_notify(vma, addr, ptep);
 +	set_pte_at_notify(vma->vm_mm, addr, ptep,
 +			  mk_pte(tree_page, vma->vm_page_prot));
 +
 +	page_remove_rmap(kpage, false);
 +	put_page(kpage);
 +
 +	pte_unmap_unlock(ptep, ptl);
 +
 +	return 1;
 +}
 +
 +/**
 + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted
 + * into stable tree, the page was found to be identical to a stable ksm page,
 + * this is the last chance we can merge them into one.
 + *
 + * @item1:	the rmap_item holding the page which we wanted to insert
 + *       	into stable tree.
 + * @item2:	the other rmap_item we found when unstable tree search
 + * @oldpage:	the page currently mapped by the two rmap_items
 + * @tree_page: 	the page we found identical in stable tree node
 + * @success1:	return if item1 is successfully merged
 + * @success2:	return if item2 is successfully merged
 + */
 +static void try_merge_with_stable(struct rmap_item *item1,
 +				  struct rmap_item *item2,
 +				  struct page **kpage,
 +				  struct page *tree_page,
 +				  int *success1, int *success2)
 +{
 +	struct vm_area_struct *vma1 = item1->slot->vma;
 +	struct vm_area_struct *vma2 = item2->slot->vma;
 +	*success1 = 0;
 +	*success2 = 0;
 +
 +	if (unlikely(*kpage == tree_page)) {
 +		/* I don't think this can really happen */
 +		printk(KERN_WARNING "UKSM: unexpected condition detected in "
 +			"try_merge_with_stable() -- *kpage == tree_page !\n");
 +		*success1 = 1;
 +		*success2 = 1;
 +		return;
 +	}
 +
 +	if (!PageAnon(*kpage) || !PageKsm(*kpage))
 +		goto failed;
 +
 +	if (!trylock_page(tree_page))
 +		goto failed;
 +
 +	/* If the oldpage is still ksm and still pointed
 +	 * to in the right place, and still write protected,
 +	 * we are confident it's not changed, no need to
 +	 * memcmp anymore.
 +	 * be ware, we cannot take nested pte locks,
 +	 * deadlock risk.
 +	 */
 +	if (!try_merge_rmap_item(item1, *kpage, tree_page))
 +		goto unlock_failed;
 +
 +	/* ok, then vma2, remind that pte1 already set */
 +	if (!try_merge_rmap_item(item2, *kpage, tree_page))
 +		goto success_1;
 +
 +	*success2 = 1;
 +success_1:
 +	*success1 = 1;
 +
 +
 +	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
 +	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
 +		munlock_vma_page(*kpage);
 +		if (!PageMlocked(tree_page))
 +			mlock_vma_page(tree_page);
 +	}
 +
 +	/*
 +	 * We do not need oldpage any more in the caller, so can break the lock
 +	 * now.
 +	 */
 +	unlock_page(*kpage);
 +	*kpage = tree_page; /* Get unlocked outside. */
 +	return;
 +
 +unlock_failed:
 +	unlock_page(tree_page);
 +failed:
 +	return;
 +}
 +
 +static inline void stable_node_hash_max(struct stable_node *node,
 +					 struct page *page, u32 hash)
 +{
 +	u32 hash_max = node->hash_max;
 +
 +	if (!hash_max) {
 +		hash_max = page_hash_max(page, hash);
 +		node->hash_max = hash_max;
 +	}
 +}
 +
 +static inline
 +struct stable_node *new_stable_node(struct tree_node *tree_node,
 +				    struct page *kpage, u32 hash_max)
 +{
 +	struct stable_node *new_stable_node;
 +
 +	new_stable_node = alloc_stable_node();
 +	if (!new_stable_node)
 +		return NULL;
 +
 +	new_stable_node->kpfn = page_to_pfn(kpage);
 +	new_stable_node->hash_max = hash_max;
 +	new_stable_node->tree_node = tree_node;
 +	set_page_stable_node(kpage, new_stable_node);
 +
 +	return new_stable_node;
 +}
 +
 +static inline
 +struct stable_node *first_level_insert(struct tree_node *tree_node,
 +				       struct rmap_item *rmap_item,
 +				       struct rmap_item *tree_rmap_item,
 +				       struct page **kpage, u32 hash,
 +				       int *success1, int *success2)
 +{
 +	int cmp;
 +	struct page *tree_page;
 +	u32 hash_max = 0;
 +	struct stable_node *stable_node, *new_snode;
 +	struct rb_node *parent = NULL, **new;
 +
 +	/* this tree node contains no sub-tree yet */
 +	stable_node = rb_entry(tree_node->sub_root.rb_node,
 +			       struct stable_node, node);
 +
 +	tree_page = get_uksm_page(stable_node, 1, 0);
 +	if (tree_page) {
 +		cmp = memcmp_pages(*kpage, tree_page, 1);
 +		if (!cmp) {
 +			try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
 +					      tree_page, success1, success2);
 +			put_page(tree_page);
 +			if (!*success1 && !*success2)
 +				goto failed;
 +
 +			return stable_node;
 +
 +		} else {
 +			/*
 +			 * collision in first level try to create a subtree.
 +			 * A new node need to be created.
 +			 */
 +			put_page(tree_page);
 +
 +			stable_node_hash_max(stable_node, tree_page,
 +					     tree_node->hash);
 +			hash_max = rmap_item_hash_max(rmap_item, hash);
 +			cmp = hash_cmp(hash_max, stable_node->hash_max);
 +
 +			parent = &stable_node->node;
 +			if (cmp < 0) {
 +				new = &parent->rb_left;
 +			} else if (cmp > 0) {
 +				new = &parent->rb_right;
 +			} else {
 +				goto failed;
 +			}
 +		}
 +
 +	} else {
 +		/* the only stable_node deleted, we reuse its tree_node.
 +		 */
 +		parent = NULL;
 +		new = &tree_node->sub_root.rb_node;
 +	}
 +
 +	new_snode = new_stable_node(tree_node, *kpage, hash_max);
 +	if (!new_snode)
 +		goto failed;
 +
 +	rb_link_node(&new_snode->node, parent, new);
 +	rb_insert_color(&new_snode->node, &tree_node->sub_root);
 +	tree_node->count++;
 +	*success1 = *success2 = 1;
 +
 +	return new_snode;
 +
 +failed:
 +	return NULL;
 +}
 +
 +static inline
 +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
 +					  struct rmap_item *rmap_item,
 +					  struct rmap_item *tree_rmap_item,
 +					  struct page **kpage, u32 hash,
 +					  int *success1, int *success2)
 +{
 +	struct page *tree_page;
 +	u32 hash_max;
 +	struct stable_node *stable_node, *new_snode;
 +	struct rb_node *parent, **new;
 +
 +research:
 +	parent = NULL;
 +	new = &tree_node->sub_root.rb_node;
 +	BUG_ON(!*new);
 +	hash_max = rmap_item_hash_max(rmap_item, hash);
 +	while (*new) {
 +		int cmp;
 +
 +		stable_node = rb_entry(*new, struct stable_node, node);
 +
 +		cmp = hash_cmp(hash_max, stable_node->hash_max);
 +
 +		if (cmp < 0) {
 +			parent = *new;
 +			new = &parent->rb_left;
 +		} else if (cmp > 0) {
 +			parent = *new;
 +			new = &parent->rb_right;
 +		} else {
 +			tree_page = get_uksm_page(stable_node, 1, 0);
 +			if (tree_page) {
 +				cmp = memcmp_pages(*kpage, tree_page, 1);
 +				if (!cmp) {
 +					try_merge_with_stable(rmap_item,
 +						tree_rmap_item, kpage,
 +						tree_page, success1, success2);
 +
 +					put_page(tree_page);
 +					if (!*success1 && !*success2)
 +						goto failed;
 +					/*
 +					 * successfully merged with a stable
 +					 * node
 +					 */
 +					return stable_node;
 +				} else {
 +					put_page(tree_page);
 +					goto failed;
 +				}
 +			} else {
 +				/*
 +				 * stable node may be deleted,
 +				 * and subtree maybe
 +				 * restructed, cannot
 +				 * continue, research it.
 +				 */
 +				if (tree_node->count) {
 +					goto research;
 +				} else {
 +					/* reuse the tree node*/
 +					parent = NULL;
 +					new = &tree_node->sub_root.rb_node;
 +				}
 +			}
 +		}
 +	}
 +
 +	new_snode = new_stable_node(tree_node, *kpage, hash_max);
 +	if (!new_snode)
 +		goto failed;
 +
 +	rb_link_node(&new_snode->node, parent, new);
 +	rb_insert_color(&new_snode->node, &tree_node->sub_root);
 +	tree_node->count++;
 +	*success1 = *success2 = 1;
 +
 +	return new_snode;
 +
 +failed:
 +	return NULL;
 +}
 +
 +
 +/**
 + * stable_tree_insert() - try to insert a merged page in unstable tree to
 + * the stable tree
 + *
 + * @kpage:		the page need to be inserted
 + * @hash:		the current hash of this page
 + * @rmap_item:		the rmap_item being scanned
 + * @tree_rmap_item:	the rmap_item found on unstable tree
 + * @success1:		return if rmap_item is merged
 + * @success2:		return if tree_rmap_item is merged
 + *
 + * @return 		the stable_node on stable tree if at least one
 + *      		rmap_item is inserted into stable tree, NULL
 + *      		otherwise.
 + */
 +static struct stable_node *
 +stable_tree_insert(struct page **kpage, u32 hash,
 +		   struct rmap_item *rmap_item,
 +		   struct rmap_item *tree_rmap_item,
 +		   int *success1, int *success2)
 +{
 +	struct rb_node **new = &root_stable_treep->rb_node;
 +	struct rb_node *parent = NULL;
 +	struct stable_node *stable_node;
 +	struct tree_node *tree_node;
 +	u32 hash_max = 0;
 +
 +	*success1 = *success2 = 0;
 +
 +	while (*new) {
 +		int cmp;
 +
 +		tree_node = rb_entry(*new, struct tree_node, node);
 +
 +		cmp = hash_cmp(hash, tree_node->hash);
 +
 +		if (cmp < 0) {
 +			parent = *new;
 +			new = &parent->rb_left;
 +		} else if (cmp > 0) {
 +			parent = *new;
 +			new = &parent->rb_right;
 +		} else
 +			break;
 +	}
 +
 +	if (*new) {
 +		if (tree_node->count == 1) {
 +			stable_node = first_level_insert(tree_node, rmap_item,
 +						tree_rmap_item, kpage,
 +						hash, success1, success2);
 +		} else {
 +			stable_node = stable_subtree_insert(tree_node,
 +					rmap_item, tree_rmap_item, kpage,
 +					hash, success1, success2);
 +		}
 +	} else {
 +
 +		/* no tree node found */
 +		tree_node = alloc_tree_node(stable_tree_node_listp);
 +		if (!tree_node) {
 +			stable_node = NULL;
 +			goto out;
 +		}
 +
 +		stable_node = new_stable_node(tree_node, *kpage, hash_max);
 +		if (!stable_node) {
 +			free_tree_node(tree_node);
 +			goto out;
 +		}
 +
 +		tree_node->hash = hash;
 +		rb_link_node(&tree_node->node, parent, new);
 +		rb_insert_color(&tree_node->node, root_stable_treep);
 +		parent = NULL;
 +		new = &tree_node->sub_root.rb_node;
 +
 +		rb_link_node(&stable_node->node, parent, new);
 +		rb_insert_color(&stable_node->node, &tree_node->sub_root);
 +		tree_node->count++;
 +		*success1 = *success2 = 1;
 +	}
 +
 +out:
 +	return stable_node;
 +}
 +
 +
 +/**
 + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
 + *
 + * @return 	0 on success, -EBUSY if unable to lock the mmap_sem,
 + *         	-EINVAL if the page mapping has been changed.
 + */
 +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
 +{
 +	int err;
 +
 +	err = get_mergeable_page_lock_mmap(tree_rmap_item);
 +
 +	if (err == -EINVAL) {
 +		/* its page map has been changed, remove it */
 +		remove_rmap_item_from_tree(tree_rmap_item);
 +	}
 +
 +	/* The page is gotten and mmap_sem is locked now. */
 +	return err;
 +}
 +
 +
 +/**
 + * unstable_tree_search_insert() - search an unstable tree rmap_item with the
 + * same hash value. Get its page and trylock the mmap_sem
 + */
 +static inline
 +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
 +					      u32 hash)
 +
 +{
 +	struct rb_node **new = &root_unstable_tree.rb_node;
 +	struct rb_node *parent = NULL;
 +	struct tree_node *tree_node;
 +	u32 hash_max;
 +	struct rmap_item *tree_rmap_item;
 +
 +	while (*new) {
 +		int cmp;
 +
 +		tree_node = rb_entry(*new, struct tree_node, node);
 +
 +		cmp = hash_cmp(hash, tree_node->hash);
 +
 +		if (cmp < 0) {
 +			parent = *new;
 +			new = &parent->rb_left;
 +		} else if (cmp > 0) {
 +			parent = *new;
 +			new = &parent->rb_right;
 +		} else
 +			break;
 +	}
 +
 +	if (*new) {
 +		/* got the tree_node */
 +		if (tree_node->count == 1) {
 +			tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
 +						  struct rmap_item, node);
 +			BUG_ON(!tree_rmap_item);
 +
 +			goto get_page_out;
 +		}
 +
 +		/* well, search the collision subtree */
 +		new = &tree_node->sub_root.rb_node;
 +		BUG_ON(!*new);
 +		hash_max = rmap_item_hash_max(rmap_item, hash);
 +
 +		while (*new) {
 +			int cmp;
 +
 +			tree_rmap_item = rb_entry(*new, struct rmap_item,
 +						  node);
 +
 +			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
 +			parent = *new;
 +			if (cmp < 0)
 +				new = &parent->rb_left;
 +			else if (cmp > 0)
 +				new = &parent->rb_right;
 +			else
 +				goto get_page_out;
 +		}
 +	} else {
 +		/* alloc a new tree_node */
 +		tree_node = alloc_tree_node(&unstable_tree_node_list);
 +		if (!tree_node)
 +			return NULL;
 +
 +		tree_node->hash = hash;
 +		rb_link_node(&tree_node->node, parent, new);
 +		rb_insert_color(&tree_node->node, &root_unstable_tree);
 +		parent = NULL;
 +		new = &tree_node->sub_root.rb_node;
 +	}
 +
 +	/* did not found even in sub-tree */
 +	rmap_item->tree_node = tree_node;
 +	rmap_item->address |= UNSTABLE_FLAG;
 +	rmap_item->hash_round = uksm_hash_round;
 +	rb_link_node(&rmap_item->node, parent, new);
 +	rb_insert_color(&rmap_item->node, &tree_node->sub_root);
 +
 +	uksm_pages_unshared++;
 +	return NULL;
 +
 +get_page_out:
 +	if (tree_rmap_item->page == rmap_item->page)
 +		return NULL;
 +
 +	if (get_tree_rmap_item_page(tree_rmap_item))
 +		return NULL;
 +
 +	return tree_rmap_item;
 +}
 +
 +static void hold_anon_vma(struct rmap_item *rmap_item,
 +			  struct anon_vma *anon_vma)
 +{
 +	rmap_item->anon_vma = anon_vma;
 +	get_anon_vma(anon_vma);
 +}
 +
 +
 +/**
 + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
 + * ratio statistics is done in this function.
 + *
 + */
 +static void stable_tree_append(struct rmap_item *rmap_item,
 +			       struct stable_node *stable_node, int logdedup)
 +{
 +	struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
 +	unsigned long key = (unsigned long)rmap_item->slot;
 +	unsigned long factor = rmap_item->slot->rung->step;
 +
 +	BUG_ON(!stable_node);
 +	rmap_item->address |= STABLE_FLAG;
 +
 +	if (hlist_empty(&stable_node->hlist)) {
 +		uksm_pages_shared++;
 +		goto node_vma_new;
 +	} else {
 +		uksm_pages_sharing++;
 +	}
 +
 +	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
 +		if (node_vma->key >= key)
 +			break;
 +
 +		if (logdedup) {
 +			node_vma->slot->pages_bemerged += factor;
 +			if (list_empty(&node_vma->slot->dedup_list))
 +				list_add(&node_vma->slot->dedup_list,
 +					 &vma_slot_dedup);
 +		}
 +	}
 +
 +	if (node_vma) {
 +		if (node_vma->key == key) {
 +			node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
 +			goto node_vma_ok;
 +		} else if (node_vma->key > key) {
 +			node_vma_cont = node_vma;
 +		}
 +	}
 +
 +node_vma_new:
 +	/* no same vma already in node, alloc a new node_vma */
 +	new_node_vma = alloc_node_vma();
 +	BUG_ON(!new_node_vma);
 +	new_node_vma->head = stable_node;
 +	new_node_vma->slot = rmap_item->slot;
 +
 +	if (!node_vma) {
 +		hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
 +	} else if (node_vma->key != key) {
 +		if (node_vma->key < key)
 +			hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
 +		else {
 +			hlist_add_before(&new_node_vma->hlist,
 +					 &node_vma->hlist);
 +		}
 +
 +	}
 +	node_vma = new_node_vma;
 +
 +node_vma_ok: /* ok, ready to add to the list */
 +	rmap_item->head = node_vma;
 +	hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
 +	hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
 +	if (logdedup) {
 +		rmap_item->slot->pages_merged++;
 +		if (node_vma_cont) {
 +			node_vma = node_vma_cont;
 +			hlist_for_each_entry_continue(node_vma, hlist) {
 +				node_vma->slot->pages_bemerged += factor;
 +				if (list_empty(&node_vma->slot->dedup_list))
 +					list_add(&node_vma->slot->dedup_list,
 +						 &vma_slot_dedup);
 +			}
 +		}
 +	}
 +}
 +
 +/*
 + * We use break_ksm to break COW on a ksm page: it's a stripped down
 + *
 + *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 + *		put_page(page);
 + *
 + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 + * in case the application has unmapped and remapped mm,addr meanwhile.
 + * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 + */
 +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 +{
 +	struct page *page;
 +	int ret = 0;
 +
 +	do {
 +		cond_resched();
 +		page = follow_page(vma, addr, FOLL_GET);
 +		if (IS_ERR_OR_NULL(page))
 +			break;
 +		if (PageKsm(page)) {
 +			ret = handle_mm_fault(vma->vm_mm, vma, addr,
 +					      FAULT_FLAG_WRITE);
 +		} else
 +			ret = VM_FAULT_WRITE;
 +		put_page(page);
 +	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
 +	/*
 +	 * We must loop because handle_mm_fault() may back out if there's
 +	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
 +	 *
 +	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
 +	 * COW has been broken, even if the vma does not permit VM_WRITE;
 +	 * but note that a concurrent fault might break PageKsm for us.
 +	 *
 +	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
 +	 * backing file, which also invalidates anonymous pages: that's
 +	 * okay, that truncation will have unmapped the PageKsm for us.
 +	 *
 +	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
 +	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
 +	 * current task has TIF_MEMDIE set, and will be OOM killed on return
 +	 * to user; and ksmd, having no mm, would never be chosen for that.
 +	 *
 +	 * But if the mm is in a limited mem_cgroup, then the fault may fail
 +	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
 +	 * even ksmd can fail in this way - though it's usually breaking ksm
 +	 * just to undo a merge it made a moment before, so unlikely to oom.
 +	 *
 +	 * That's a pity: we might therefore have more kernel pages allocated
 +	 * than we're counting as nodes in the stable tree; but uksm_do_scan
 +	 * will retry to break_cow on each pass, so should recover the page
 +	 * in due course.  The important thing is to not let VM_MERGEABLE
 +	 * be cleared while any such pages might remain in the area.
 +	 */
 +	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
 +}
 +
 +static void break_cow(struct rmap_item *rmap_item)
 +{
 +	struct vm_area_struct *vma = rmap_item->slot->vma;
 +	struct mm_struct *mm = vma->vm_mm;
 +	unsigned long addr = get_rmap_addr(rmap_item);
 +
 +	if (uksm_test_exit(mm))
 +		goto out;
 +
 +	break_ksm(vma, addr);
 +out:
 +	return;
 +}
 +
 +/*
 + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 + * than check every pte of a given vma, the locking doesn't quite work for
 + * that - an rmap_item is assigned to the stable tree after inserting ksm
 + * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 + * rmap_items from parent to child at fork time (so as not to waste time
 + * if exit comes before the next scan reaches it).
 + *
 + * Similarly, although we'd like to remove rmap_items (so updating counts
 + * and freeing memory) when unmerging an area, it's easier to leave that
 + * to the next pass of ksmd - consider, for example, how ksmd might be
 + * in cmp_and_merge_page on one of the rmap_items we would be removing.
 + */
 +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
 +		      unsigned long start, unsigned long end)
 +{
 +	unsigned long addr;
 +	int err = 0;
 +
 +	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
 +		if (uksm_test_exit(vma->vm_mm))
 +			break;
 +		if (signal_pending(current))
 +			err = -ERESTARTSYS;
 +		else
 +			err = break_ksm(vma, addr);
 +	}
 +	return err;
 +}
 +
 +static inline void inc_uksm_pages_scanned(void)
 +{
 +	u64 delta;
 +
 +
 +	if (uksm_pages_scanned == U64_MAX) {
 +		encode_benefit();
 +
 +		delta = uksm_pages_scanned >> pages_scanned_base;
 +
 +		if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
 +			pages_scanned_stored >>= 1;
 +			delta >>= 1;
 +			pages_scanned_base++;
 +		}
 +
 +		pages_scanned_stored += delta;
 +
 +		uksm_pages_scanned = uksm_pages_scanned_last = 0;
 +	}
 +
 +	uksm_pages_scanned++;
 +}
 +
 +static inline int find_zero_page_hash(int strength, u32 hash)
 +{
 +	return (zero_hash_table[strength] == hash);
 +}
 +
 +static
 +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
 +{
 +	struct page *zero_page = empty_uksm_zero_page;
 +	struct mm_struct *mm = vma->vm_mm;
 +	pte_t orig_pte = __pte(0);
 +	int err = -EFAULT;
 +
 +	if (uksm_test_exit(mm))
 +		goto out;
 +
 +	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
 +		goto out;
 +	BUG_ON(PageTransCompound(page));
 +
 +	if (!PageAnon(page))
 +		goto out;
 +
 +	if (!trylock_page(page))
 +		goto out;
 +
 +	if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
 +		if (is_page_full_zero(page))
 +			err = replace_page(vma, page, zero_page, orig_pte);
 +	}
 +
 +	unlock_page(page);
 +out:
 +	return err;
 +}
 +
 +/*
 + * cmp_and_merge_page() - first see if page can be merged into the stable
 + * tree; if not, compare hash to previous and if it's the same, see if page
 + * can be inserted into the unstable tree, or merged with a page already there
 + * and both transferred to the stable tree.
 + *
 + * @page: the page that we are searching identical page to.
 + * @rmap_item: the reverse mapping into the virtual address of this page
 + */
 +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
 +{
 +	struct rmap_item *tree_rmap_item;
 +	struct page *page;
 +	struct page *kpage = NULL;
 +	u32 hash_max;
 +	int err;
 +	unsigned int success1, success2;
 +	struct stable_node *snode;
 +	int cmp;
 +	struct rb_node *parent = NULL, **new;
 +
 +	remove_rmap_item_from_tree(rmap_item);
 +	page = rmap_item->page;
 +
 +	/* We first start with searching the page inside the stable tree */
 +	kpage = stable_tree_search(rmap_item, hash);
 +	if (kpage) {
 +		err = try_to_merge_with_uksm_page(rmap_item, kpage,
 +						 hash);
 +		if (!err) {
 +			/*
 +			 * The page was successfully merged, add
 +			 * its rmap_item to the stable tree.
 +			 * page lock is needed because it's
 +			 * racing with try_to_unmap_ksm(), etc.
 +			 */
 +			lock_page(kpage);
 +			snode = page_stable_node(kpage);
 +			stable_tree_append(rmap_item, snode, 1);
 +			unlock_page(kpage);
 +			put_page(kpage);
 +			return; /* success */
 +		}
 +		put_page(kpage);
 +
 +		/*
 +		 * if it's a collision and it has been search in sub-rbtree
 +		 * (hash_max != 0), we want to abort, because if it is
 +		 * successfully merged in unstable tree, the collision trends to
 +		 * happen again.
 +		 */
 +		if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
 +			return;
 +	}
 +
 +	tree_rmap_item =
 +		unstable_tree_search_insert(rmap_item, hash);
 +	if (tree_rmap_item) {
 +		err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
 +		/*
 +		 * As soon as we merge this page, we want to remove the
 +		 * rmap_item of the page we have merged with from the unstable
 +		 * tree, and insert it instead as new node in the stable tree.
 +		 */
 +		if (!err) {
 +			kpage = page;
 +			remove_rmap_item_from_tree(tree_rmap_item);
 +			lock_page(kpage);
 +			snode = stable_tree_insert(&kpage, hash,
 +						   rmap_item, tree_rmap_item,
 +						   &success1, &success2);
 +
 +			/*
 +			 * Do not log dedup for tree item, it's not counted as
 +			 * scanned in this round.
 +			 */
 +			if (success2)
 +				stable_tree_append(tree_rmap_item, snode, 0);
 +
 +			/*
 +			 * The order of these two stable append is important:
 +			 * we are scanning rmap_item.
 +			 */
 +			if (success1)
 +				stable_tree_append(rmap_item, snode, 1);
 +
 +			/*
 +			 * The original kpage may be unlocked inside
 +			 * stable_tree_insert() already. This page
 +			 * should be unlocked before doing
 +			 * break_cow().
 +			 */
 +			unlock_page(kpage);
 +
 +			if (!success1)
 +				break_cow(rmap_item);
 +
 +			if (!success2)
 +				break_cow(tree_rmap_item);
 +
 +		} else if (err == MERGE_ERR_COLLI) {
 +			BUG_ON(tree_rmap_item->tree_node->count > 1);
 +
 +			rmap_item_hash_max(tree_rmap_item,
 +					   tree_rmap_item->tree_node->hash);
 +
 +			hash_max = rmap_item_hash_max(rmap_item, hash);
 +			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
 +			parent = &tree_rmap_item->node;
 +			if (cmp < 0)
 +				new = &parent->rb_left;
 +			else if (cmp > 0)
 +				new = &parent->rb_right;
 +			else
 +				goto put_up_out;
 +
 +			rmap_item->tree_node = tree_rmap_item->tree_node;
 +			rmap_item->address |= UNSTABLE_FLAG;
 +			rmap_item->hash_round = uksm_hash_round;
 +			rb_link_node(&rmap_item->node, parent, new);
 +			rb_insert_color(&rmap_item->node,
 +					&tree_rmap_item->tree_node->sub_root);
 +			rmap_item->tree_node->count++;
 +		} else {
 +			/*
 +			 * either one of the page has changed or they collide
 +			 * at the max hash, we consider them as ill items.
 +			 */
 +			remove_rmap_item_from_tree(tree_rmap_item);
 +		}
 +put_up_out:
 +		put_page(tree_rmap_item->page);
 +		up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
 +	}
 +}
 +
 +
 +
 +
 +static inline unsigned long get_pool_index(struct vma_slot *slot,
 +					   unsigned long index)
 +{
 +	unsigned long pool_index;
 +
 +	pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
 +	if (pool_index >= slot->pool_size)
 +		BUG();
 +	return pool_index;
 +}
 +
 +static inline unsigned long index_page_offset(unsigned long index)
 +{
 +	return offset_in_page(sizeof(struct rmap_list_entry *) * index);
 +}
 +
 +static inline
 +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
 +					    unsigned long index, int need_alloc)
 +{
 +	unsigned long pool_index;
 +	struct page *page;
 +	void *addr;
 +
 +
 +	pool_index = get_pool_index(slot, index);
 +	if (!slot->rmap_list_pool[pool_index]) {
 +		if (!need_alloc)
 +			return NULL;
 +
 +		page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
 +		if (!page)
 +			return NULL;
 +
 +		slot->rmap_list_pool[pool_index] = page;
 +	}
 +
 +	addr = kmap(slot->rmap_list_pool[pool_index]);
 +	addr += index_page_offset(index);
 +
 +	return addr;
 +}
 +
 +static inline void put_rmap_list_entry(struct vma_slot *slot,
 +				       unsigned long index)
 +{
 +	unsigned long pool_index;
 +
 +	pool_index = get_pool_index(slot, index);
 +	BUG_ON(!slot->rmap_list_pool[pool_index]);
 +	kunmap(slot->rmap_list_pool[pool_index]);
 +}
 +
 +static inline int entry_is_new(struct rmap_list_entry *entry)
 +{
 +	return !entry->item;
 +}
 +
 +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
 +						unsigned long index)
 +{
 +	return slot->vma->vm_start + (index << PAGE_SHIFT);
 +}
 +
 +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
 +{
 +	unsigned long addr;
 +
 +	if (is_addr(entry->addr))
 +		addr = get_clean_addr(entry->addr);
 +	else if (entry->item)
 +		addr = get_rmap_addr(entry->item);
 +	else
 +		BUG();
 +
 +	return addr;
 +}
 +
 +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
 +{
 +	if (is_addr(entry->addr))
 +		return NULL;
 +
 +	return entry->item;
 +}
 +
 +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
 +					    unsigned long index)
 +{
 +	unsigned long pool_index;
 +
 +	pool_index = get_pool_index(slot, index);
 +	BUG_ON(!slot->rmap_list_pool[pool_index]);
 +	slot->pool_counts[pool_index]++;
 +}
 +
 +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
 +					    unsigned long index)
 +{
 +	unsigned long pool_index;
 +
 +	pool_index = get_pool_index(slot, index);
 +	BUG_ON(!slot->rmap_list_pool[pool_index]);
 +	BUG_ON(!slot->pool_counts[pool_index]);
 +	slot->pool_counts[pool_index]--;
 +}
 +
 +static inline int entry_has_rmap(struct rmap_list_entry *entry)
 +{
 +	return !is_addr(entry->addr) && entry->item;
 +}
 +
 +static inline void swap_entries(struct rmap_list_entry *entry1,
 +				unsigned long index1,
 +				struct rmap_list_entry *entry2,
 +				unsigned long index2)
 +{
 +	struct rmap_list_entry tmp;
 +
 +	/* swapping two new entries is meaningless */
 +	BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
 +
 +	tmp = *entry1;
 +	*entry1 = *entry2;
 +	*entry2 = tmp;
 +
 +	if (entry_has_rmap(entry1))
 +		entry1->item->entry_index = index1;
 +
 +	if (entry_has_rmap(entry2))
 +		entry2->item->entry_index = index2;
 +
 +	if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
 +		inc_rmap_list_pool_count(entry1->item->slot, index1);
 +		dec_rmap_list_pool_count(entry1->item->slot, index2);
 +	} else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
 +		inc_rmap_list_pool_count(entry2->item->slot, index2);
 +		dec_rmap_list_pool_count(entry2->item->slot, index1);
 +	}
 +}
 +
 +static inline void free_entry_item(struct rmap_list_entry *entry)
 +{
 +	unsigned long index;
 +	struct rmap_item *item;
 +
 +	if (!is_addr(entry->addr)) {
 +		BUG_ON(!entry->item);
 +		item = entry->item;
 +		entry->addr = get_rmap_addr(item);
 +		set_is_addr(entry->addr);
 +		index = item->entry_index;
 +		remove_rmap_item_from_tree(item);
 +		dec_rmap_list_pool_count(item->slot, index);
 +		free_rmap_item(item);
 +	}
 +}
 +
 +static inline int pool_entry_boundary(unsigned long index)
 +{
 +	unsigned long linear_addr;
 +
 +	linear_addr = sizeof(struct rmap_list_entry *) * index;
 +	return index && !offset_in_page(linear_addr);
 +}
 +
 +static inline void try_free_last_pool(struct vma_slot *slot,
 +				      unsigned long index)
 +{
 +	unsigned long pool_index;
 +
 +	pool_index = get_pool_index(slot, index);
 +	if (slot->rmap_list_pool[pool_index] &&
 +	    !slot->pool_counts[pool_index]) {
 +		__free_page(slot->rmap_list_pool[pool_index]);
 +		slot->rmap_list_pool[pool_index] = NULL;
 +		slot->flags |= UKSM_SLOT_NEED_SORT;
 +	}
 +
 +}
 +
 +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
 +					   struct rmap_item *item)
 +{
 +	return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
 +}
 +
 +static int within_same_pool(struct vma_slot *slot,
 +			    unsigned long i, unsigned long j)
 +{
 +	unsigned long pool_i, pool_j;
 +
 +	pool_i = get_pool_index(slot, i);
 +	pool_j = get_pool_index(slot, j);
 +
 +	return (pool_i == pool_j);
 +}
 +
 +static void sort_rmap_entry_list(struct vma_slot *slot)
 +{
 +	unsigned long i, j;
 +	struct rmap_list_entry *entry, *swap_entry;
 +
 +	entry = get_rmap_list_entry(slot, 0, 0);
 +	for (i = 0; i < slot->pages; ) {
 +
 +		if (!entry)
 +			goto skip_whole_pool;
 +
 +		if (entry_is_new(entry))
 +			goto next_entry;
 +
 +		if (is_addr(entry->addr)) {
 +			entry->addr = 0;
 +			goto next_entry;
 +		}
 +
 +		j = vma_item_index(slot->vma, entry->item);
 +		if (j == i)
 +			goto next_entry;
 +
 +		if (within_same_pool(slot, i, j))
 +			swap_entry = entry + j - i;
 +		else
 +			swap_entry = get_rmap_list_entry(slot, j, 1);
 +
 +		swap_entries(entry, i, swap_entry, j);
 +		if (!within_same_pool(slot, i, j))
 +			put_rmap_list_entry(slot, j);
 +		continue;
 +
 +skip_whole_pool:
 +		i += PAGE_SIZE / sizeof(*entry);
 +		if (i < slot->pages)
 +			entry = get_rmap_list_entry(slot, i, 0);
 +		continue;
 +
 +next_entry:
 +		if (i >= slot->pages - 1 ||
 +		    !within_same_pool(slot, i, i + 1)) {
 +			put_rmap_list_entry(slot, i);
 +			if (i + 1 < slot->pages)
 +				entry = get_rmap_list_entry(slot, i + 1, 0);
 +		} else
 +			entry++;
 +		i++;
 +		continue;
 +	}
 +
 +	/* free empty pool entries which contain no rmap_item */
 +	/* CAN be simplied to based on only pool_counts when bug freed !!!!! */
 +	for (i = 0; i < slot->pool_size; i++) {
 +		unsigned char has_rmap;
 +		void *addr;
 +
 +		if (!slot->rmap_list_pool[i])
 +			continue;
 +
 +		has_rmap = 0;
 +		addr = kmap(slot->rmap_list_pool[i]);
 +		BUG_ON(!addr);
 +		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
 +			entry = (struct rmap_list_entry *)addr + j;
 +			if (is_addr(entry->addr))
 +				continue;
 +			if (!entry->item)
 +				continue;
 +			has_rmap = 1;
 +		}
 +		kunmap(slot->rmap_list_pool[i]);
 +		if (!has_rmap) {
 +			BUG_ON(slot->pool_counts[i]);
 +			__free_page(slot->rmap_list_pool[i]);
 +			slot->rmap_list_pool[i] = NULL;
 +		}
 +	}
 +
 +	slot->flags &= ~UKSM_SLOT_NEED_SORT;
 +}
 +
 +/*
 + * vma_fully_scanned() - if all the pages in this slot have been scanned.
 + */
 +static inline int vma_fully_scanned(struct vma_slot *slot)
 +{
 +	return slot->pages_scanned == slot->pages;
 +}
 +
 +/**
 + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
 + * its random permutation. This function is embedded with the random
 + * permutation index management code.
 + */
 +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
 +{
 +	unsigned long rand_range, addr, swap_index, scan_index;
 +	struct rmap_item *item = NULL;
 +	struct rmap_list_entry *scan_entry, *swap_entry = NULL;
 +	struct page *page;
 +
 +	scan_index = swap_index = slot->pages_scanned % slot->pages;
 +
 +	if (pool_entry_boundary(scan_index))
 +		try_free_last_pool(slot, scan_index - 1);
 +
 +	if (vma_fully_scanned(slot)) {
 +		if (slot->flags & UKSM_SLOT_NEED_SORT)
 +			slot->flags |= UKSM_SLOT_NEED_RERAND;
 +		else
 +			slot->flags &= ~UKSM_SLOT_NEED_RERAND;
 +		if (slot->flags & UKSM_SLOT_NEED_SORT)
 +			sort_rmap_entry_list(slot);
 +	}
 +
 +	scan_entry = get_rmap_list_entry(slot, scan_index, 1);
 +	if (!scan_entry)
 +		return NULL;
 +
 +	if (entry_is_new(scan_entry)) {
 +		scan_entry->addr = get_index_orig_addr(slot, scan_index);
 +		set_is_addr(scan_entry->addr);
 +	}
 +
 +	if (slot->flags & UKSM_SLOT_NEED_RERAND) {
 +		rand_range = slot->pages - scan_index;
 +		BUG_ON(!rand_range);
 +		swap_index = scan_index + (prandom_u32() % rand_range);
 +	}
 +
 +	if (swap_index != scan_index) {
 +		swap_entry = get_rmap_list_entry(slot, swap_index, 1);
 +		if (entry_is_new(swap_entry)) {
 +			swap_entry->addr = get_index_orig_addr(slot,
 +							       swap_index);
 +			set_is_addr(swap_entry->addr);
 +		}
 +		swap_entries(scan_entry, scan_index, swap_entry, swap_index);
 +	}
 +
 +	addr = get_entry_address(scan_entry);
 +	item = get_entry_item(scan_entry);
 +	BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
 +
 +	page = follow_page(slot->vma, addr, FOLL_GET);
 +	if (IS_ERR_OR_NULL(page))
 +		goto nopage;
 +
 +	if (!PageAnon(page) && !page_trans_compound_anon(page))
 +		goto putpage;
 +
 +	/*check is zero_page pfn or uksm_zero_page*/
 +	if ((page_to_pfn(page) == zero_pfn)
 +			|| (page_to_pfn(page) == uksm_zero_pfn))
 +		goto putpage;
 +
 +	flush_anon_page(slot->vma, page, addr);
 +	flush_dcache_page(page);
 +
 +
 +	*hash = page_hash(page, hash_strength, 1);
 +	inc_uksm_pages_scanned();
 +	/*if the page content all zero, re-map to zero-page*/
 +	if (find_zero_page_hash(hash_strength, *hash)) {
 +		if (!cmp_and_merge_zero_page(slot->vma, page)) {
 +			slot->pages_merged++;
 +			inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
 +			dec_mm_counter(slot->mm, MM_ANONPAGES);
 +
 +			/* For full-zero pages, no need to create rmap item */
 +			goto putpage;
 +		} else {
 +			inc_rshash_neg(memcmp_cost / 2);
 +		}
 +	}
 +
 +	if (!item) {
 +		item = alloc_rmap_item();
 +		if (item) {
 +			/* It has already been zeroed */
 +			item->slot = slot;
 +			item->address = addr;
 +			item->entry_index = scan_index;
 +			scan_entry->item = item;
 +			inc_rmap_list_pool_count(slot, scan_index);
 +		} else
 +			goto putpage;
 +	}
 +
 +	BUG_ON(item->slot != slot);
 +	/* the page may have changed */
 +	item->page = page;
 +	put_rmap_list_entry(slot, scan_index);
 +	if (swap_entry)
 +		put_rmap_list_entry(slot, swap_index);
 +	return item;
 +
 +putpage:
 +	put_page(page);
 +	page = NULL;
 +nopage:
 +	/* no page, store addr back and free rmap_item if possible */
 +	free_entry_item(scan_entry);
 +	put_rmap_list_entry(slot, scan_index);
 +	if (swap_entry)
 +		put_rmap_list_entry(slot, swap_index);
 +	return NULL;
 +}
 +
 +static inline int in_stable_tree(struct rmap_item *rmap_item)
 +{
 +	return rmap_item->address & STABLE_FLAG;
 +}
 +
 +/**
 + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
 + * mmap_sem locked.
 + */
 +static noinline void scan_vma_one_page(struct vma_slot *slot)
 +{
 +	u32 hash;
 +	struct mm_struct *mm;
 +	struct rmap_item *rmap_item = NULL;
 +	struct vm_area_struct *vma = slot->vma;
 +
 +	mm = vma->vm_mm;
 +	BUG_ON(!mm);
 +	BUG_ON(!slot);
 +
 +	rmap_item = get_next_rmap_item(slot, &hash);
 +	if (!rmap_item)
 +		goto out1;
 +
 +	if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
 +		goto out2;
 +
 +	cmp_and_merge_page(rmap_item, hash);
 +out2:
 +	put_page(rmap_item->page);
 +out1:
 +	slot->pages_scanned++;
 +	if (slot->fully_scanned_round != fully_scanned_round)
 +		scanned_virtual_pages++;
 +
 +	if (vma_fully_scanned(slot))
 +		slot->fully_scanned_round = fully_scanned_round;
 +}
 +
 +static inline unsigned long rung_get_pages(struct scan_rung *rung)
 +{
 +	struct slot_tree_node *node;
 +
 +	if (!rung->vma_root.rnode)
 +		return 0;
 +
 +	node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
 +
 +	return node->size;
 +}
 +
 +#define RUNG_SAMPLED_MIN	3
 +
 +static inline
 +void uksm_calc_rung_step(struct scan_rung *rung,
 +			 unsigned long page_time, unsigned long ratio)
 +{
 +	unsigned long sampled, pages;
 +
 +	/* will be fully scanned ? */
 +	if (!rung->cover_msecs) {
 +		rung->step = 1;
 +		return;
 +	}
 +
 +	sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
 +		  * ratio / page_time;
 +
 +	/*
 +	 *  Before we finsish a scan round and expensive per-round jobs,
 +	 *  we need to have a chance to estimate the per page time. So
 +	 *  the sampled number can not be too small.
 +	 */
 +	if (sampled < RUNG_SAMPLED_MIN)
 +		sampled = RUNG_SAMPLED_MIN;
 +
 +	pages = rung_get_pages(rung);
 +	if (likely(pages > sampled))
 +		rung->step = pages / sampled;
 +	else
 +		rung->step = 1;
 +}
 +
 +static inline int step_need_recalc(struct scan_rung *rung)
 +{
 +	unsigned long pages, stepmax;
 +
 +	pages = rung_get_pages(rung);
 +	stepmax = pages / RUNG_SAMPLED_MIN;
 +
 +	return pages && (rung->step > pages ||
 +			 (stepmax && rung->step > stepmax));
 +}
 +
 +static inline
 +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
 +{
 +	struct vma_slot *slot;
 +
 +	if (finished)
 +		rung->flags |= UKSM_RUNG_ROUND_FINISHED;
 +
 +	if (step_recalc || step_need_recalc(rung)) {
 +		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
 +		BUG_ON(step_need_recalc(rung));
 +	}
 +
 +	slot_iter_index = prandom_u32() % rung->step;
 +	BUG_ON(!rung->vma_root.rnode);
 +	slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
 +	BUG_ON(!slot);
 +
 +	rung->current_scan = slot;
 +	rung->current_offset = slot_iter_index;
 +}
 +
 +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
 +{
 +	return &slot->rung->vma_root;
 +}
 +
 +/*
 + * return if resetted.
 + */
 +static int advance_current_scan(struct scan_rung *rung)
 +{
 +	unsigned short n;
 +	struct vma_slot *slot, *next = NULL;
 +
 +	BUG_ON(!rung->vma_root.num);
 +
 +	slot = rung->current_scan;
 +	n = (slot->pages - rung->current_offset) % rung->step;
 +	slot_iter_index = rung->step - n;
 +	next = sradix_tree_next(&rung->vma_root, slot->snode,
 +				slot->sindex, slot_iter);
 +
 +	if (next) {
 +		rung->current_offset = slot_iter_index;
 +		rung->current_scan = next;
 +		return 0;
 +	} else {
 +		reset_current_scan(rung, 1, 0);
 +		return 1;
 +	}
 +}
 +
 +static inline void rung_rm_slot(struct vma_slot *slot)
 +{
 +	struct scan_rung *rung = slot->rung;
 +	struct sradix_tree_root *root;
 +
 +	if (rung->current_scan == slot)
 +		advance_current_scan(rung);
 +
 +	root = slot_get_root(slot);
 +	sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
 +	slot->snode = NULL;
 +	if (step_need_recalc(rung)) {
 +		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
 +		BUG_ON(step_need_recalc(rung));
 +	}
 +
 +	/* In case advance_current_scan loop back to this slot again */
 +	if (rung->vma_root.num && rung->current_scan == slot)
 +		reset_current_scan(slot->rung, 1, 0);
 +}
 +
 +static inline void rung_add_new_slots(struct scan_rung *rung,
 +			struct vma_slot **slots, unsigned long num)
 +{
 +	int err;
 +	struct vma_slot *slot;
 +	unsigned long i;
 +	struct sradix_tree_root *root = &rung->vma_root;
 +
 +	err = sradix_tree_enter(root, (void **)slots, num);
 +	BUG_ON(err);
 +
 +	for (i = 0; i < num; i++) {
 +		slot = slots[i];
 +		slot->rung = rung;
 +		BUG_ON(vma_fully_scanned(slot));
 +	}
 +
 +	if (rung->vma_root.num == num)
 +		reset_current_scan(rung, 0, 1);
 +}
 +
 +static inline int rung_add_one_slot(struct scan_rung *rung,
 +				     struct vma_slot *slot)
 +{
 +	int err;
 +
 +	err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
 +	if (err)
 +		return err;
 +
 +	slot->rung = rung;
 +	if (rung->vma_root.num == 1)
 +		reset_current_scan(rung, 0, 1);
 +
 +	return 0;
 +}
 +
 +/*
 + * Return true if the slot is deleted from its rung.
 + */
 +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
 +{
 +	struct scan_rung *old_rung = slot->rung;
 +	int err;
 +
 +	if (old_rung == rung)
 +		return 0;
 +
 +	rung_rm_slot(slot);
 +	err = rung_add_one_slot(rung, slot);
 +	if (err) {
 +		err = rung_add_one_slot(old_rung, slot);
 +		WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
 +	}
 +
 +	return 1;
 +}
 +
 +static inline int vma_rung_up(struct vma_slot *slot)
 +{
 +	struct scan_rung *rung;
 +
 +	rung = slot->rung;
 +	if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
 +		rung++;
 +
 +	return vma_rung_enter(slot, rung);
 +}
 +
 +static inline int vma_rung_down(struct vma_slot *slot)
 +{
 +	struct scan_rung *rung;
 +
 +	rung = slot->rung;
 +	if (slot->rung != &uksm_scan_ladder[0])
 +		rung--;
 +
 +	return vma_rung_enter(slot, rung);
 +}
 +
 +/**
 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
 + */
 +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
 +{
 +	unsigned long ret;
 +
 +	BUG_ON(slot->pages_scanned == slot->last_scanned);
 +
 +	ret = slot->pages_merged;
 +
 +	/* Thrashing area filtering */
 +	if (ret && uksm_thrash_threshold) {
 +		if (slot->pages_cowed * 100 / slot->pages_merged
 +		    > uksm_thrash_threshold) {
 +			ret = 0;
 +		} else {
 +			ret = slot->pages_merged - slot->pages_cowed;
 +		}
 +	}
 +
 +	return ret;
 +}
 +
 +/**
 + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
 + */
 +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
 +{
 +	unsigned long ret;
 +	unsigned long pages_scanned;
 +
 +	pages_scanned = slot->pages_scanned;
 +	if (!pages_scanned) {
 +		if (uksm_thrash_threshold)
 +			return 0;
 +		else
 +			pages_scanned = slot->pages_scanned;
 +	}
 +
 +	ret = slot->pages_bemerged * 100 / pages_scanned;
 +
 +	/* Thrashing area filtering */
 +	if (ret && uksm_thrash_threshold) {
 +		if (slot->pages_cowed * 100 / slot->pages_bemerged
 +		    > uksm_thrash_threshold) {
 +			ret = 0;
 +		} else {
 +			ret = slot->pages_bemerged - slot->pages_cowed;
 +		}
 +	}
 +
 +	return ret;
 +}
 +
 +/**
 + * stable_node_reinsert() - When the hash_strength has been adjusted, the
 + * stable tree need to be restructured, this is the function re-inserting the
 + * stable node.
 + */
 +static inline void stable_node_reinsert(struct stable_node *new_node,
 +					struct page *page,
 +					struct rb_root *root_treep,
 +					struct list_head *tree_node_listp,
 +					u32 hash)
 +{
 +	struct rb_node **new = &root_treep->rb_node;
 +	struct rb_node *parent = NULL;
 +	struct stable_node *stable_node;
 +	struct tree_node *tree_node;
 +	struct page *tree_page;
 +	int cmp;
 +
 +	while (*new) {
 +		int cmp;
 +
 +		tree_node = rb_entry(*new, struct tree_node, node);
 +
 +		cmp = hash_cmp(hash, tree_node->hash);
 +
 +		if (cmp < 0) {
 +			parent = *new;
 +			new = &parent->rb_left;
 +		} else if (cmp > 0) {
 +			parent = *new;
 +			new = &parent->rb_right;
 +		} else
 +			break;
 +	}
 +
 +	if (*new) {
 +		/* find a stable tree node with same first level hash value */
 +		stable_node_hash_max(new_node, page, hash);
 +		if (tree_node->count == 1) {
 +			stable_node = rb_entry(tree_node->sub_root.rb_node,
 +					       struct stable_node, node);
 +			tree_page = get_uksm_page(stable_node, 1, 0);
 +			if (tree_page) {
 +				stable_node_hash_max(stable_node,
 +						      tree_page, hash);
 +				put_page(tree_page);
 +
 +				/* prepare for stable node insertion */
 +
 +				cmp = hash_cmp(new_node->hash_max,
 +						   stable_node->hash_max);
 +				parent = &stable_node->node;
 +				if (cmp < 0)
 +					new = &parent->rb_left;
 +				else if (cmp > 0)
 +					new = &parent->rb_right;
 +				else
 +					goto failed;
 +
 +				goto add_node;
 +			} else {
 +				/* the only stable_node deleted, the tree node
 +				 * was not deleted.
 +				 */
 +				goto tree_node_reuse;
 +			}
 +		}
 +
 +		/* well, search the collision subtree */
 +		new = &tree_node->sub_root.rb_node;
 +		parent = NULL;
 +		BUG_ON(!*new);
 +		while (*new) {
 +			int cmp;
 +
 +			stable_node = rb_entry(*new, struct stable_node, node);
 +
 +			cmp = hash_cmp(new_node->hash_max,
 +					   stable_node->hash_max);
 +
 +			if (cmp < 0) {
 +				parent = *new;
 +				new = &parent->rb_left;
 +			} else if (cmp > 0) {
 +				parent = *new;
 +				new = &parent->rb_right;
 +			} else {
 +				/* oh, no, still a collision */
 +				goto failed;
 +			}
 +		}
 +
 +		goto add_node;
 +	}
 +
 +	/* no tree node found */
 +	tree_node = alloc_tree_node(tree_node_listp);
 +	if (!tree_node) {
 +		printk(KERN_ERR "UKSM: memory allocation error!\n");
 +		goto failed;
 +	} else {
 +		tree_node->hash = hash;
 +		rb_link_node(&tree_node->node, parent, new);
 +		rb_insert_color(&tree_node->node, root_treep);
 +
 +tree_node_reuse:
 +		/* prepare for stable node insertion */
 +		parent = NULL;
 +		new = &tree_node->sub_root.rb_node;
 +	}
 +
 +add_node:
 +	rb_link_node(&new_node->node, parent, new);
 +	rb_insert_color(&new_node->node, &tree_node->sub_root);
 +	new_node->tree_node = tree_node;
 +	tree_node->count++;
 +	return;
 +
 +failed:
 +	/* This can only happen when two nodes have collided
 +	 * in two levels.
 +	 */
 +	new_node->tree_node = NULL;
 +	return;
 +}
 +
 +static inline void free_all_tree_nodes(struct list_head *list)
 +{
 +	struct tree_node *node, *tmp;
 +
 +	list_for_each_entry_safe(node, tmp, list, all_list) {
 +		free_tree_node(node);
 +	}
 +}
 +
 +/**
 + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
 + * strength to the current hash_strength. It re-structures the hole tree.
 + */
 +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
 +{
 +	struct stable_node *node, *tmp;
 +	struct rb_root *root_new_treep;
 +	struct list_head *new_tree_node_listp;
 +
 +	stable_tree_index = (stable_tree_index + 1) % 2;
 +	root_new_treep = &root_stable_tree[stable_tree_index];
 +	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
 +	*root_new_treep = RB_ROOT;
 +	BUG_ON(!list_empty(new_tree_node_listp));
 +
 +	/*
 +	 * we need to be safe, the node could be removed by get_uksm_page()
 +	 */
 +	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
 +		void *addr;
 +		struct page *node_page;
 +		u32 hash;
 +
 +		/*
 +		 * We are completely re-structuring the stable nodes to a new
 +		 * stable tree. We don't want to touch the old tree unlinks and
 +		 * old tree_nodes. The old tree_nodes will be freed at once.
 +		 */
 +		node_page = get_uksm_page(node, 0, 0);
 +		if (!node_page)
 +			continue;
 +
 +		if (node->tree_node) {
 +			hash = node->tree_node->hash;
 +
 +			addr = kmap_atomic(node_page);
 +
 +			hash = delta_hash(addr, prev_hash_strength,
 +					  hash_strength, hash);
 +			kunmap_atomic(addr);
 +		} else {
 +			/*
 +			 *it was not inserted to rbtree due to collision in last
 +			 *round scan.
 +			 */
 +			hash = page_hash(node_page, hash_strength, 0);
 +		}
 +
 +		stable_node_reinsert(node, node_page, root_new_treep,
 +				     new_tree_node_listp, hash);
 +		put_page(node_page);
 +	}
 +
 +	root_stable_treep = root_new_treep;
 +	free_all_tree_nodes(stable_tree_node_listp);
 +	BUG_ON(!list_empty(stable_tree_node_listp));
 +	stable_tree_node_listp = new_tree_node_listp;
 +}
 +
 +static inline void inc_hash_strength(unsigned long delta)
 +{
 +	hash_strength += 1 << delta;
 +	if (hash_strength > HASH_STRENGTH_MAX)
 +		hash_strength = HASH_STRENGTH_MAX;
 +}
 +
 +static inline void dec_hash_strength(unsigned long delta)
 +{
 +	unsigned long change = 1 << delta;
 +
 +	if (hash_strength <= change + 1)
 +		hash_strength = 1;
 +	else
 +		hash_strength -= change;
 +}
 +
 +static inline void inc_hash_strength_delta(void)
 +{
 +	hash_strength_delta++;
 +	if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
 +		hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
 +}
 +
 +/*
 +static inline unsigned long get_current_neg_ratio(void)
 +{
 +	if (!rshash_pos || rshash_neg > rshash_pos)
 +		return 100;
 +
 +	return div64_u64(100 * rshash_neg , rshash_pos);
 +}
 +*/
 +
 +static inline unsigned long get_current_neg_ratio(void)
 +{
 +	u64 pos = benefit.pos;
 +	u64 neg = benefit.neg;
 +
 +	if (!neg)
 +		return 0;
 +
 +	if (!pos || neg > pos)
 +		return 100;
 +
 +	if (neg > div64_u64(U64_MAX, 100))
 +		pos = div64_u64(pos, 100);
 +	else
 +		neg *= 100;
 +
 +	return div64_u64(neg, pos);
 +}
 +
 +static inline unsigned long get_current_benefit(void)
 +{
 +	u64 pos = benefit.pos;
 +	u64 neg = benefit.neg;
 +	u64 scanned = benefit.scanned;
 +
 +	if (neg > pos)
 +		return 0;
 +
 +	return div64_u64((pos - neg), scanned);
 +}
 +
 +static inline int judge_rshash_direction(void)
 +{
 +	u64 current_neg_ratio, stable_benefit;
 +	u64 current_benefit, delta = 0;
 +	int ret = STILL;
 +
 +	/* Try to probe a value after the boot, and in case the system
 +	   are still for a long time. */
 +	if ((fully_scanned_round & 0xFFULL) == 10) {
 +		ret = OBSCURE;
 +		goto out;
 +	}
 +
 +	current_neg_ratio = get_current_neg_ratio();
 +
 +	if (current_neg_ratio == 0) {
 +		rshash_neg_cont_zero++;
 +		if (rshash_neg_cont_zero > 2)
 +			return GO_DOWN;
 +		else
 +			return STILL;
 +	}
 +	rshash_neg_cont_zero = 0;
 +
 +	if (current_neg_ratio > 90) {
 +		ret = GO_UP;
 +		goto out;
 +	}
 +
 +	current_benefit = get_current_benefit();
 +	stable_benefit = rshash_state.stable_benefit;
 +
 +	if (!stable_benefit) {
 +		ret = OBSCURE;
 +		goto out;
 +	}
 +
 +	if (current_benefit > stable_benefit)
 +		delta = current_benefit - stable_benefit;
 +	else if (current_benefit < stable_benefit)
 +		delta = stable_benefit - current_benefit;
 +
 +	delta = div64_u64(100 * delta , stable_benefit);
 +
 +	if (delta > 50) {
 +		rshash_cont_obscure++;
 +		if (rshash_cont_obscure > 2)
 +			return OBSCURE;
 +		else
 +			return STILL;
 +	}
 +
 +out:
 +	rshash_cont_obscure = 0;
 +	return ret;
 +}
 +
 +/**
 + * rshash_adjust() - The main function to control the random sampling state
 + * machine for hash strength adapting.
 + *
 + * return true if hash_strength has changed.
 + */
 +static inline int rshash_adjust(void)
 +{
 +	unsigned long prev_hash_strength = hash_strength;
 +
 +	if (!encode_benefit())
 +		return 0;
 +
 +	switch (rshash_state.state) {
 +	case RSHASH_STILL:
 +		switch (judge_rshash_direction()) {
 +		case GO_UP:
 +			if (rshash_state.pre_direct == GO_DOWN)
 +				hash_strength_delta = 0;
 +
 +			inc_hash_strength(hash_strength_delta);
 +			inc_hash_strength_delta();
 +			rshash_state.stable_benefit = get_current_benefit();
 +			rshash_state.pre_direct = GO_UP;
 +			break;
 +
 +		case GO_DOWN:
 +			if (rshash_state.pre_direct == GO_UP)
 +				hash_strength_delta = 0;
 +
 +			dec_hash_strength(hash_strength_delta);
 +			inc_hash_strength_delta();
 +			rshash_state.stable_benefit = get_current_benefit();
 +			rshash_state.pre_direct = GO_DOWN;
 +			break;
 +
 +		case OBSCURE:
 +			rshash_state.stable_point = hash_strength;
 +			rshash_state.turn_point_down = hash_strength;
 +			rshash_state.turn_point_up = hash_strength;
 +			rshash_state.turn_benefit_down = get_current_benefit();
 +			rshash_state.turn_benefit_up = get_current_benefit();
 +			rshash_state.lookup_window_index = 0;
 +			rshash_state.state = RSHASH_TRYDOWN;
 +			dec_hash_strength(hash_strength_delta);
 +			inc_hash_strength_delta();
 +			break;
 +
 +		case STILL:
 +			break;
 +		default:
 +			BUG();
 +		}
 +		break;
 +
 +	case RSHASH_TRYDOWN:
 +		if (rshash_state.lookup_window_index++ % 5 == 0)
 +			rshash_state.below_count = 0;
 +
 +		if (get_current_benefit() < rshash_state.stable_benefit)
 +			rshash_state.below_count++;
 +		else if (get_current_benefit() >
 +			 rshash_state.turn_benefit_down) {
 +			rshash_state.turn_point_down = hash_strength;
 +			rshash_state.turn_benefit_down = get_current_benefit();
 +		}
 +
 +		if (rshash_state.below_count >= 3 ||
 +		    judge_rshash_direction() == GO_UP ||
 +		    hash_strength == 1) {
 +			hash_strength = rshash_state.stable_point;
 +			hash_strength_delta = 0;
 +			inc_hash_strength(hash_strength_delta);
 +			inc_hash_strength_delta();
 +			rshash_state.lookup_window_index = 0;
 +			rshash_state.state = RSHASH_TRYUP;
 +			hash_strength_delta = 0;
 +		} else {
 +			dec_hash_strength(hash_strength_delta);
 +			inc_hash_strength_delta();
 +		}
 +		break;
 +
 +	case RSHASH_TRYUP:
 +		if (rshash_state.lookup_window_index++ % 5 == 0)
 +			rshash_state.below_count = 0;
 +
 +		if (get_current_benefit() < rshash_state.turn_benefit_down)
 +			rshash_state.below_count++;
 +		else if (get_current_benefit() > rshash_state.turn_benefit_up) {
 +			rshash_state.turn_point_up = hash_strength;
 +			rshash_state.turn_benefit_up = get_current_benefit();
 +		}
 +
 +		if (rshash_state.below_count >= 3 ||
 +		    judge_rshash_direction() == GO_DOWN ||
 +		    hash_strength == HASH_STRENGTH_MAX) {
 +			hash_strength = rshash_state.turn_benefit_up >
 +				rshash_state.turn_benefit_down ?
 +				rshash_state.turn_point_up :
 +				rshash_state.turn_point_down;
 +
 +			rshash_state.state = RSHASH_PRE_STILL;
 +		} else {
 +			inc_hash_strength(hash_strength_delta);
 +			inc_hash_strength_delta();
 +		}
 +
 +		break;
 +
 +	case RSHASH_NEW:
 +	case RSHASH_PRE_STILL:
 +		rshash_state.stable_benefit = get_current_benefit();
 +		rshash_state.state = RSHASH_STILL;
 +		hash_strength_delta = 0;
 +		break;
 +	default:
 +		BUG();
 +	}
 +
 +	/* rshash_neg = rshash_pos = 0; */
 +	reset_benefit();
 +
 +	if (prev_hash_strength != hash_strength)
 +		stable_tree_delta_hash(prev_hash_strength);
 +
 +	return prev_hash_strength != hash_strength;
 +}
 +
 +/**
 + * round_update_ladder() - The main function to do update of all the
 + * adjustments whenever a scan round is finished.
 + */
 +static noinline void round_update_ladder(void)
 +{
 +	int i;
 +	unsigned long dedup;
 +	struct vma_slot *slot, *tmp_slot;
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
 +	}
 +
 +	list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
 +
 +		/* slot may be rung_rm_slot() when mm exits */
 +		if (slot->snode) {
 +			dedup = cal_dedup_ratio_old(slot);
 +			if (dedup && dedup >= uksm_abundant_threshold)
 +				vma_rung_up(slot);
 +		}
 +
 +		slot->pages_bemerged = 0;
 +		slot->pages_cowed = 0;
 +
 +		list_del_init(&slot->dedup_list);
 +	}
 +}
 +
 +static void uksm_del_vma_slot(struct vma_slot *slot)
 +{
 +	int i, j;
 +	struct rmap_list_entry *entry;
 +
 +	if (slot->snode) {
 +		/*
 +		 * In case it just failed when entering the rung, it's not
 +		 * necessary.
 +		 */
 +		rung_rm_slot(slot);
 +	}
 +
 +	if (!list_empty(&slot->dedup_list))
 +		list_del(&slot->dedup_list);
 +
 +	if (!slot->rmap_list_pool || !slot->pool_counts) {
 +		/* In case it OOMed in uksm_vma_enter() */
 +		goto out;
 +	}
 +
 +	for (i = 0; i < slot->pool_size; i++) {
 +		void *addr;
 +
 +		if (!slot->rmap_list_pool[i])
 +			continue;
 +
 +		addr = kmap(slot->rmap_list_pool[i]);
 +		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
 +			entry = (struct rmap_list_entry *)addr + j;
 +			if (is_addr(entry->addr))
 +				continue;
 +			if (!entry->item)
 +				continue;
 +
 +			remove_rmap_item_from_tree(entry->item);
 +			free_rmap_item(entry->item);
 +			slot->pool_counts[i]--;
 +		}
 +		BUG_ON(slot->pool_counts[i]);
 +		kunmap(slot->rmap_list_pool[i]);
 +		__free_page(slot->rmap_list_pool[i]);
 +	}
 +	kfree(slot->rmap_list_pool);
 +	kfree(slot->pool_counts);
 +
 +out:
 +	slot->rung = NULL;
 +	if (slot->flags & UKSM_SLOT_IN_UKSM) {
 +		BUG_ON(uksm_pages_total < slot->pages);
 +		uksm_pages_total -= slot->pages;
 +	}
 +
 +	if (slot->fully_scanned_round == fully_scanned_round)
 +		scanned_virtual_pages -= slot->pages;
 +	else
 +		scanned_virtual_pages -= slot->pages_scanned;
 +	free_vma_slot(slot);
 +}
 +
 +
 +#define SPIN_LOCK_PERIOD	32
 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
 +static inline void cleanup_vma_slots(void)
 +{
 +	struct vma_slot *slot;
 +	int i;
 +
 +	i = 0;
 +	spin_lock(&vma_slot_list_lock);
 +	while (!list_empty(&vma_slot_del)) {
 +		slot = list_entry(vma_slot_del.next,
 +				  struct vma_slot, slot_list);
 +		list_del(&slot->slot_list);
 +		cleanup_slots[i++] = slot;
 +		if (i == SPIN_LOCK_PERIOD) {
 +			spin_unlock(&vma_slot_list_lock);
 +			while (--i >= 0)
 +				uksm_del_vma_slot(cleanup_slots[i]);
 +			i = 0;
 +			spin_lock(&vma_slot_list_lock);
 +		}
 +	}
 +	spin_unlock(&vma_slot_list_lock);
 +
 +	while (--i >= 0)
 +		uksm_del_vma_slot(cleanup_slots[i]);
 +}
 +
 +/*
 +*expotional moving average formula
 +*/
 +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
 +{
 +	/*
 +	 * For a very high burst, even the ema cannot work well, a false very
 +	 * high per-page time estimation can result in feedback in very high
 +	 * overhead of context swith and rung update -- this will then lead
 +	 * to higher per-paper time, this may not converge.
 +	 *
 +	 * Instead, we try to approach this value in a binary manner.
 +	 */
 +	if (curr > last_ema * 10)
 +		return last_ema * 2;
 +
 +	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
 +}
 +
 +/*
 + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
 + * nanoseconds based on current uksm_sleep_jiffies.
 + */
 +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
 +{
 +	return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
 +		(TIME_RATIO_SCALE - ratio) * ratio;
 +}
 +
 +
 +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
 +{
 +	unsigned long ret;
 +
 +	BUG_ON(!cpu_time_ratio);
 +
 +	if (cpu_time_ratio > 0)
 +		ret = cpu_time_ratio;
 +	else
 +		ret = (unsigned long)(-cpu_time_ratio) *
 +			uksm_max_cpu_percentage / 100UL;
 +
 +	return ret ? ret : 1;
 +}
 +
 +static noinline void uksm_calc_scan_pages(void)
 +{
 +	struct scan_rung *ladder = uksm_scan_ladder;
 +	unsigned long sleep_usecs, nsecs;
 +	unsigned long ratio;
 +	int i;
 +	unsigned long per_page;
 +
 +	if (uksm_ema_page_time > 100000 ||
 +	    (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
 +		uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
 +
 +	per_page = uksm_ema_page_time;
 +	BUG_ON(!per_page);
 +
 +	/*
 +	 * For every 8 eval round, we try to probe a uksm_sleep_jiffies value
 +	 * based on saved user input.
 +	 */
 +	if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
 +		uksm_sleep_jiffies = uksm_sleep_saved;
 +
 +	/* We require a rung scan at least 1 page in a period. */
 +	nsecs = per_page;
 +	ratio = rung_real_ratio(ladder[0].cpu_ratio);
 +	if (cpu_ratio_to_nsec(ratio) < nsecs) {
 +		sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
 +				/ NSEC_PER_USEC;
 +		uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
 +	}
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		ratio = rung_real_ratio(ladder[i].cpu_ratio);
 +		ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
 +					per_page;
 +		BUG_ON(!ladder[i].pages_to_scan);
 +		uksm_calc_rung_step(&ladder[i], per_page, ratio);
 +	}
 +}
 +
 +/*
 + * From the scan time of this round (ns) to next expected min sleep time
 + * (ms), be careful of the possible overflows. ratio is taken from
 + * rung_real_ratio()
 + */
 +static inline
 +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
 +{
 +	scan_time >>= 20; /* to msec level now */
 +	BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
 +
 +	return (unsigned int) ((unsigned long) scan_time *
 +			       (TIME_RATIO_SCALE - ratio) / ratio);
 +}
 +
 +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
 +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
 +
 +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
 +{
 +	struct scan_rung *rung;
 +
 +	rung = &uksm_scan_ladder[0];
 +	rung_add_new_slots(rung, slots, num);
 +}
 +
 +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
 +
 +static void uksm_enter_all_slots(void)
 +{
 +	struct vma_slot *slot;
 +	unsigned long index;
 +	struct list_head empty_vma_list;
 +	int i;
 +
 +	i = 0;
 +	index = 0;
 +	INIT_LIST_HEAD(&empty_vma_list);
 +
 +	spin_lock(&vma_slot_list_lock);
 +	while (!list_empty(&vma_slot_new)) {
 +		slot = list_entry(vma_slot_new.next,
 +				  struct vma_slot, slot_list);
 +
 +		if (!slot->vma->anon_vma) {
 +			list_move(&slot->slot_list, &empty_vma_list);
 +		} else if (vma_can_enter(slot->vma)) {
 +			batch_slots[index++] = slot;
 +			list_del_init(&slot->slot_list);
 +		} else {
 +			list_move(&slot->slot_list, &vma_slot_noadd);
 +		}
 +
 +		if (++i == SPIN_LOCK_PERIOD ||
 +		    (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
 +			spin_unlock(&vma_slot_list_lock);
 +
 +			if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
 +				uksm_vma_enter(batch_slots, index);
 +				index = 0;
 +			}
 +			i = 0;
 +			cond_resched();
 +			spin_lock(&vma_slot_list_lock);
 +		}
 +	}
 +
 +	list_splice(&empty_vma_list, &vma_slot_new);
 +
 +	spin_unlock(&vma_slot_list_lock);
 +
 +	if (index)
 +		uksm_vma_enter(batch_slots, index);
 +
 +}
 +
 +static inline int rung_round_finished(struct scan_rung *rung)
 +{
 +	return rung->flags & UKSM_RUNG_ROUND_FINISHED;
 +}
 +
 +static inline void judge_slot(struct vma_slot *slot)
 +{
 +	struct scan_rung *rung = slot->rung;
 +	unsigned long dedup;
 +	int deleted;
 +
 +	dedup = cal_dedup_ratio(slot);
 +	if (vma_fully_scanned(slot) && uksm_thrash_threshold)
 +		deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
 +	else if (dedup && dedup >= uksm_abundant_threshold)
 +		deleted = vma_rung_up(slot);
 +	else
 +		deleted = vma_rung_down(slot);
 +
 +	slot->pages_merged = 0;
 +	slot->pages_cowed = 0;
 +
 +	if (vma_fully_scanned(slot))
 +		slot->pages_scanned = 0;
 +
 +	slot->last_scanned = slot->pages_scanned;
 +
 +	/* If its deleted in above, then rung was already advanced. */
 +	if (!deleted)
 +		advance_current_scan(rung);
 +}
 +
 +
 +static inline int hash_round_finished(void)
 +{
 +	if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
 +		scanned_virtual_pages = 0;
 +		if (uksm_pages_scanned)
 +			fully_scanned_round++;
 +
 +		return 1;
 +	} else {
 +		return 0;
 +	}
 +}
 +
 +#define UKSM_MMSEM_BATCH	5
 +#define BUSY_RETRY		100
 +
 +/**
 + * uksm_do_scan()  - the main worker function.
 + */
 +static noinline void uksm_do_scan(void)
 +{
 +	struct vma_slot *slot, *iter;
 +	struct mm_struct *busy_mm;
 +	unsigned char round_finished, all_rungs_emtpy;
 +	int i, err, mmsem_batch;
 +	unsigned long pcost;
 +	long long delta_exec;
 +	unsigned long vpages, max_cpu_ratio;
 +	unsigned long long start_time, end_time, scan_time;
 +	unsigned int expected_jiffies;
 +
 +	might_sleep();
 +
 +	vpages = 0;
 +
 +	start_time = task_sched_runtime(current);
 +	max_cpu_ratio = 0;
 +	mmsem_batch = 0;
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE;) {
 +		struct scan_rung *rung = &uksm_scan_ladder[i];
 +		unsigned long ratio;
 +		int busy_retry;
 +
 +		if (!rung->pages_to_scan) {
 +			i++;
 +			continue;
 +		}
 +
 +		if (!rung->vma_root.num) {
 +			rung->pages_to_scan = 0;
 +			i++;
 +			continue;
 +		}
 +
 +		ratio = rung_real_ratio(rung->cpu_ratio);
 +		if (ratio > max_cpu_ratio)
 +			max_cpu_ratio = ratio;
 +
 +		busy_retry = BUSY_RETRY;
 +		/*
 +		 * Do not consider rung_round_finished() here, just used up the
 +		 * rung->pages_to_scan quota.
 +		 */
 +		while (rung->pages_to_scan && rung->vma_root.num &&
 +		       likely(!freezing(current))) {
 +			int reset = 0;
 +
 +			slot = rung->current_scan;
 +
 +			BUG_ON(vma_fully_scanned(slot));
 +
 +			if (mmsem_batch) {
 +				err = 0;
 +			} else {
 +				err = try_down_read_slot_mmap_sem(slot);
 +			}
 +
 +			if (err == -ENOENT) {
 +rm_slot:
 +				rung_rm_slot(slot);
 +				continue;
 +			}
 +
 +			busy_mm = slot->mm;
 +
 +			if (err == -EBUSY) {
 +				/* skip other vmas on the same mm */
 +				do {
 +					reset = advance_current_scan(rung);
 +					iter = rung->current_scan;
 +					busy_retry--;
 +					if (iter->vma->vm_mm != busy_mm ||
 +					    !busy_retry || reset)
 +						break;
 +				} while (1);
 +
 +				if (iter->vma->vm_mm != busy_mm) {
 +					continue;
 +				} else {
 +					/* scan round finsished */
 +					break;
 +				}
 +			}
 +
 +			BUG_ON(!vma_can_enter(slot->vma));
 +			if (uksm_test_exit(slot->vma->vm_mm)) {
 +				mmsem_batch = 0;
 +				up_read(&slot->vma->vm_mm->mmap_sem);
 +				goto rm_slot;
 +			}
 +
 +			if (mmsem_batch)
 +				mmsem_batch--;
 +			else
 +				mmsem_batch = UKSM_MMSEM_BATCH;
 +
 +			/* Ok, we have take the mmap_sem, ready to scan */
 +			scan_vma_one_page(slot);
 +			rung->pages_to_scan--;
 +			vpages++;
 +
 +			if (rung->current_offset + rung->step > slot->pages - 1
 +			    || vma_fully_scanned(slot)) {
 +				up_read(&slot->vma->vm_mm->mmap_sem);
 +				judge_slot(slot);
 +				mmsem_batch = 0;
 +			} else {
 +				rung->current_offset += rung->step;
 +				if (!mmsem_batch)
 +					up_read(&slot->vma->vm_mm->mmap_sem);
 +			}
 +
 +			busy_retry = BUSY_RETRY;
 +			cond_resched();
 +		}
 +
 +		if (mmsem_batch) {
 +			up_read(&slot->vma->vm_mm->mmap_sem);
 +			mmsem_batch = 0;
 +		}
 +
 +		if (freezing(current))
 +			break;
 +
 +		cond_resched();
 +	}
 +	end_time = task_sched_runtime(current);
 +	delta_exec = end_time - start_time;
 +
 +	if (freezing(current))
 +		return;
 +
 +	cleanup_vma_slots();
 +	uksm_enter_all_slots();
 +
 +	round_finished = 1;
 +	all_rungs_emtpy = 1;
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		struct scan_rung *rung = &uksm_scan_ladder[i];
 +
 +		if (rung->vma_root.num) {
 +			all_rungs_emtpy = 0;
 +			if (!rung_round_finished(rung))
 +				round_finished = 0;
 +		}
 +	}
 +
 +	if (all_rungs_emtpy)
 +		round_finished = 0;
 +
 +	if (round_finished) {
 +		round_update_ladder();
 +		uksm_eval_round++;
 +
 +		if (hash_round_finished() && rshash_adjust()) {
 +			/* Reset the unstable root iff hash strength changed */
 +			uksm_hash_round++;
 +			root_unstable_tree = RB_ROOT;
 +			free_all_tree_nodes(&unstable_tree_node_list);
 +		}
 +
 +		/*
 +		 * A number of pages can hang around indefinitely on per-cpu
 +		 * pagevecs, raised page count preventing write_protect_page
 +		 * from merging them.  Though it doesn't really matter much,
 +		 * it is puzzling to see some stuck in pages_volatile until
 +		 * other activity jostles them out, and they also prevented
 +		 * LTP's KSM test from succeeding deterministically; so drain
 +		 * them here (here rather than on entry to uksm_do_scan(),
 +		 * so we don't IPI too often when pages_to_scan is set low).
 +		 */
 +		lru_add_drain_all();
 +	}
 +
 +
 +	if (vpages && delta_exec > 0) {
 +		pcost = (unsigned long) delta_exec / vpages;
 +		if (likely(uksm_ema_page_time))
 +			uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
 +		else
 +			uksm_ema_page_time = pcost;
 +	}
 +
 +	uksm_calc_scan_pages();
 +	uksm_sleep_real = uksm_sleep_jiffies;
 +	/* in case of radical cpu bursts, apply the upper bound */
 +	end_time = task_sched_runtime(current);
 +	if (max_cpu_ratio && end_time > start_time) {
 +		scan_time = end_time - start_time;
 +		expected_jiffies = msecs_to_jiffies(
 +			scan_time_to_sleep(scan_time, max_cpu_ratio));
 +
 +		if (expected_jiffies > uksm_sleep_real)
 +			uksm_sleep_real = expected_jiffies;
 +
 +		/* We have a 1 second up bound for responsiveness. */
 +		if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
 +			uksm_sleep_real = msecs_to_jiffies(1000);
 +	}
 +
 +	return;
 +}
 +
 +static int ksmd_should_run(void)
 +{
 +	return uksm_run & UKSM_RUN_MERGE;
 +}
 +
 +static int uksm_scan_thread(void *nothing)
 +{
 +	set_freezable();
 +	set_user_nice(current, 5);
 +
 +	while (!kthread_should_stop()) {
 +		mutex_lock(&uksm_thread_mutex);
 +		if (ksmd_should_run()) {
 +			uksm_do_scan();
 +		}
 +		mutex_unlock(&uksm_thread_mutex);
 +
 +		try_to_freeze();
 +
 +		if (ksmd_should_run()) {
 +			schedule_timeout_interruptible(uksm_sleep_real);
 +			uksm_sleep_times++;
 +		} else {
 +			wait_event_freezable(uksm_thread_wait,
 +				ksmd_should_run() || kthread_should_stop());
 +		}
 +	}
 +	return 0;
 +}
 +
 +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 +{
 +	struct stable_node *stable_node;
 +	struct node_vma *node_vma;
 +	struct rmap_item *rmap_item;
 +	int ret = SWAP_AGAIN;
 +	int search_new_forks = 0;
 +	unsigned long address;
 +
 +	VM_BUG_ON_PAGE(!PageKsm(page), page);
 +	VM_BUG_ON_PAGE(!PageLocked(page), page);
 +
 +	stable_node = page_stable_node(page);
 +	if (!stable_node)
 +		return ret;
 +again:
 +	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
 +		hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
 +			struct anon_vma *anon_vma = rmap_item->anon_vma;
 +			struct anon_vma_chain *vmac;
 +			struct vm_area_struct *vma;
 +
 +			anon_vma_lock_read(anon_vma);
 +			anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
 +						       0, ULONG_MAX) {
 +				vma = vmac->vma;
 +				address = get_rmap_addr(rmap_item);
 +
 +				if (address < vma->vm_start ||
 +				    address >= vma->vm_end)
 +					continue;
 +
 +				if ((rmap_item->slot->vma == vma) ==
 +				    search_new_forks)
 +					continue;
 +
 +				if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 +					continue;
 +
 +				ret = rwc->rmap_one(page, vma, address, rwc->arg);
 +				if (ret != SWAP_AGAIN) {
 +					anon_vma_unlock_read(anon_vma);
 +					goto out;
 +				}
 +
 +				if (rwc->done && rwc->done(page)) {
 +					anon_vma_unlock_read(anon_vma);
 +					goto out;
 +				}
 +			}
 +			anon_vma_unlock_read(anon_vma);
 +		}
 +	}
 +	if (!search_new_forks++)
 +		goto again;
 +out:
 +	return ret;
 +}
 +
 +#ifdef CONFIG_MIGRATION
 +/* Common ksm interface but may be specific to uksm */
 +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 +{
 +	struct stable_node *stable_node;
 +
 +	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
 +	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 +	VM_BUG_ON(newpage->mapping != oldpage->mapping);
 +
 +	stable_node = page_stable_node(newpage);
 +	if (stable_node) {
 +		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
 +		stable_node->kpfn = page_to_pfn(newpage);
 +	}
 +}
 +#endif /* CONFIG_MIGRATION */
 +
 +#ifdef CONFIG_MEMORY_HOTREMOVE
 +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
 +						 unsigned long end_pfn)
 +{
 +	struct rb_node *node;
 +
 +	for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
 +		struct stable_node *stable_node;
 +
 +		stable_node = rb_entry(node, struct stable_node, node);
 +		if (stable_node->kpfn >= start_pfn &&
 +		    stable_node->kpfn < end_pfn)
 +			return stable_node;
 +	}
 +	return NULL;
 +}
 +
 +static int uksm_memory_callback(struct notifier_block *self,
 +			       unsigned long action, void *arg)
 +{
 +	struct memory_notify *mn = arg;
 +	struct stable_node *stable_node;
 +
 +	switch (action) {
 +	case MEM_GOING_OFFLINE:
 +		/*
 +		 * Keep it very simple for now: just lock out ksmd and
 +		 * MADV_UNMERGEABLE while any memory is going offline.
 +		 * mutex_lock_nested() is necessary because lockdep was alarmed
 +		 * that here we take uksm_thread_mutex inside notifier chain
 +		 * mutex, and later take notifier chain mutex inside
 +		 * uksm_thread_mutex to unlock it.   But that's safe because both
 +		 * are inside mem_hotplug_mutex.
 +		 */
 +		mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
 +		break;
 +
 +	case MEM_OFFLINE:
 +		/*
 +		 * Most of the work is done by page migration; but there might
 +		 * be a few stable_nodes left over, still pointing to struct
 +		 * pages which have been offlined: prune those from the tree.
 +		 */
 +		while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
 +					mn->start_pfn + mn->nr_pages)) != NULL)
 +			remove_node_from_stable_tree(stable_node, 1, 1);
 +		/* fallthrough */
 +
 +	case MEM_CANCEL_OFFLINE:
 +		mutex_unlock(&uksm_thread_mutex);
 +		break;
 +	}
 +	return NOTIFY_OK;
 +}
 +#endif /* CONFIG_MEMORY_HOTREMOVE */
 +
 +#ifdef CONFIG_SYSFS
 +/*
 + * This all compiles without CONFIG_SYSFS, but is a waste of space.
 + */
 +
 +#define UKSM_ATTR_RO(_name) \
 +	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 +#define UKSM_ATTR(_name) \
 +	static struct kobj_attribute _name##_attr = \
 +		__ATTR(_name, 0644, _name##_show, _name##_store)
 +
 +static ssize_t max_cpu_percentage_show(struct kobject *kobj,
 +				    struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
 +}
 +
 +static ssize_t max_cpu_percentage_store(struct kobject *kobj,
 +				     struct kobj_attribute *attr,
 +				     const char *buf, size_t count)
 +{
 +	unsigned long max_cpu_percentage;
 +	int err;
 +
 +	err = kstrtoul(buf, 10, &max_cpu_percentage);
 +	if (err || max_cpu_percentage > 100)
 +		return -EINVAL;
 +
 +	if (max_cpu_percentage == 100)
 +		max_cpu_percentage = 99;
 +	else if (max_cpu_percentage < 10)
 +		max_cpu_percentage = 10;
 +
 +	uksm_max_cpu_percentage = max_cpu_percentage;
 +
 +	return count;
 +}
 +UKSM_ATTR(max_cpu_percentage);
 +
 +static ssize_t sleep_millisecs_show(struct kobject *kobj,
 +				    struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
 +}
 +
 +static ssize_t sleep_millisecs_store(struct kobject *kobj,
 +				     struct kobj_attribute *attr,
 +				     const char *buf, size_t count)
 +{
 +	unsigned long msecs;
 +	int err;
 +
 +	err = kstrtoul(buf, 10, &msecs);
 +	if (err || msecs > MSEC_PER_SEC)
 +		return -EINVAL;
 +
 +	uksm_sleep_jiffies = msecs_to_jiffies(msecs);
 +	uksm_sleep_saved = uksm_sleep_jiffies;
 +
 +	return count;
 +}
 +UKSM_ATTR(sleep_millisecs);
 +
 +
 +static ssize_t cpu_governor_show(struct kobject *kobj,
 +				  struct kobj_attribute *attr, char *buf)
 +{
 +	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
 +	int i;
 +
 +	buf[0] = '\0';
 +	for (i = 0; i < n ; i++) {
 +		if (uksm_cpu_governor == i)
 +			strcat(buf, "[");
 +
 +		strcat(buf, uksm_cpu_governor_str[i]);
 +
 +		if (uksm_cpu_governor == i)
 +			strcat(buf, "]");
 +
 +		strcat(buf, " ");
 +	}
 +	strcat(buf, "\n");
 +
 +	return strlen(buf);
 +}
 +
 +static inline void init_performance_values(void)
 +{
 +	int i;
 +	struct scan_rung *rung;
 +	struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
 +
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		rung = uksm_scan_ladder + i;
 +		rung->cpu_ratio = preset->cpu_ratio[i];
 +		rung->cover_msecs = preset->cover_msecs[i];
 +	}
 +
 +	uksm_max_cpu_percentage = preset->max_cpu;
 +}
 +
 +static ssize_t cpu_governor_store(struct kobject *kobj,
 +				   struct kobj_attribute *attr,
 +				   const char *buf, size_t count)
 +{
 +	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
 +
 +	for (n--; n >=0 ; n--) {
 +		if (!strncmp(buf, uksm_cpu_governor_str[n],
 +			     strlen(uksm_cpu_governor_str[n])))
 +			break;
 +	}
 +
 +	if (n < 0)
 +		return -EINVAL;
 +	else
 +		uksm_cpu_governor = n;
 +
 +	init_performance_values();
 +
 +	return count;
 +}
 +UKSM_ATTR(cpu_governor);
 +
 +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
 +			char *buf)
 +{
 +	return sprintf(buf, "%u\n", uksm_run);
 +}
 +
 +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 +			 const char *buf, size_t count)
 +{
 +	int err;
 +	unsigned long flags;
 +
 +	err = kstrtoul(buf, 10, &flags);
 +	if (err || flags > UINT_MAX)
 +		return -EINVAL;
 +	if (flags > UKSM_RUN_MERGE)
 +		return -EINVAL;
 +
 +	mutex_lock(&uksm_thread_mutex);
 +	if (uksm_run != flags) {
 +		uksm_run = flags;
 +	}
 +	mutex_unlock(&uksm_thread_mutex);
 +
 +	if (flags & UKSM_RUN_MERGE)
 +		wake_up_interruptible(&uksm_thread_wait);
 +
 +	return count;
 +}
 +UKSM_ATTR(run);
 +
 +static ssize_t abundant_threshold_show(struct kobject *kobj,
 +				     struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%u\n", uksm_abundant_threshold);
 +}
 +
 +static ssize_t abundant_threshold_store(struct kobject *kobj,
 +				      struct kobj_attribute *attr,
 +				      const char *buf, size_t count)
 +{
 +	int err;
 +	unsigned long flags;
 +
 +	err = kstrtoul(buf, 10, &flags);
 +	if (err || flags > 99)
 +		return -EINVAL;
 +
 +	uksm_abundant_threshold = flags;
 +
 +	return count;
 +}
 +UKSM_ATTR(abundant_threshold);
 +
 +static ssize_t thrash_threshold_show(struct kobject *kobj,
 +				     struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%u\n", uksm_thrash_threshold);
 +}
 +
 +static ssize_t thrash_threshold_store(struct kobject *kobj,
 +				      struct kobj_attribute *attr,
 +				      const char *buf, size_t count)
 +{
 +	int err;
 +	unsigned long flags;
 +
 +	err = kstrtoul(buf, 10, &flags);
 +	if (err || flags > 99)
 +		return -EINVAL;
 +
 +	uksm_thrash_threshold = flags;
 +
 +	return count;
 +}
 +UKSM_ATTR(thrash_threshold);
 +
 +static ssize_t cpu_ratios_show(struct kobject *kobj,
 +			       struct kobj_attribute *attr, char *buf)
 +{
 +	int i, size;
 +	struct scan_rung *rung;
 +	char *p = buf;
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		rung = &uksm_scan_ladder[i];
 +
 +		if (rung->cpu_ratio > 0)
 +			size = sprintf(p, "%d ", rung->cpu_ratio);
 +		else
 +			size = sprintf(p, "MAX/%d ",
 +					TIME_RATIO_SCALE / -rung->cpu_ratio);
 +
 +		p += size;
 +	}
 +
 +	*p++ = '\n';
 +	*p = '\0';
 +
 +	return p - buf;
 +}
 +
 +static ssize_t cpu_ratios_store(struct kobject *kobj,
 +				      struct kobj_attribute *attr,
 +				      const char *buf, size_t count)
 +{
 +	int i, cpuratios[SCAN_LADDER_SIZE], err;
 +	unsigned long value;
 +	struct scan_rung *rung;
 +	char *p, *end = NULL;
 +
 +	p = kzalloc(count, GFP_KERNEL);
 +	if (!p)
 +		return -ENOMEM;
 +
 +	memcpy(p, buf, count);
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		if (i != SCAN_LADDER_SIZE -1) {
 +			end = strchr(p, ' ');
 +			if (!end)
 +				return -EINVAL;
 +
 +			*end = '\0';
 +		}
 +
 +		if (strstr(p, "MAX/")) {
 +			p = strchr(p, '/') + 1;
 +			err = kstrtoul(p, 10, &value);
 +			if (err || value > TIME_RATIO_SCALE || !value)
 +				return -EINVAL;
 +
 +			cpuratios[i] = - (int) (TIME_RATIO_SCALE / value);
 +		} else {
 +			err = kstrtoul(p, 10, &value);
 +			if (err || value > TIME_RATIO_SCALE || !value)
 +				return -EINVAL;
 +
 +			cpuratios[i] = value;
 +		}
 +
 +		p = end + 1;
 +	}
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		rung = &uksm_scan_ladder[i];
 +
 +		rung->cpu_ratio = cpuratios[i];
 +	}
 +
 +	return count;
 +}
 +UKSM_ATTR(cpu_ratios);
 +
 +static ssize_t eval_intervals_show(struct kobject *kobj,
 +			       struct kobj_attribute *attr, char *buf)
 +{
 +	int i, size;
 +	struct scan_rung *rung;
 +	char *p = buf;
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		rung = &uksm_scan_ladder[i];
 +		size = sprintf(p, "%u ", rung->cover_msecs);
 +		p += size;
 +	}
 +
 +	*p++ = '\n';
 +	*p = '\0';
 +
 +	return p - buf;
 +}
 +
 +static ssize_t eval_intervals_store(struct kobject *kobj,
 +				      struct kobj_attribute *attr,
 +				      const char *buf, size_t count)
 +{
 +	int i, err;
 +	unsigned long values[SCAN_LADDER_SIZE];
 +	struct scan_rung *rung;
 +	char *p, *end = NULL;
 +
 +	p = kzalloc(count, GFP_KERNEL);
 +	if (!p)
 +		return -ENOMEM;
 +
 +	memcpy(p, buf, count);
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		if (i != SCAN_LADDER_SIZE -1) {
 +			end = strchr(p, ' ');
 +			if (!end)
 +				return -EINVAL;
 +
 +			*end = '\0';
 +		}
 +
 +		err = kstrtoul(p, 10, &values[i]);
 +		if (err)
 +			return -EINVAL;
 +
 +		p = end + 1;
 +	}
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		rung = &uksm_scan_ladder[i];
 +
 +		rung->cover_msecs = values[i];
 +	}
 +
 +	return count;
 +}
 +UKSM_ATTR(eval_intervals);
 +
 +static ssize_t ema_per_page_time_show(struct kobject *kobj,
 +				 struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%lu\n", uksm_ema_page_time);
 +}
 +UKSM_ATTR_RO(ema_per_page_time);
 +
 +static ssize_t pages_shared_show(struct kobject *kobj,
 +				 struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%lu\n", uksm_pages_shared);
 +}
 +UKSM_ATTR_RO(pages_shared);
 +
 +static ssize_t pages_sharing_show(struct kobject *kobj,
 +				  struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%lu\n", uksm_pages_sharing);
 +}
 +UKSM_ATTR_RO(pages_sharing);
 +
 +static ssize_t pages_unshared_show(struct kobject *kobj,
 +				   struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%lu\n", uksm_pages_unshared);
 +}
 +UKSM_ATTR_RO(pages_unshared);
 +
 +static ssize_t full_scans_show(struct kobject *kobj,
 +			       struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%llu\n", fully_scanned_round);
 +}
 +UKSM_ATTR_RO(full_scans);
 +
 +static ssize_t pages_scanned_show(struct kobject *kobj,
 +				  struct kobj_attribute *attr, char *buf)
 +{
 +	unsigned long base = 0;
 +	u64 delta, ret;
 +
 +	if (pages_scanned_stored) {
 +		base = pages_scanned_base;
 +		ret = pages_scanned_stored;
 +		delta = uksm_pages_scanned >> base;
 +		if (CAN_OVERFLOW_U64(ret, delta)) {
 +			ret >>= 1;
 +			delta >>= 1;
 +			base++;
 +			ret += delta;
 +		}
 +	} else {
 +		ret = uksm_pages_scanned;
 +	}
 +
 +	while (ret > ULONG_MAX) {
 +		ret >>= 1;
 +		base++;
 +	}
 +
 +	if (base)
 +		return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
 +	else
 +		return sprintf(buf, "%lu\n", (unsigned long)ret);
 +}
 +UKSM_ATTR_RO(pages_scanned);
 +
 +static ssize_t hash_strength_show(struct kobject *kobj,
 +				  struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%lu\n", hash_strength);
 +}
 +UKSM_ATTR_RO(hash_strength);
 +
 +static ssize_t sleep_times_show(struct kobject *kobj,
 +				  struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%llu\n", uksm_sleep_times);
 +}
 +UKSM_ATTR_RO(sleep_times);
 +
 +
 +static struct attribute *uksm_attrs[] = {
 +	&max_cpu_percentage_attr.attr,
 +	&sleep_millisecs_attr.attr,
 +	&cpu_governor_attr.attr,
 +	&run_attr.attr,
 +	&ema_per_page_time_attr.attr,
 +	&pages_shared_attr.attr,
 +	&pages_sharing_attr.attr,
 +	&pages_unshared_attr.attr,
 +	&full_scans_attr.attr,
 +	&pages_scanned_attr.attr,
 +	&hash_strength_attr.attr,
 +	&sleep_times_attr.attr,
 +	&thrash_threshold_attr.attr,
 +	&abundant_threshold_attr.attr,
 +	&cpu_ratios_attr.attr,
 +	&eval_intervals_attr.attr,
 +	NULL,
 +};
 +
 +static struct attribute_group uksm_attr_group = {
 +	.attrs = uksm_attrs,
 +	.name = "uksm",
 +};
 +#endif /* CONFIG_SYSFS */
 +
 +static inline void init_scan_ladder(void)
 +{
 +	int i;
 +	struct scan_rung *rung;
 +
 +	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
 +		rung = uksm_scan_ladder + i;
 +		slot_tree_init_root(&rung->vma_root);
 +	}
 +
 +	init_performance_values();
 +	uksm_calc_scan_pages();
 +}
 +
 +static inline int cal_positive_negative_costs(void)
 +{
 +	struct page *p1, *p2;
 +	unsigned char *addr1, *addr2;
 +	unsigned long i, time_start, hash_cost;
 +	unsigned long loopnum = 0;
 +
 +	/*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
 +	volatile u32 hash;
 +	volatile int ret;
 +
 +	p1 = alloc_page(GFP_KERNEL);
 +	if (!p1)
 +		return -ENOMEM;
 +
 +	p2 = alloc_page(GFP_KERNEL);
 +	if (!p2)
 +		return -ENOMEM;
 +
 +	addr1 = kmap_atomic(p1);
 +	addr2 = kmap_atomic(p2);
 +	memset(addr1, prandom_u32(), PAGE_SIZE);
 +	memcpy(addr2, addr1, PAGE_SIZE);
 +
 +	/* make sure that the two pages differ in last byte */
 +	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
 +	kunmap_atomic(addr2);
 +	kunmap_atomic(addr1);
 +
 +	time_start = jiffies;
 +	while (jiffies - time_start < 100) {
 +		for (i = 0; i < 100; i++)
 +			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
 +		loopnum += 100;
 +	}
 +	hash_cost = (jiffies - time_start);
 +
 +	time_start = jiffies;
 +	for (i = 0; i < loopnum; i++)
 +		ret = pages_identical(p1, p2);
 +	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
 +	memcmp_cost /= hash_cost;
 +	printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
 +			 "hash=%u cmp_ret=%d.\n",
 +	       memcmp_cost, hash, ret);
 +
 +	__free_page(p1);
 +	__free_page(p2);
 +	return 0;
 +}
 +
 +static int init_zeropage_hash_table(void)
 +{
 +	struct page *page;
 +	char *addr;
 +	int i;
 +
 +	page = alloc_page(GFP_KERNEL);
 +	if (!page)
 +		return -ENOMEM;
 +
 +	addr = kmap_atomic(page);
 +	memset(addr, 0, PAGE_SIZE);
 +	kunmap_atomic(addr);
 +
 +	zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
 +		GFP_KERNEL);
 +	if (!zero_hash_table)
 +		return -ENOMEM;
 +
 +	for (i = 0; i < HASH_STRENGTH_MAX; i++)
 +		zero_hash_table[i] = page_hash(page, i, 0);
 +
 +	__free_page(page);
 +
 +	return 0;
 +}
 +
 +static inline int init_random_sampling(void)
 +{
 +	unsigned long i;
 +	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
 +	if (!random_nums)
 +		return -ENOMEM;
 +
 +	for (i = 0; i < HASH_STRENGTH_FULL; i++)
 +		random_nums[i] = i;
 +
 +	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
 +		unsigned long rand_range, swap_index, tmp;
 +
 +		rand_range = HASH_STRENGTH_FULL - i;
 +		swap_index = i + prandom_u32() % rand_range;
 +		tmp = random_nums[i];
 +		random_nums[i] =  random_nums[swap_index];
 +		random_nums[swap_index] = tmp;
 +	}
 +
 +	rshash_state.state = RSHASH_NEW;
 +	rshash_state.below_count = 0;
 +	rshash_state.lookup_window_index = 0;
 +
 +	return cal_positive_negative_costs();
 +}
 +
 +static int __init uksm_slab_init(void)
 +{
 +	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
 +	if (!rmap_item_cache)
 +		goto out;
 +
 +	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
 +	if (!stable_node_cache)
 +		goto out_free1;
 +
 +	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
 +	if (!node_vma_cache)
 +		goto out_free2;
 +
 +	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
 +	if (!vma_slot_cache)
 +		goto out_free3;
 +
 +	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
 +	if (!tree_node_cache)
 +		goto out_free4;
 +
 +	return 0;
 +
 +out_free4:
 +	kmem_cache_destroy(vma_slot_cache);
 +out_free3:
 +	kmem_cache_destroy(node_vma_cache);
 +out_free2:
 +	kmem_cache_destroy(stable_node_cache);
 +out_free1:
 +	kmem_cache_destroy(rmap_item_cache);
 +out:
 +	return -ENOMEM;
 +}
 +
 +static void __init uksm_slab_free(void)
 +{
 +	kmem_cache_destroy(stable_node_cache);
 +	kmem_cache_destroy(rmap_item_cache);
 +	kmem_cache_destroy(node_vma_cache);
 +	kmem_cache_destroy(vma_slot_cache);
 +	kmem_cache_destroy(tree_node_cache);
 +}
 +
 +/* Common interface to ksm, different to it. */
 +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 +		unsigned long end, int advice, unsigned long *vm_flags)
 +{
 +	int err;
 +
 +	switch (advice) {
 +	case MADV_MERGEABLE:
 +		return 0;		/* just ignore the advice */
 +
 +	case MADV_UNMERGEABLE:
 +		if (!(*vm_flags & VM_MERGEABLE))
 +			return 0;		/* just ignore the advice */
 +
 +		if (vma->anon_vma) {
 +			err = unmerge_uksm_pages(vma, start, end);
 +			if (err)
 +				return err;
 +		}
 +
 +		uksm_remove_vma(vma);
 +		*vm_flags &= ~VM_MERGEABLE;
 +		break;
 +	}
 +
 +	return 0;
 +}
 +
 +/* Common interface to ksm, actually the same. */
 +struct page *ksm_might_need_to_copy(struct page *page,
 +			struct vm_area_struct *vma, unsigned long address)
 +{
 +	struct anon_vma *anon_vma = page_anon_vma(page);
 +	struct page *new_page;
 +
 +	if (PageKsm(page)) {
 +		if (page_stable_node(page))
 +			return page;	/* no need to copy it */
 +	} else if (!anon_vma) {
 +		return page;		/* no need to copy it */
 +	} else if (anon_vma->root == vma->anon_vma->root &&
 +		 page->index == linear_page_index(vma, address)) {
 +		return page;		/* still no need to copy it */
 +	}
 +	if (!PageUptodate(page))
 +		return page;		/* let do_swap_page report the error */
 +
 +	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 +	if (new_page) {
 +		copy_user_highpage(new_page, page, address, vma);
 +
 +		SetPageDirty(new_page);
 +		__SetPageUptodate(new_page);
 +		__SetPageLocked(new_page);
 +	}
 +
 +	return new_page;
 +}
 +
 +static int __init uksm_init(void)
 +{
 +	struct task_struct *uksm_thread;
 +	int err;
 +
 +	uksm_sleep_jiffies = msecs_to_jiffies(100);
 +	uksm_sleep_saved = uksm_sleep_jiffies;
 +
 +	slot_tree_init();
 +	init_scan_ladder();
 +
 +
 +	err = init_random_sampling();
 +	if (err)
 +		goto out_free2;
 +
 +	err = uksm_slab_init();
 +	if (err)
 +		goto out_free1;
 +
 +	err = init_zeropage_hash_table();
 +	if (err)
 +		goto out_free0;
 +
 +	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
 +	if (IS_ERR(uksm_thread)) {
 +		printk(KERN_ERR "uksm: creating kthread failed\n");
 +		err = PTR_ERR(uksm_thread);
 +		goto out_free;
 +	}
 +
 +#ifdef CONFIG_SYSFS
 +	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
 +	if (err) {
 +		printk(KERN_ERR "uksm: register sysfs failed\n");
 +		kthread_stop(uksm_thread);
 +		goto out_free;
 +	}
 +#else
 +	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */
 +
 +#endif /* CONFIG_SYSFS */
 +
 +#ifdef CONFIG_MEMORY_HOTREMOVE
 +	/*
 +	 * Choose a high priority since the callback takes uksm_thread_mutex:
 +	 * later callbacks could only be taking locks which nest within that.
 +	 */
 +	hotplug_memory_notifier(uksm_memory_callback, 100);
 +#endif
 +	return 0;
 +
 +out_free:
 +	kfree(zero_hash_table);
 +out_free0:
 +	uksm_slab_free();
 +out_free1:
 +	kfree(random_nums);
 +out_free2:
 +	kfree(uksm_scan_ladder);
 +	return err;
 +}
 +
 +#ifdef MODULE
 +subsys_initcall(ksm_init);
 +#else
 +late_initcall(uksm_init);
 +#endif
 +
 diff --git a/mm/vmstat.c b/mm/vmstat.c
 index 084c672..c301bd6 100644
 --- a/mm/vmstat.c
 +++ b/mm/vmstat.c
 @@ -764,6 +764,9 @@ const char * const vmstat_text[] = {
 	"nr_anon_transparent_hugepages",
 	"nr_free_cma",
 
 +#ifdef CONFIG_UKSM
 +	"nr_uksm_zero_pages",
 +#endif
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
 -- 
 2.8.1