author    Jiri Kosina <jkosina@suse.cz>    2018-01-12 23:07:42 +0100
committer Jiri Kosina <jkosina@suse.cz>    2018-01-12 23:07:42 +0100
commit    53f1a3d9e0a1d35eae63f30e72cd0f5d6982859e (patch)
tree      c8613a070e9fd730f7cb84aa508af921fd7f1279
parent    77b0afb86730dc8abb38acfe3ce59e87ca0ceff7 (diff)
parent    5579b56d7515006b2c2be7e0e9f4fff340c29602 (diff)
Merge remote-tracking branch 'origin/users/vbabka/SLE15/for-next' into SLE15
Pull HMM support from Vlastimil Babka

Conflicts:
	series.conf
-rw-r--r--  config/x86_64/default | 8
-rw-r--r--  patches.suse/hmm-heterogeneous-memory-management-documentation.patch | 622
-rw-r--r--  patches.suse/mm-device-public-memory-device-memory-cache-coherent-with-cpu.patch | 530
-rw-r--r--  patches.suse/mm-device-public-memory-fix-edge-case-in-vm_normal_page.patch | 57
-rw-r--r--  patches.suse/mm-hmm-add-new-helper-to-hotplug-cdm-memory-region.patch | 190
-rw-r--r--  patches.suse/mm-hmm-avoid-bloating-arch-that-do-not-make-use-of-hmm.patch | 269
-rw-r--r--  patches.suse/mm-hmm-devmem-device-memory-hotplug-using-zone_device.patch | 625
-rw-r--r--  patches.suse/mm-hmm-devmem-dummy-hmm-device-for-zone_device-memory.patch | 179
-rw-r--r--  patches.suse/mm-hmm-fix-build-when-hmm-is-disabled.patch | 68
-rw-r--r--  patches.suse/mm-hmm-heterogeneous-memory-management-hmm-for-short.patch | 362
-rw-r--r--  patches.suse/mm-hmm-mirror-device-page-fault-handler.patch | 448
-rw-r--r--  patches.suse/mm-hmm-mirror-helper-to-snapshot-cpu-page-table.patch | 446
-rw-r--r--  patches.suse/mm-hmm-mirror-mirror-process-address-space-on-device-with-hmm-helpers.patch | 389
-rw-r--r--  patches.suse/mm-memcontrol-allow-to-uncharge-page-without-using-page-lru-field.patch | 261
-rw-r--r--  patches.suse/mm-memcontrol-support-memory_device_private.patch | 189
-rw-r--r--  patches.suse/mm-memory_hotplug-introduce-add_pages.patch | 130
-rw-r--r--  patches.suse/mm-migrate-allow-migrate_vma-to-alloc-new-page-on-empty-entry.patch | 361
-rw-r--r--  patches.suse/mm-migrate-fix-indexing-bug-off-by-one-and-avoid-out-of-bound-access.patch | 40
-rw-r--r--  patches.suse/mm-migrate-migrate_vma-unmap-page-from-vma-while-collecting-pages.patch | 261
-rw-r--r--  patches.suse/mm-migrate-new-memory-migration-helper-for-use-with-device-memory.patch | 671
-rw-r--r--  patches.suse/mm-migrate-new-migrate-mode-migrate_sync_no_copy.patch | 340
-rw-r--r--  patches.suse/mm-migrate-support-un-addressable-zone_device-page-in-migration.patch | 396
-rw-r--r--  patches.suse/mm-zone_device-new-type-of-zone_device-for-unaddressable-memory.patch | 598
-rw-r--r--  patches.suse/mm-zone_device-special-case-put_page-for-device-private-pages.patch | 194
-rw-r--r--  series.conf | 23
25 files changed, 7656 insertions(+), 1 deletion(-)
diff --git a/config/x86_64/default b/config/x86_64/default
index c466be040e..734335c627 100644
--- a/config/x86_64/default
+++ b/config/x86_64/default
@@ -627,6 +627,12 @@ CONFIG_ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
# CONFIG_IDLE_PAGE_TRACKING is not set
CONFIG_ZONE_DEVICE=y
+CONFIG_ARCH_HAS_HMM=y
+CONFIG_MIGRATE_VMA_HELPER=y
+CONFIG_HMM=y
+CONFIG_HMM_MIRROR=y
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_DEVICE_PUBLIC=y
CONFIG_FRAME_VECTOR=y
CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y
CONFIG_ARCH_HAS_PKEYS=y
@@ -680,6 +686,7 @@ CONFIG_LEGACY_VSYSCALL_EMULATE=y
CONFIG_MODIFY_LDT_SYSCALL=y
CONFIG_HAVE_LIVEPATCH=y
CONFIG_LIVEPATCH=y
+CONFIG_ARCH_HAS_ADD_PAGES=y
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
CONFIG_USE_PERCPU_NUMA_NODE_ID=y
@@ -7311,7 +7318,6 @@ CONFIG_SG_POOL=y
CONFIG_ARCH_HAS_SG_CHAIN=y
CONFIG_ARCH_HAS_PMEM_API=y
CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y
-CONFIG_ARCH_HAS_MMIO_FLUSH=y
CONFIG_STACKDEPOT=y
CONFIG_SBITMAP=y
CONFIG_PARMAN=m
diff --git a/patches.suse/hmm-heterogeneous-memory-management-documentation.patch b/patches.suse/hmm-heterogeneous-memory-management-documentation.patch
new file mode 100644
index 0000000000..0f6cf0957c
--- /dev/null
+++ b/patches.suse/hmm-heterogeneous-memory-management-documentation.patch
@@ -0,0 +1,622 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:19 -0700
+Subject: hmm: heterogeneous memory management documentation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: bffc33ec539699f045a9254144de3d4eace05f07
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+Patch series "HMM (Heterogeneous Memory Management)", v25.
+
+Heterogeneous Memory Management (HMM) (description and justification)
+
+Today device drivers expose dedicated memory allocation APIs through their
+device file, often relying on a combination of IOCTL and mmap calls. The
+device can only access and use memory allocated through this API. This
+effectively splits the program address space into objects allocated for the
+device and usable by the device, and other regular memory (malloc, mmap of a
+file, shared memory, ...) only accessible by the CPU (or in a very limited
+way by a device, by pinning memory).
+
+Allowing different isolated components of a program to use a device thus
+requires duplicating the input data structures using the device memory
+allocator. This is reasonable for simple data structures (arrays, grids,
+images, ...) but it gets extremely complex with advanced data structures
+(lists, trees, graphs, ...) that rely on a web of memory pointers. This is
+becoming a serious limitation on the kind of workloads that can be offloaded
+to devices like GPUs.
+
+New industry standards like C++, OpenCL or CUDA are pushing to remove this
+barrier. This requires a shared address space between the GPU device and the
+CPU so that the GPU can access any memory of a process (while still obeying
+memory protection such as read-only). This kind of feature is also appearing
+in various other operating systems.
+
+HMM is a set of helpers to facilitate several aspects of address space
+sharing and device memory management. Unlike existing sharing mechanisms
+that rely on pinning the pages used by a device, HMM relies on mmu_notifier
+to propagate CPU page table updates to the device page table.
+
+Duplicating the CPU page table is only one aspect necessary for efficiently
+using a device like a GPU. GPU local memory has bandwidth in the terabytes
+per second range, but it is connected to main memory through a system bus
+like PCIe that is limited to 32 GB/s (PCIe 4.0 x16). Thus it is necessary to
+allow migration of process memory from main system memory to device memory.
+The issue is that on platforms that only have PCIe, the device memory is not
+accessible by the CPU with the same properties as main memory (cache
+coherency, atomic operations, ...).
+
+To allow migration from main memory to device memory, HMM provides a set of
+helpers to hotplug device memory as a new type of ZONE_DEVICE memory which
+is un-addressable by the CPU but still has struct pages representing it.
+This allows most of the core kernel logic that deals with process memory to
+stay oblivious to the peculiarities of device memory.
+
+When a page backing an address of a process is migrated to device memory,
+the CPU page table entry is set to a new, specific swap entry. A CPU access
+to such an address triggers a migration back to system memory, just as if
+the page had been swapped out to disk. HMM also blocks anyone from pinning a
+ZONE_DEVICE page so that it can always be migrated back to system memory if
+the CPU accesses it. Conversely, HMM does not migrate to device memory any
+page that is pinned in system memory.
+
+To allow efficient migration between device memory and main memory, a new
+migrate_vma() helper is added with this patchset. It allows the device DMA
+engine to be leveraged to perform the copy operation.
+
+This feature will be used by upstream drivers like nouveau and mlx5, and
+probably others in the future (amdgpu is the next suspect in line). We are
+actively working on nouveau and mlx5 support. To test this patchset we also
+worked with NVIDIA's closed-source driver team; they have more resources
+than us to test this kind of infrastructure and also a bigger and better
+userspace ecosystem with various real industry workloads that can be used to
+test and profile HMM.
+
+The expected workload is a program that builds a data set on the CPU (from
+disk, from the network, from sensors, ...). The program uses a GPU API
+(OpenCL, CUDA, ...) to give hints on memory placement for the input data and
+also for the output buffer. The program calls the GPU API to schedule a GPU
+job; this happens using a device driver specific ioctl. All this is hidden
+from the programmer's point of view in the case of a C++ compiler that
+transparently offloads some parts of a program to the GPU. The program can
+keep doing other work on the CPU while the GPU is crunching numbers.
+
+It is expected that the CPU will not access the same data set as the GPU
+while the GPU is working on it, but this is not mandatory. In fact we expect
+some small memory objects to be actively accessed by both the GPU and the
+CPU concurrently, as synchronization channels and/or for monitoring
+purposes. Such objects will stay in system memory and should not be
+bottlenecked by system bus bandwidth (rare write and read accesses from both
+CPU and GPU).
+
+As we are relying on the device driver API, HMM does not introduce any new
+syscall nor does it modify any existing ones. It does not change any POSIX
+semantics or behaviors. For instance, the child after a fork of a process
+that is using HMM will not be impacted in any way, nor is there any data
+hazard between child COW or parent COW of memory that was migrated to the
+device prior to the fork.
+
+HMM assumes a number of hardware features. The device must allow its page
+table to be updated at any time (i.e. device jobs must be preemptible). The
+device page table must provide memory protection such as read-only. The
+device must track write accesses (dirty bit). The device must have a minimum
+granularity that matches PAGE_SIZE (i.e. 4k).
+
+Reviewers (just hints):
+Patch 1 HMM documentation
+Patch 2 introduces the core infrastructure and definitions of HMM; a pretty
+        small patch and easy to review
+Patch 3 introduces the mirror functionality of HMM; it relies on
+        mmu_notifier and thus someone familiar with that part would be in a
+        better position to review
+Patch 4 is a helper to snapshot the CPU page table while synchronizing with
+        concurrent page table updates. Understanding mmu_notifier makes
+        review easier.
+Patch 5 is mostly a wrapper around handle_mm_fault()
+Patch 6 adds a new add_pages() helper to avoid modifying each arch memory
+        hotplug function
+Patch 7 adds a new memory type for ZONE_DEVICE and also adds all the logic
+        in various core mm paths to support this new type. Dan Williams and
+        any core mm contributor are the best people to review each half of
+        this patchset
+Patch 8 special cases HMM ZONE_DEVICE pages inside put_page(); Kirill and
+        Dan Williams are the best people to review this
+Patch 9 allows uncharging a page from a memory cgroup without using the lru
+        list field of struct page (best reviewer: Johannes Weiner or
+        Vladimir Davydov or Michal Hocko)
+Patch 10 adds support to uncharge a ZONE_DEVICE page from a memory cgroup
+        (best reviewer: Johannes Weiner or Vladimir Davydov or Michal Hocko)
+Patch 11 adds a helper to hotplug un-addressable device memory as a new type
+        of ZONE_DEVICE memory (new type introduced in patch 3 of this
+        series). This is boilerplate code around memory hotplug and it also
+        picks a free range of physical addresses for the device memory.
+        Note that the physical addresses do not point to anything (at least
+        as far as the kernel knows).
+Patch 12 introduces a new hmm_device class as a helper for device drivers
+        that want to expose multiple device memories under a common fake
+        device driver. This is useful for multi-GPU configurations. Anyone
+        familiar with device driver infrastructure can review this.
+        Boilerplate code, really.
+Patch 13 adds a new migrate mode. Anyone familiar with page migration is
+        welcome to review.
+Patch 14 introduces a new migration helper (migrate_vma()) that allows
+        migrating a range of virtual addresses of a process using a device
+        DMA engine to perform the copy. It is not limited to copies from and
+        to the device but can also copy between any kind of source and
+        destination memory. Again, anyone familiar with the migration code
+        should be able to verify the logic.
+Patch 15 optimizes the new migrate_vma() by unmapping pages while we are
+        collecting them. This can be reviewed by any mm folks.
+Patch 16 adds unaddressable memory migration to the helper introduced in
+        patch 7; this can be reviewed by anyone familiar with the migration
+        code
+Patch 17 adds a feature that allows the device to allocate non-present pages
+        on the GPU when migrating a range of addresses to device memory.
+        This is a helper for device drivers to avoid having to first
+        allocate system memory before migrating to device memory
+Patch 18 adds a new kind of ZONE_DEVICE memory for cache coherent device
+        memory (CDM)
+Patch 19 adds a helper to hotplug CDM memory
+
+Previous patchset postings:
+v1 http://lwn.net/Articles/597289/
+v2 https://lkml.org/lkml/2014/6/12/559
+v3 https://lkml.org/lkml/2014/6/13/633
+v4 https://lkml.org/lkml/2014/8/29/423
+v5 https://lkml.org/lkml/2014/11/3/759
+v6 http://lwn.net/Articles/619737/
+v7 http://lwn.net/Articles/627316/
+v8 https://lwn.net/Articles/645515/
+v9 https://lwn.net/Articles/651553/
+v10 https://lwn.net/Articles/654430/
+v11 http://www.gossamer-threads.com/lists/linux/kernel/2286424
+v12 http://www.kernelhub.org/?msg=972982&p=2
+v13 https://lwn.net/Articles/706856/
+v14 https://lkml.org/lkml/2016/12/8/344
+v15 http://www.mail-archive.com/linux-kernel@xxxxxxxxxxxxxxx/msg1304107.html
+v16 http://www.spinics.net/lists/linux-mm/msg119814.html
+v17 https://lkml.org/lkml/2017/1/27/847
+v18 https://lkml.org/lkml/2017/3/16/596
+v19 https://lkml.org/lkml/2017/4/5/831
+v20 https://lwn.net/Articles/720715/
+v21 https://lkml.org/lkml/2017/4/24/747
+v22 http://lkml.iu.edu/hypermail/linux/kernel/1705.2/05176.html
+v23 https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1404788.html
+v24 https://lwn.net/Articles/726691/
+
+This patch (of 19):
+
+This adds documentation for HMM (Heterogeneous Memory Management). It
+presents the motivation behind it, the features necessary for it to be
+useful, and gives an overview of how this is implemented.
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-2-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ Documentation/vm/hmm.txt | 384 +++++++++++++++++++++++++++++++++++++++++++++++
+ MAINTAINERS | 7
+ 2 files changed, 391 insertions(+)
+
+--- /dev/null
++++ b/Documentation/vm/hmm.txt
+@@ -0,0 +1,384 @@
++Heterogeneous Memory Management (HMM)
++
++Transparently allow any component of a program to use any memory region of
++said program with a device, without using a device specific memory
++allocator. This is becoming a requirement to simplify the use of advanced
++heterogeneous computing, where GPUs, DSPs or FPGAs are used to perform
++various computations.
++
++This document is divided as follows: in the first section I expose the
++problems related to the use of a device specific allocator. The second
++section exposes the hardware limitations that are inherent to many
++platforms. The third section gives an overview of the HMM design. The fourth
++section explains how CPU page table mirroring works and what HMM's purpose
++is in this context. The fifth section deals with how device memory is
++represented inside the kernel. Finally, the last section presents the new
++migration helper that allows the device DMA engine to be leveraged.
++
++
++1) Problems of using device specific memory allocator:
++2) System bus, device memory characteristics
++3) Shared address space and migration
++4) Address space mirroring implementation and API
++5) Represent and manage device memory from core kernel point of view
++6) Migrate to and from device memory
++7) Memory cgroup (memcg) and rss accounting
++
++
++-------------------------------------------------------------------------------
++
++1) Problems of using device specific memory allocator:
++
++Devices with a large amount of on-board memory (several gigabytes), like
++GPUs, have historically managed their memory through a dedicated driver
++specific API. This creates a disconnect between memory allocated and managed
++by the device driver and regular application memory (private anonymous,
++shared memory or regular file backed memory). From here on I will refer to
++this aspect as split address space. I use shared address space to refer to
++the opposite situation, i.e. one in which any memory region can be used by
++the device transparently.
++
++The address space is split because the device can only access memory
++allocated through the device specific API. This implies that all memory
++objects in a program are not equal from the device point of view, which
++complicates large programs that rely on a wide set of libraries.
++
++Concretely this means that code that wants to leverage a device like a GPU
++needs to copy objects between generically allocated memory (malloc, mmap
++private/shared) and memory allocated through the device driver API (this
++still ends up with an mmap, but of the device file).
++
++For flat data sets (array, grid, image, ...) this isn't too hard to achieve,
++but complex data sets (list, tree, ...) are hard to get right. Duplicating a
++complex data set needs to re-map all the pointer relations between each of
++its elements. This is error prone and programs get harder to debug because
++of the duplicated data set.
++
++Split address space also means that libraries cannot transparently use data
++they are getting from the core program or another library, and thus each
++library might have to duplicate its input data set using a specific memory
++allocator. Large projects suffer from this and waste resources because of
++the various memory copies.
++
++Duplicating each library API to accept as input or output memory allocated
++by each device specific allocator is not a viable option. It would lead to a
++combinatorial explosion in the library entry points.
++
++Finally, with the advance of high level language constructs (in C++ but in
++other languages too) it is now possible for the compiler to leverage GPUs or
++other devices without even the programmer's knowledge. Some compiler
++identified patterns are only doable with a shared address space. It is also
++more reasonable to use a shared address space for all the other patterns.
++
++
++-------------------------------------------------------------------------------
++
++2) System bus, device memory characteristics
++
++System buses cripple shared address spaces due to a few limitations. Most
++system buses only allow basic memory access from the device to main memory;
++even cache coherency is often optional. Access to device memory from the CPU
++is even more limited; more often than not it is not cache coherent.
++
++If we only consider the PCIe bus, then a device can access main memory
++(often through an IOMMU) and be cache coherent with the CPUs. However it
++only allows a limited set of atomic operations from the device on main
++memory. This is worse in the other direction: the CPUs can only access a
++limited range of the device memory and cannot perform atomic operations on
++it. Thus device memory cannot be considered like regular memory from the
++kernel point of view.
++
++Another crippling factor is the limited bandwidth (~32 GB/s with PCIe 4.0
++and 16 lanes). This is 33 times less than the fastest GPU memory (1 TB/s).
++The final limitation is latency: access to main memory from the device has
++an order of magnitude higher latency than when the device accesses its own
++memory.
++
++Some platforms are developing new system buses or additions/modifications to
++PCIe to address some of those limitations (OpenCAPI, CCIX). They mainly
++allow two-way cache coherency between the CPU and the device and allow all
++the atomic operations the architecture supports. Sadly, not all platforms
++are following this trend and some major architectures are left without
++hardware solutions to those problems.
++
++So for a shared address space to make sense, not only must we allow the
++device to access any memory, but we must also permit any memory to be
++migrated to device memory while the device is using it (blocking CPU access
++while it happens).
++
++
++-------------------------------------------------------------------------------
++
++3) Shared address space and migration
++
++HMM intends to provide two main features. The first one is to share the
++address space by duplicating the CPU page table into the device page table,
++so the same address points to the same memory, and this for any valid main
++memory address in the process address space.
++
++To achieve this, HMM offers a set of helpers to populate the device page
++table while keeping track of CPU page table updates. Device page table
++updates are not as easy as CPU page table updates. To update the device page
++table you must allocate a buffer (or use a pool of pre-allocated buffers)
++and write GPU specific commands in it to perform the update (unmap, cache
++invalidations and flush, ...). This cannot be done through common code for
++all devices. Hence why HMM provides helpers to factor out everything that
++can be while leaving the gory details to the device driver.
++
++The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
++allows allocating a struct page for each page of the device memory. Those
++pages are special because the CPU cannot map them. They however allow
++migrating main memory to device memory using the existing migration
++mechanism, and everything looks as if the page was swapped out to disk from
++the CPU point of view. Using a struct page gives the easiest and cleanest
++integration with existing mm mechanisms. Here again HMM only provides
++helpers, first to hotplug new ZONE_DEVICE memory for the device memory and
++second to perform the migration. Policy decisions of what and when to
++migrate are left to the device driver.
++
++Note that any CPU access to a device page triggers a page fault and a
++migration back to main memory, i.e. when a page backing a given address A is
++migrated from a main memory page to a device page, then any CPU access to
++address A triggers a page fault and initiates a migration back to main
++memory.
++
++
++With these two features, HMM not only allows a device to mirror a process
++address space and keep both the CPU and device page tables synchronized, but
++also allows device memory to be leveraged by migrating the part of the data
++set that is actively being used by a device.
++
++
++-------------------------------------------------------------------------------
++
++4) Address space mirroring implementation and API
++
++The main objective of address space mirroring is to allow duplicating a
++range of the CPU page table into a device page table, with HMM helping to
++keep both synchronized. A device driver that wants to mirror a process
++address space must start with the registration of an hmm_mirror struct:
++
++ int hmm_mirror_register(struct hmm_mirror *mirror,
++ struct mm_struct *mm);
++ int hmm_mirror_register_locked(struct hmm_mirror *mirror,
++ struct mm_struct *mm);
++
++The locked variant is to be used when the driver is already holding the
++mmap_sem of the mm in write mode. The mirror struct has a set of callbacks
++that are used to propagate CPU page table updates:
++
++ struct hmm_mirror_ops {
++ /* sync_cpu_device_pagetables() - synchronize page tables
++ *
++ * @mirror: pointer to struct hmm_mirror
++ * @update_type: type of update that occurred to the CPU page table
++ * @start: virtual start address of the range to update
++ * @end: virtual end address of the range to update
++ *
++ * This callback ultimately originates from mmu_notifiers when the CPU
++ * page table is updated. The device driver must update its page table
++ * in response to this callback. The update argument tells what action
++ * to perform.
++ *
++ * The device driver must not return from this callback until the device
++ * page tables are completely updated (TLBs flushed, etc); this is a
++ * synchronous call.
++ */
++ void (*update)(struct hmm_mirror *mirror,
++ enum hmm_update action,
++ unsigned long start,
++ unsigned long end);
++ };
++
++The device driver must update the range following the action (turn the
++range read only, fully unmap it, ...). Once the driver callback returns,
++the device must be done with the update.
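++
++To make the registration step concrete, here is a minimal illustrative
++sketch of a hypothetical driver embedding a mirror in its private state and
++filling the mirror's ops pointer before registering. The names my_device,
++my_device_mirror_mm, my_mirror_update and update_lock are assumptions made
++for this example only; they are not part of this series:
++
++ static void my_mirror_update(struct hmm_mirror *mirror,
++                              enum hmm_update action,
++                              unsigned long start,
++                              unsigned long end);
++
++ /* Hypothetical per-device state; the mirror is embedded so that the
++  * callback can recover it with container_of(). */
++ struct my_device {
++     struct hmm_mirror mirror;
++     struct mutex update_lock;  /* the "driver->update" lock used below */
++     /* ... device page table handle, command queues, ... */
++ };
++
++ static const struct hmm_mirror_ops my_mirror_ops = {
++     .update = my_mirror_update,
++ };
++
++ /* Called once per process address space the device wants to mirror. */
++ static int my_device_mirror_mm(struct my_device *mdev, struct mm_struct *mm)
++ {
++     mdev->mirror.ops = &my_mirror_ops;
++     return hmm_mirror_register(&mdev->mirror, mm);
++ }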
++
++
++When the device driver wants to populate a range of virtual addresses, it
++can use either:
++ int hmm_vma_get_pfns(struct vm_area_struct *vma,
++ struct hmm_range *range,
++ unsigned long start,
++ unsigned long end,
++ hmm_pfn_t *pfns);
++ int hmm_vma_fault(struct vm_area_struct *vma,
++ struct hmm_range *range,
++ unsigned long start,
++ unsigned long end,
++ hmm_pfn_t *pfns,
++ bool write,
++ bool block);
++
++The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
++entries and will not trigger a page fault on missing or non-present entries.
++The second one does trigger a page fault on missing or read-only entries if
++the write parameter is true. Page faults use the generic mm page fault code
++path just like a CPU page fault.
++
++Both functions copy CPU page table entries into their pfns array argument.
++Each entry in that array corresponds to an address in the virtual range. HMM
++provides a set of flags to help the driver identify special CPU page table
++entries.
++
++Locking with the update() callback is the most important aspect the driver
++must respect in order to keep things properly synchronized. The usage
++pattern is:
++
++ int driver_populate_range(...)
++ {
++ struct hmm_range range;
++ ...
++ again:
++ ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
++ if (ret)
++ return ret;
++ take_lock(driver->update);
++ if (!hmm_vma_range_done(vma, &range)) {
++ release_lock(driver->update);
++ goto again;
++ }
++
++ // Use pfns array content to update device page table
++
++ release_lock(driver->update);
++ return 0;
++ }
++
++The driver->update lock is the same lock that the driver takes inside its
++update() callback. That lock must be taken before calling
++hmm_vma_range_done() to avoid any race with a concurrent CPU page table
++update.
++
++HMM implements all this on top of the mmu_notifier API because we wanted a
++simpler API and also to be able to perform optimizations later on, like
++doing concurrent device updates in multi-device scenarios.
++
++HMM also serves as an impedance mismatch between how CPU page table updates
++are done (by the CPU writing to the page table and flushing TLBs) and how
++devices update their own page table. A device update is a multi-step
++process: first the appropriate commands are written to a buffer, then this
++buffer is scheduled for execution on the device. It is only once the device
++has executed the commands in the buffer that the update is done. Creating
++and scheduling update command buffers can happen concurrently for multiple
++devices. Waiting for each device to report the commands as executed is
++serialized (there is no point in doing this concurrently).
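++
++Tying the two sides together, here is a minimal illustrative sketch of the
++update() callback for the hypothetical driver sketched earlier. The helper
++my_invalidate_device_range() is an assumed driver internal that builds and
++schedules the invalidation commands; it is not part of this series:
++
++ static void my_mirror_update(struct hmm_mirror *mirror,
++                              enum hmm_update action,
++                              unsigned long start,
++                              unsigned long end)
++ {
++     struct my_device *mdev = container_of(mirror, struct my_device, mirror);
++
++     /* Same lock as take_lock(driver->update) in the pattern above. */
++     mutex_lock(&mdev->update_lock);
++     /* Conservatively treat any action as an invalidation of the range. */
++     my_invalidate_device_range(mdev, start, end);
++     mutex_unlock(&mdev->update_lock);
++     /* Only return once the device page table and TLBs are up to date. */
++ }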
++
++
++-------------------------------------------------------------------------------
++
++5) Represent and manage device memory from core kernel point of view
++
++Several different designs were tried to support device memory. The first one
++used a device specific data structure to keep information about migrated
++memory, and HMM hooked itself in various places of the mm code to handle any
++access to addresses that were backed by device memory. It turned out that
++this ended up replicating most of the fields of struct page and also
++required many kernel code paths to be updated to understand this new kind of
++memory.
++
++The thing is, most kernel code paths never try to access the memory behind a
++page but only care about struct page contents. Because of this, HMM switched
++to directly using struct page for device memory, which left most kernel code
++paths unaware of the difference. We only need to make sure that no one ever
++tries to map those pages from the CPU side.
++
++HMM provides a set of helpers to register and hotplug device memory as a new
++region needing struct pages. This is offered through a very simple API:
++
++ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
++ struct device *device,
++ unsigned long size);
++ void hmm_devmem_remove(struct hmm_devmem *devmem);
++
++The hmm_devmem_ops structure is where most of the important things are:
++
++ struct hmm_devmem_ops {
++ void (*free)(struct hmm_devmem *devmem, struct page *page);
++ int (*fault)(struct hmm_devmem *devmem,
++ struct vm_area_struct *vma,
++ unsigned long addr,
++ struct page *page,
++ unsigned flags,
++ pmd_t *pmdp);
++ };
++
++The first callback (free()) happens when the last reference on a device page
++is dropped. This means the device page is now free and no longer used by
++anyone. The second callback happens whenever the CPU tries to access a
++device page, which it cannot do. This second callback must trigger a
++migration back to system memory.
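++
++For illustration only, a hypothetical driver could hotplug its memory at
++probe time roughly as follows. The callbacks my_devmem_free() and
++my_devmem_fault(), and the mdev->dev, mdev->vram_size and mdev->devmem
++fields, are assumptions made for this sketch, not part of this series:
++
++ static const struct hmm_devmem_ops my_devmem_ops = {
++     .free  = my_devmem_free,   /* hand the device page back to the driver */
++     .fault = my_devmem_fault,  /* migrate the page back to system memory */
++ };
++
++ static int my_device_add_memory(struct my_device *mdev)
++ {
++     struct hmm_devmem *devmem;
++
++     /* Hotplug mdev->vram_size bytes of device memory as ZONE_DEVICE. */
++     devmem = hmm_devmem_add(&my_devmem_ops, mdev->dev, mdev->vram_size);
++     if (IS_ERR(devmem))
++         return PTR_ERR(devmem);
++
++     mdev->devmem = devmem;
++     return 0;
++ }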
++
++
++-------------------------------------------------------------------------------
++
++6) Migrate to and from device memory
++
++Because the CPU cannot access device memory, migration must use the device
++DMA engine to perform copies from and to device memory. For this we need a
++new migration helper:
++
++ int migrate_vma(const struct migrate_vma_ops *ops,
++ struct vm_area_struct *vma,
++ unsigned long mentries,
++ unsigned long start,
++ unsigned long end,
++ unsigned long *src,
++ unsigned long *dst,
++ void *private);
++
++Unlike other migration functions it works on a range of virtual addresses;
++there are two reasons for that. First, device DMA copies have a high setup
++overhead cost and thus batching multiple pages is needed, as otherwise the
++migration overhead makes the whole exercise pointless. The second reason is
++that drivers trigger such migrations based on a range of addresses the
++device is actively accessing.
++
++The migrate_vma_ops struct defines two callbacks. The first one
++(alloc_and_copy()) controls destination memory allocation and the copy
++operation. The second one is there to allow the device driver to perform
++cleanup operations after migration.
++
++ struct migrate_vma_ops {
++ void (*alloc_and_copy)(struct vm_area_struct *vma,
++ const unsigned long *src,
++ unsigned long *dst,
++ unsigned long start,
++ unsigned long end,
++ void *private);
++ void (*finalize_and_map)(struct vm_area_struct *vma,
++ const unsigned long *src,
++ const unsigned long *dst,
++ unsigned long start,
++ unsigned long end,
++ void *private);
++ };
++
++It is important to stress that these migration helpers allow for holes in
++the virtual address range. Some pages in the range might not be migrated for
++all the usual reasons (page is pinned, page is locked, ...). This helper
++does not fail but just skips over those pages.
++
++The alloc_and_copy() callback might also decide not to migrate all pages in
++the range (for reasons under the callback's control). For those, the
++callback just has to leave the corresponding dst entry empty.
++
++Finally, the migration of the struct page might fail (for file backed pages)
++for various reasons (failure to freeze the reference, or to update the page
++cache, ...). If that happens, then finalize_and_map() can catch any pages
++that were not migrated. Note that those pages were still copied to a new
++page and thus we wasted bandwidth, but this is considered a rare event and a
++price that we are willing to pay to keep all the code simpler.
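++
++For illustration only, a caller in a hypothetical driver might look roughly
++like the sketch below, following the prototype shown above. The names
++my_alloc_and_copy, my_finalize_and_map and my_migrate_range, and the way the
++entry count is derived, are assumptions made for this example, not part of
++this series:
++
++ static const struct migrate_vma_ops my_migrate_ops = {
++     .alloc_and_copy   = my_alloc_and_copy,
++     .finalize_and_map = my_finalize_and_map,
++ };
++
++ static int my_migrate_range(struct my_device *mdev,
++                             struct vm_area_struct *vma,
++                             unsigned long start, unsigned long end)
++ {
++     unsigned long npages = (end - start) >> PAGE_SHIFT;
++     unsigned long *src, *dst;
++     int ret = -ENOMEM;
++
++     src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
++     dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
++     if (!src || !dst)
++         goto out;
++
++     /* Entries the callback leaves empty in dst are simply skipped. */
++     ret = migrate_vma(&my_migrate_ops, vma, npages, start, end,
++                       src, dst, mdev);
++ out:
++     kfree(src);
++     kfree(dst);
++     return ret;
++ }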
++
++
++-------------------------------------------------------------------------------
++
++7) Memory cgroup (memcg) and rss accounting
++
++For now, device memory is accounted as any regular page in the rss counters
++(either anonymous if the device page is used for anonymous memory, file if
++the device page is used for file backed pages, or shmem if the device page
++is used for shared memory). This is a deliberate choice to keep existing
++applications, which might start using device memory without knowing about
++it, running unimpacted.
++
++A drawback is that the OOM killer might kill an application using a lot of
++device memory and not a lot of regular system memory, and thus not free much
++system memory. We want to gather more real world experience on how
++applications and the system react under memory pressure in the presence of
++device memory before deciding to account device memory differently.
++
++
++The same decision was made for memory cgroups. Device memory pages are
++accounted against the same memory cgroup a regular page would be accounted
++to. This does simplify migration to and from device memory. This also means
++that migration back from device memory to regular memory cannot fail because
++it would go above the memory cgroup limit. We might revisit this choice
++later on, once we get more experience with how device memory is used and its
++impact on memory resource control.
++
++
++Note that device memory can never be pinned, neither by the device driver
++nor through GUP, and thus such memory is always freed upon process exit. Or,
++in the case of shared memory or file backed memory, when the last reference
++is dropped.
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -7562,6 +7562,13 @@ F: include/linux/nd.h
+ F: include/linux/libnvdimm.h
+ F: include/uapi/linux/ndctl.h
+
++HMM - Heterogeneous Memory Management
++M: Jérôme Glisse <jglisse@redhat.com>
++L: linux-mm@kvack.org
++S: Maintained
++F: mm/hmm*
++F: include/linux/hmm*
++
+ LIBNVDIMM BLK: MMIO-APERTURE DRIVER
+ M: Ross Zwisler <ross.zwisler@linux.intel.com>
+ L: linux-nvdimm@lists.01.org
diff --git a/patches.suse/mm-device-public-memory-device-memory-cache-coherent-with-cpu.patch b/patches.suse/mm-device-public-memory-device-memory-cache-coherent-with-cpu.patch
new file mode 100644
index 0000000000..463db29427
--- /dev/null
+++ b/patches.suse/mm-device-public-memory-device-memory-cache-coherent-with-cpu.patch
@@ -0,0 +1,530 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:24 -0700
+Subject: mm/device-public-memory: device memory cache coherent with CPU
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+Platforms with an advanced system bus (like CAPI or CCIX) allow device
+memory to be accessible from the CPU in a cache coherent fashion. Add a new
+type of ZONE_DEVICE to represent such memory. The use cases are the same as
+for the un-addressable device memory, but without all the corner cases.
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ fs/proc/task_mmu.c | 2 -
+ include/linux/hmm.h | 4 +--
+ include/linux/ioport.h | 1
+ include/linux/memremap.h | 21 +++++++++++++++++
+ include/linux/mm.h | 20 ++++++++++-------
+ kernel/memremap.c | 8 +++---
+ mm/Kconfig | 11 +++++++++
+ mm/gup.c | 7 +++++
+ mm/hmm.c | 4 +--
+ mm/madvise.c | 2 -
+ mm/memcontrol.c | 12 +++++-----
+ mm/memory.c | 46 +++++++++++++++++++++++++++++++++++----
+ mm/migrate.c | 55 +++++++++++++++++++++++++++++++----------------
+ mm/swap.c | 11 +++++++++
+ 14 files changed, 158 insertions(+), 46 deletions(-)
+
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1186,7 +1186,7 @@ static pagemap_entry_t pte_to_pagemap_en
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
+- page = vm_normal_page(vma, addr, pte);
++ page = _vm_normal_page(vma, addr, pte, true);
+ if (pte_soft_dirty(pte))
+ flags |= PM_SOFT_DIRTY;
+ } else if (is_swap_pte(pte)) {
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+ struct hmm_devmem;
+
+ struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+@@ -494,7 +494,7 @@ struct hmm_device {
+ */
+ struct hmm_device *hmm_device_new(void *drvdata);
+ void hmm_device_put(struct hmm_device *hmm_device);
+-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
++#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
+ /* Below are for HMM internal use only! Not to be used by device driver! */
+--- a/include/linux/ioport.h
++++ b/include/linux/ioport.h
+@@ -131,6 +131,7 @@ enum {
+ IORES_DESC_PERSISTENT_MEMORY = 4,
+ IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
++ IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+ };
+
+ /* helpers to define resources */
+--- a/include/linux/memremap.h
++++ b/include/linux/memremap.h
+@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vme
+ *
+ * A more complete discussion of unaddressable memory may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt.
++ *
++ * MEMORY_DEVICE_PUBLIC:
++ * Device memory that is cache coherent from device and CPU point of view. This
++ * is use on platform that have an advance system bus (like CAPI or CCIX). A
++ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
++ * type. Any page of a process can be migrated to such memory. However no one
++ * should be allow to pin such memory so that it can always be evicted.
+ */
+ enum memory_type {
+ MEMORY_DEVICE_HOST = 0,
+ MEMORY_DEVICE_PRIVATE,
++ MEMORY_DEVICE_PUBLIC,
+ };
+
+ /*
+@@ -92,6 +100,8 @@ enum memory_type {
+ * The page_free() callback is called once the page refcount reaches 1
+ * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
+ * This allows the device driver to implement its own memory management.)
++ *
++ * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter.
+ */
+ typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+ unsigned long addr,
+@@ -134,6 +144,12 @@ static inline bool is_device_private_pag
+ return is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+ }
++
++static inline bool is_device_public_page(const struct page *page)
++{
++ return is_zone_device_page(page) &&
++ page->pgmap->type == MEMORY_DEVICE_PUBLIC;
++}
+ #else
+ static inline void *devm_memremap_pages(struct device *dev,
+ struct resource *res, struct percpu_ref *ref,
+@@ -157,6 +173,11 @@ static inline bool is_device_private_pag
+ {
+ return false;
+ }
++
++static inline bool is_device_public_page(const struct page *page)
++{
++ return false;
++}
+ #endif
+
+ /**
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -796,15 +796,16 @@ static inline bool is_zone_device_page(c
+ }
+ #endif
+
+-#ifdef CONFIG_DEVICE_PRIVATE
+-void put_zone_device_private_page(struct page *page);
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
++void put_zone_device_private_or_public_page(struct page *page);
+ #else
+-static inline void put_zone_device_private_page(struct page *page)
++static inline void put_zone_device_private_or_public_page(struct page *page)
+ {
+ }
+-#endif
++#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+ static inline bool is_device_private_page(const struct page *page);
++static inline bool is_device_public_page(const struct page *page);
+
+ DECLARE_STATIC_KEY_FALSE(device_private_key);
+
+@@ -830,8 +831,9 @@ static inline void put_page(struct page
+ * include/linux/memremap.h and HMM for details.
+ */
+ if (static_branch_unlikely(&device_private_key) &&
+- unlikely(is_device_private_page(page))) {
+- put_zone_device_private_page(page);
++ unlikely(is_device_private_page(page) ||
++ is_device_public_page(page))) {
++ put_zone_device_private_or_public_page(page);
+ return;
+ }
+
+@@ -1220,8 +1222,10 @@ struct zap_details {
+ pgoff_t last_index; /* Highest page->index to unmap */
+ };
+
+-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+- pte_t pte);
++struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
++ pte_t pte, bool with_public_device);
++#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
++
+ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t pmd);
+
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -503,8 +503,8 @@ struct vmem_altmap *to_vmem_altmap(unsig
+ #endif /* CONFIG_ZONE_DEVICE */
+
+
+-#ifdef CONFIG_DEVICE_PRIVATE
+-void put_zone_device_private_page(struct page *page)
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
++void put_zone_device_private_or_public_page(struct page *page)
+ {
+ int count = page_ref_dec_return(page);
+
+@@ -524,5 +524,5 @@ void put_zone_device_private_page(struct
+ } else if (!count)
+ __put_page(page);
+ }
+-EXPORT_SYMBOL(put_zone_device_private_page);
+-#endif /* CONFIG_DEVICE_PRIVATE */
++EXPORT_SYMBOL(put_zone_device_private_or_public_page);
++#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -728,12 +728,23 @@ config HMM_MIRROR
+ config DEVICE_PRIVATE
+ bool "Unaddressable device memory (GPU memory, ...)"
+ depends on ARCH_HAS_HMM
++ select HMM
+
+ help
+ Allows creation of struct pages to represent unaddressable device
+ memory; i.e., memory that is only accessible from the device (or
+ group of devices). You likely also want to select HMM_MIRROR.
+
++config DEVICE_PUBLIC
++ bool "Addressable device memory (like GPU memory)"
++ depends on ARCH_HAS_HMM
++ select HMM
++
++ help
++ Allows creation of struct pages to represent addressable device
++ memory; i.e., memory that is accessible from both the device and
++ the CPU
++
+ config FRAME_VECTOR
+ bool
+
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -364,6 +364,13 @@ static int get_gate_page(struct mm_struc
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+ goto unmap;
+ *page = pte_page(*pte);
++
++ /*
++ * This should never happen (a device public page in the gate
++ * area).
++ */
++ if (is_device_public_page(*page))
++ goto unmap;
+ }
+ get_page(*page);
+ out:
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault);
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+ struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+ unsigned long addr)
+ {
+@@ -1177,4 +1177,4 @@ static int __init hmm_init(void)
+ }
+
+ device_initcall(hmm_init);
+-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
++#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -344,7 +344,7 @@ static int madvise_free_pte_range(pmd_t
+ continue;
+ }
+
+- page = vm_normal_page(vma, addr, ptent);
++ page = _vm_normal_page(vma, addr, ptent, true);
+ if (!page)
+ continue;
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4626,10 +4626,11 @@ out:
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
+- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+- * (so ZONE_DEVICE page and thus not on the lru). For now we such page is
+- * charge like a regular page would be as for all intent and purposes it is
+- * just special memory taking the place of a regular page.
++ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
++ * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
++ * For now we such page is charge like a regular page would be as for all
++ * intent and purposes it is just special memory taking the place of a
++ * regular page.
+ *
+ * See Documentations/vm/hmm.txt and include/linux/hmm.h
+ *
+@@ -4660,7 +4661,8 @@ static enum mc_target_type get_mctgt_typ
+ */
+ if (page->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+- if (is_device_private_page(page))
++ if (is_device_private_page(page) ||
++ is_device_public_page(page))
+ ret = MC_TARGET_DEVICE;
+ if (target)
+ target->page = page;
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area
+ #else
+ # define HAVE_PTE_SPECIAL 0
+ #endif
+-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+- pte_t pte)
++struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
++ pte_t pte, bool with_public_device)
+ {
+ unsigned long pfn = pte_pfn(pte);
+
+@@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_ar
+ return vma->vm_ops->find_special_page(vma, addr);
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ return NULL;
+- if (!is_zero_pfn(pfn))
+- print_bad_pte(vma, addr, pte, NULL);
++ if (is_zero_pfn(pfn))
++ return NULL;
++
++ /*
++ * Device public pages are special pages (they are ZONE_DEVICE
++ * pages but different from persistent memory). They behave
++ * allmost like normal pages. The difference is that they are
++ * not on the lru and thus should never be involve with any-
++ * thing that involve lru manipulation (mlock, numa balancing,
++ * ...).
++ *
++ * This is why we still want to return NULL for such page from
++ * vm_normal_page() so that we do not have to special case all
++ * call site of vm_normal_page().
++ */
++ if (likely(pfn < highest_memmap_pfn)) {
++ struct page *page = pfn_to_page(pfn);
++
++ if (is_device_public_page(page)) {
++ if (with_public_device)
++ return page;
++ return NULL;
++ }
++ }
++ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
+ }
+
+@@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, s
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
++ } else if (pte_devmap(pte)) {
++ page = pte_page(pte);
++
++ /*
++ * Cache coherent device memory behave like regular page and
++ * not like persistent memory page. For more informations see
++ * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
++ */
++ if (is_device_public_page(page)) {
++ get_page(page);
++ page_dup_rmap(page, false);
++ rss[mm_counter(page)]++;
++ }
+ }
+
+ out_set_pte:
+@@ -1266,7 +1302,7 @@ again:
+ if (pte_present(ptent)) {
+ struct page *page;
+
+- page = vm_normal_page(vma, addr, ptent);
++ page = _vm_normal_page(vma, addr, ptent, true);
+ if (unlikely(details) && page) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -36,6 +36,7 @@
+ #include <linux/hugetlb.h>
+ #include <linux/hugetlb_cgroup.h>
+ #include <linux/gfp.h>
++#include <linux/pfn_t.h>
+ #include <linux/memremap.h>
+ #include <linux/userfaultfd_k.h>
+ #include <linux/balloon_compaction.h>
+@@ -230,10 +231,14 @@ static bool remove_migration_pte(struct
+ if (is_write_migration_entry(entry))
+ pte = maybe_mkwrite(pte, vma);
+
+- if (unlikely(is_zone_device_page(new)) &&
+- is_device_private_page(new)) {
+- entry = make_device_private_entry(new, pte_write(pte));
+- pte = swp_entry_to_pte(entry);
++ if (unlikely(is_zone_device_page(new))) {
++ if (is_device_private_page(new)) {
++ entry = make_device_private_entry(new, pte_write(pte));
++ pte = swp_entry_to_pte(entry);
++ } else if (is_device_public_page(new)) {
++ pte = pte_mkdevmap(pte);
++ flush_dcache_page(new);
++ }
+ } else
+ flush_dcache_page(new);
+
+@@ -406,12 +411,11 @@ int migrate_page_move_mapping(struct add
+ void **pslot;
+
+ /*
+- * ZONE_DEVICE pages have 1 refcount always held by their device
+- *
+- * Note that DAX memory will never reach that point as it does not have
+- * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
++ * Device public or private pages have an extra refcount as they are
++ * ZONE_DEVICE pages.
+ */
+- expected_count += is_zone_device_page(page);
++ expected_count += is_device_private_page(page);
++ expected_count += is_device_public_page(page);
+
+ if (!mapping) {
+ /* Anonymous page without mapping */
+@@ -2082,7 +2086,6 @@ out_unlock:
+
+ #endif /* CONFIG_NUMA */
+
+-
+ struct migrate_vma {
+ struct vm_area_struct *vma;
+ unsigned long *dst;
+@@ -2222,7 +2225,7 @@ again:
+ pfn = 0;
+ goto next;
+ }
+- page = vm_normal_page(migrate->vma, addr, pte);
++ page = _vm_normal_page(migrate->vma, addr, pte, true);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+@@ -2365,10 +2368,19 @@ static bool migrate_vma_check_page(struc
+ if (is_device_private_page(page))
+ return true;
+
+- /* Other ZONE_DEVICE memory type are not supported */
+- return false;
++ /*
++ * Only allow device public page to be migrated and account for
++ * the extra reference count imply by ZONE_DEVICE pages.
++ */
++ if (!is_device_public_page(page))
++ return false;
++ extra++;
+ }
+
++ /* For file back page */
++ if (page_mapping(page))
++ extra += 1 + page_has_private(page);
++
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+@@ -2606,11 +2618,18 @@ static void migrate_vma_insert_page(stru
+ */
+ __SetPageUptodate(page);
+
+- if (is_zone_device_page(page) && is_device_private_page(page)) {
+- swp_entry_t swp_entry;
++ if (is_zone_device_page(page)) {
++ if (is_device_private_page(page)) {
++ swp_entry_t swp_entry;
+
+- swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+- entry = swp_entry_to_pte(swp_entry);
++ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
++ entry = swp_entry_to_pte(swp_entry);
++ } else if (is_device_public_page(page)) {
++ entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
++ if (vma->vm_flags & VM_WRITE)
++ entry = pte_mkwrite(pte_mkdirty(entry));
++ entry = pte_mkdevmap(entry);
++ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+@@ -2727,7 +2746,7 @@ static void migrate_vma_pages(struct mig
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+- } else {
++ } else if (!is_device_public_page(newpage)) {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -759,6 +759,17 @@ void release_pages(struct page **pages,
+ if (is_huge_zero_page(page))
+ continue;
+
++ /* Device public page can not be huge page */
++ if (is_device_public_page(page)) {
++ if (locked_pgdat) {
++ spin_unlock_irqrestore(&locked_pgdat->lru_lock,
++ flags);
++ locked_pgdat = NULL;
++ }
++ put_zone_device_private_or_public_page(page);
++ continue;
++ }
++
+ page = compound_head(page);
+ if (!put_page_testzero(page))
+ continue;
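
The net effect of the patch above is that vm_normal_page() keeps returning NULL
for MEMORY_DEVICE_PUBLIC pages, while call sites that can handle them opt in
through the new boolean of _vm_normal_page(). As an illustration only (not part
of the patch), a hypothetical opt-in caller could look like this; the function
name my_count_pte is an assumption:

	/* Count mapped pages behind a pte, including device public ones. */
	static void my_count_pte(struct vm_area_struct *vma, unsigned long addr,
				 pte_t pte, unsigned long *normal,
				 unsigned long *devpub)
	{
		/* Pass true so MEMORY_DEVICE_PUBLIC pages are returned too. */
		struct page *page = _vm_normal_page(vma, addr, pte, true);

		if (!page)
			return;
		if (is_device_public_page(page))
			(*devpub)++;
		else
			(*normal)++;
	}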
diff --git a/patches.suse/mm-device-public-memory-fix-edge-case-in-vm_normal_page.patch b/patches.suse/mm-device-public-memory-fix-edge-case-in-vm_normal_page.patch
new file mode 100644
index 0000000000..b8e50c7afd
--- /dev/null
+++ b/patches.suse/mm-device-public-memory-fix-edge-case-in-vm_normal_page.patch
@@ -0,0 +1,57 @@
+From: Reza Arbab <arbab@linux.vnet.ibm.com>
+Date: Tue, 3 Oct 2017 16:15:35 -0700
+Subject: mm/device-public-memory: fix edge case in _vm_normal_page()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 7d790d2da386a52cfebcf0c898ba927bece9d4ab
+Patch-mainline: v4.14-rc4
+References: VM Functionality, FATE#323338, bsc#1047814
+
+With device public pages at the end of my memory space, I'm getting
+output from _vm_normal_page():
+
+ BUG: Bad page map in process migrate_pages pte:c0800001ffff0d06 pmd:f95d3000
+ addr:00007fff89330000 vm_flags:00100073 anon_vma:c0000000fa899320 mapping: (null) index:7fff8933
+ file: (null) fault: (null) mmap: (null) readpage: (null)
+ CPU: 0 PID: 13963 Comm: migrate_pages Tainted: P B OE 4.14.0-rc1-wip #155
+ Call Trace:
+ dump_stack+0xb0/0xf4 (unreliable)
+ print_bad_pte+0x28c/0x340
+ _vm_normal_page+0xc0/0x140
+ zap_pte_range+0x664/0xc10
+ unmap_page_range+0x318/0x670
+ unmap_vmas+0x74/0xe0
+ exit_mmap+0xe8/0x1f0
+ mmput+0xac/0x1f0
+ do_exit+0x348/0xcd0
+ do_group_exit+0x5c/0xf0
+ SyS_exit_group+0x1c/0x20
+ system_call+0x58/0x6c
+
+The pfn causing this is the very last one: highest_memmap_pfn is the last
+valid pfn, not one past it, so the bounds check must be inclusive. Correct
+it accordingly.
+
+Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU")
+Link: http://lkml.kernel.org/r/1506092178-20351-1-git-send-email-arbab@linux.vnet.ibm.com
+Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
+Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
+Reviewed-by: Balbir Singh <bsingharora@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ mm/memory.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -845,7 +845,7 @@ struct page *_vm_normal_page(struct vm_a
+ * vm_normal_page() so that we do not have to special case all
+ * call site of vm_normal_page().
+ */
+- if (likely(pfn < highest_memmap_pfn)) {
++ if (likely(pfn <= highest_memmap_pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (is_device_public_page(page)) {
diff --git a/patches.suse/mm-hmm-add-new-helper-to-hotplug-cdm-memory-region.patch b/patches.suse/mm-hmm-add-new-helper-to-hotplug-cdm-memory-region.patch
new file mode 100644
index 0000000000..5f817ccb73
--- /dev/null
+++ b/patches.suse/mm-hmm-add-new-helper-to-hotplug-cdm-memory-region.patch
@@ -0,0 +1,190 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:28 -0700
+Subject: mm/hmm: add new helper to hotplug CDM memory region
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: d3df0a423397c9a1ae05c3857e8c04240dd85e68
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+Unlike unaddressable memory, coherent device memory has a real resource
+associated with it on the system (as the CPU can address it). Add a new
+helper to hotplug such memory within the HMM framework.
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-20-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Reviewed-by: Balbir Singh <bsingharora@gmail.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 3 +
+ mm/hmm.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 86 insertions(+), 5 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -443,6 +443,9 @@ struct hmm_devmem {
+ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size);
++struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
++ struct device *device,
++ struct resource *res);
+ void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+ /*
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -839,7 +839,11 @@ static void hmm_devmem_release(struct de
+ zone = page_zone(page);
+
+ mem_hotplug_begin();
+- __remove_pages(zone, start_pfn, npages);
++ if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
++ __remove_pages(zone, start_pfn, npages);
++ else
++ arch_remove_memory(start_pfn << PAGE_SHIFT,
++ npages << PAGE_SHIFT);
+ mem_hotplug_done();
+
+ hmm_devmem_radix_release(resource);
+@@ -875,7 +879,11 @@ static int hmm_devmem_pages_create(struc
+ if (is_ram == REGION_INTERSECTS)
+ return -ENXIO;
+
+- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
++ if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
++ devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
++ else
++ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
++
+ devmem->pagemap.res = devmem->resource;
+ devmem->pagemap.page_fault = hmm_devmem_fault;
+ devmem->pagemap.page_free = hmm_devmem_free;
+@@ -920,9 +928,15 @@ static int hmm_devmem_pages_create(struc
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
++ *
++ * For device public memory, which is accessible by the CPU, we do
++ * want the linear mapping and thus use arch_add_memory().
+ */
+- ret = add_pages(nid, align_start >> PAGE_SHIFT,
+- align_size >> PAGE_SHIFT, false);
++ if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
++ ret = arch_add_memory(nid, align_start, align_size, false);
++ else
++ ret = add_pages(nid, align_start >> PAGE_SHIFT,
++ align_size >> PAGE_SHIFT, false);
+ if (ret) {
+ mem_hotplug_done();
+ goto error_add_memory;
+@@ -1069,6 +1083,67 @@ error_percpu_ref:
+ }
+ EXPORT_SYMBOL(hmm_devmem_add);
+
++struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
++ struct device *device,
++ struct resource *res)
++{
++ struct hmm_devmem *devmem;
++ int ret;
++
++ if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
++ return ERR_PTR(-EINVAL);
++
++ static_branch_enable(&device_private_key);
++
++ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
++ GFP_KERNEL, dev_to_node(device));
++ if (!devmem)
++ return ERR_PTR(-ENOMEM);
++
++ init_completion(&devmem->completion);
++ devmem->pfn_first = -1UL;
++ devmem->pfn_last = -1UL;
++ devmem->resource = res;
++ devmem->device = device;
++ devmem->ops = ops;
++
++ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
++ 0, GFP_KERNEL);
++ if (ret)
++ goto error_percpu_ref;
++
++ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
++ if (ret)
++ goto error_devm_add_action;
++
++
++ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
++ devmem->pfn_last = devmem->pfn_first +
++ (resource_size(devmem->resource) >> PAGE_SHIFT);
++
++ ret = hmm_devmem_pages_create(devmem);
++ if (ret)
++ goto error_devm_add_action;
++
++ devres_add(device, devmem);
++
++ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
++ if (ret) {
++ hmm_devmem_remove(devmem);
++ return ERR_PTR(ret);
++ }
++
++ return devmem;
++
++error_devm_add_action:
++ hmm_devmem_ref_kill(&devmem->ref);
++ hmm_devmem_ref_exit(&devmem->ref);
++error_percpu_ref:
++ devres_free(devmem);
++ return ERR_PTR(ret);
++}
++EXPORT_SYMBOL(hmm_devmem_add_resource);
++
+ /*
+ * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
+ *
+@@ -1082,6 +1157,7 @@ void hmm_devmem_remove(struct hmm_devmem
+ {
+ resource_size_t start, size;
+ struct device *device;
++ bool cdm = false;
+
+ if (!devmem)
+ return;
+@@ -1090,11 +1166,13 @@ void hmm_devmem_remove(struct hmm_devmem
+ start = devmem->resource->start;
+ size = resource_size(devmem->resource);
+
++ cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+ hmm_devmem_pages_remove(devmem);
+
+- devm_release_mem_region(device, start, size);
++ if (!cdm)
++ devm_release_mem_region(device, start, size);
+ }
+ EXPORT_SYMBOL(hmm_devmem_remove);
+
diff --git a/patches.suse/mm-hmm-avoid-bloating-arch-that-do-not-make-use-of-hmm.patch b/patches.suse/mm-hmm-avoid-bloating-arch-that-do-not-make-use-of-hmm.patch
new file mode 100644
index 0000000000..75741378e2
--- /dev/null
+++ b/patches.suse/mm-hmm-avoid-bloating-arch-that-do-not-make-use-of-hmm.patch
@@ -0,0 +1,269 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:32 -0700
+Subject: mm/hmm: avoid bloating arch that do not make use of HMM
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 6b368cd4a44ce95b33f1d31f2f932e6ae707f319
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This moves all the new code, including the new page migration helper, behind
+Kconfig options so that there is no code bloat for architectures or users
+that do not want to use HMM or any of its associated features.
+
+arm allyesconfig (first without the patchset, then with the patchset and this patch):
+ text data bss dec hex filename
+83721896 46511131 27582964 157815991 96814b7 ../without/vmlinux
+83722364 46511131 27582964 157816459 968168b vmlinux
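+
+With these stubs in place a caller can be compiled unconditionally; for
+example (sketch only, foo_migrate_ops is a driver-side assumption), when
+CONFIG_MIGRATE_VMA_HELPER is not selected the call simply fails:
+
+    ret = migrate_vma(&foo_migrate_ops, vma, start, end, src, dst, NULL);
+    if (ret == -EINVAL)
+        return ret;    /* migration helper not configured in */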
+
+[jglisse@redhat.com: struct hmm is only used by HMM mirror functionality]
+ Link: http://lkml.kernel.org/r/20170825213133.27286-1-jglisse@redhat.com
+[sfr@canb.auug.org.au: fix build (arm multi_v7_defconfig)]
+ Link: http://lkml.kernel.org/r/20170828181849.323ab81b@canb.auug.org.au
+Link: http://lkml.kernel.org/r/20170818032858.7447-1-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 9 ++++++---
+ include/linux/memremap.h | 22 +++++++---------------
+ include/linux/migrate.h | 13 +++++++++++++
+ include/linux/mm.h | 26 +++++++++++++++++---------
+ mm/Kconfig | 4 ++++
+ mm/Makefile | 3 ++-
+ mm/hmm.c | 7 +++----
+ mm/migrate.c | 2 ++
+ 8 files changed, 54 insertions(+), 32 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -501,18 +501,21 @@ void hmm_device_put(struct hmm_device *h
+
+
+ /* Below are for HMM internal use only! Not to be used by device driver! */
++#if IS_ENABLED(CONFIG_HMM_MIRROR)
+ void hmm_mm_destroy(struct mm_struct *mm);
+
+ static inline void hmm_mm_init(struct mm_struct *mm)
+ {
+ mm->hmm = NULL;
+ }
++#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
++static inline void hmm_mm_destroy(struct mm_struct *mm) {}
++static inline void hmm_mm_init(struct mm_struct *mm) {}
++#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+-#else /* IS_ENABLED(CONFIG_HMM) */
+
+-/* Below are for HMM internal use only! Not to be used by device driver! */
++#else /* IS_ENABLED(CONFIG_HMM) */
+ static inline void hmm_mm_destroy(struct mm_struct *mm) {}
+ static inline void hmm_mm_init(struct mm_struct *mm) {}
+-
+ #endif /* IS_ENABLED(CONFIG_HMM) */
+ #endif /* LINUX_HMM_H */
+--- a/include/linux/memremap.h
++++ b/include/linux/memremap.h
+@@ -138,18 +138,6 @@ void *devm_memremap_pages(struct device
+ struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+
+ static inline bool is_zone_device_page(const struct page *page);
+-
+-static inline bool is_device_private_page(const struct page *page)
+-{
+- return is_zone_device_page(page) &&
+- page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+-}
+-
+-static inline bool is_device_public_page(const struct page *page)
+-{
+- return is_zone_device_page(page) &&
+- page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+-}
+ #else
+ static inline void *devm_memremap_pages(struct device *dev,
+ struct resource *res, struct percpu_ref *ref,
+@@ -168,17 +156,21 @@ static inline struct dev_pagemap *find_d
+ {
+ return NULL;
+ }
++#endif
+
++#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+ static inline bool is_device_private_page(const struct page *page)
+ {
+- return false;
++ return is_zone_device_page(page) &&
++ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+ }
+
+ static inline bool is_device_public_page(const struct page *page)
+ {
+- return false;
++ return is_zone_device_page(page) &&
++ page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+ }
+-#endif
++#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+ /**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -254,6 +254,7 @@ struct migrate_vma_ops {
+ void *private);
+ };
+
++#if defined(CONFIG_MIGRATE_VMA_HELPER)
+ int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+@@ -261,6 +262,18 @@ int migrate_vma(const struct migrate_vma
+ unsigned long *src,
+ unsigned long *dst,
+ void *private);
++#else
++static inline int migrate_vma(const struct migrate_vma_ops *ops,
++ struct vm_area_struct *vma,
++ unsigned long start,
++ unsigned long end,
++ unsigned long *src,
++ unsigned long *dst,
++ void *private)
++{
++ return -EINVAL;
++}
++#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
+
+ #endif /* CONFIG_MIGRATION */
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -796,18 +796,27 @@ static inline bool is_zone_device_page(c
+ }
+ #endif
+
+-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
++#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+ void put_zone_device_private_or_public_page(struct page *page);
+-#else
++DECLARE_STATIC_KEY_FALSE(device_private_key);
++#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key)
++static inline bool is_device_private_page(const struct page *page);
++static inline bool is_device_public_page(const struct page *page);
++#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+ static inline void put_zone_device_private_or_public_page(struct page *page)
+ {
+ }
++#define IS_HMM_ENABLED 0
++static inline bool is_device_private_page(const struct page *page)
++{
++ return false;
++}
++static inline bool is_device_public_page(const struct page *page)
++{
++ return false;
++}
+ #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+-static inline bool is_device_private_page(const struct page *page);
+-static inline bool is_device_public_page(const struct page *page);
+-
+-DECLARE_STATIC_KEY_FALSE(device_private_key);
+
+ static inline void get_page(struct page *page)
+ {
+@@ -830,9 +839,8 @@ static inline void put_page(struct page
+ * free and we need to inform the device driver through callback. See
+ * include/linux/memremap.h and HMM for details.
+ */
+- if (static_branch_unlikely(&device_private_key) &&
+- unlikely(is_device_private_page(page) ||
+- is_device_public_page(page))) {
++ if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) ||
++ unlikely(is_device_public_page(page)))) {
+ put_zone_device_private_or_public_page(page);
+ return;
+ }
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -710,8 +710,12 @@ config ARCH_HAS_HMM
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+
++config MIGRATE_VMA_HELPER
++ bool
++
+ config HMM
+ bool
++ select MIGRATE_VMA_HELPER
+
+ config HMM_MIRROR
+ bool "HMM mirror CPU page table into a device page table"
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+ compaction.o vmacache.o swap_slots.o \
+ interval_tree.o list_lru.o workingset.o \
+- debug.o hmm.o $(mmu-y)
++ debug.o $(mmu-y)
+
+ obj-y += init-mm.o
+
+@@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page
+ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
+ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
++obj-$(CONFIG_HMM) += hmm.o
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -35,15 +35,16 @@
+
+ #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
+-
++#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+ /*
+ * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
+ */
+ DEFINE_STATIC_KEY_FALSE(device_private_key);
+ EXPORT_SYMBOL(device_private_key);
++#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
+-#ifdef CONFIG_HMM
++#if IS_ENABLED(CONFIG_HMM_MIRROR)
+ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
+ /*
+@@ -128,9 +129,7 @@ void hmm_mm_destroy(struct mm_struct *mm
+ {
+ kfree(mm->hmm);
+ }
+-#endif /* CONFIG_HMM */
+
+-#if IS_ENABLED(CONFIG_HMM_MIRROR)
+ static void hmm_invalidate_range(struct hmm *hmm,
+ enum hmm_update_type action,
+ unsigned long start,
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -2086,6 +2086,7 @@ out_unlock:
+
+ #endif /* CONFIG_NUMA */
+
++#if defined(CONFIG_MIGRATE_VMA_HELPER)
+ struct migrate_vma {
+ struct vm_area_struct *vma;
+ unsigned long *dst;
+@@ -2939,3 +2940,4 @@ int migrate_vma(const struct migrate_vma
+ return 0;
+ }
+ EXPORT_SYMBOL(migrate_vma);
++#endif /* defined(MIGRATE_VMA_HELPER) */
diff --git a/patches.suse/mm-hmm-devmem-device-memory-hotplug-using-zone_device.patch b/patches.suse/mm-hmm-devmem-device-memory-hotplug-using-zone_device.patch
new file mode 100644
index 0000000000..9d6a28d7c0
--- /dev/null
+++ b/patches.suse/mm-hmm-devmem-device-memory-hotplug-using-zone_device.patch
@@ -0,0 +1,625 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:58 -0700
+Subject: mm/hmm/devmem: device memory hotplug using ZONE_DEVICE
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 4ef589dc9b10cdcae75a2b2b0e9b2c5e8a92c378
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This introduces a simple struct and associated helpers for device drivers to
+use when hotplugging un-addressable device memory as ZONE_DEVICE. It finds
+an unused physical address range and triggers memory hotplug for it, which
+allocates and initializes struct pages for the device memory.
+
+A device driver should use this helper during device initialization to
+hotplug the device memory. It should only need to remove the memory once
+the device is going offline (shutdown or hotremove). There should not be
+any userspace API to hotplug this memory, except maybe for a host device
+driver to allow adding more memory to a guest device driver.
+
+The device's memory is managed by the device driver and HMM only provides
+helpers to that effect.
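+
+A minimal usage sketch (illustrative only; the foo_* callbacks and the size
+constant are driver-side assumptions, not part of this patch):
+
+    static const struct hmm_devmem_ops foo_devmem_ops = {
+        .free  = foo_devmem_free,   /* page refcount dropped to 1 */
+        .fault = foo_devmem_fault,  /* CPU fault on a device page */
+    };
+
+    devmem = hmm_devmem_add(&foo_devmem_ops, &pdev->dev, FOO_DEVMEM_SIZE);
+    if (IS_ERR(devmem))
+        return PTR_ERR(devmem);
+    /* device pages span devmem->pfn_first .. devmem->pfn_last - 1 */
+
+    /* before the device goes offline: */
+    hmm_devmem_remove(devmem);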
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-12-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Signed-off-by: Balbir Singh <bsingharora@gmail.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 155 +++++++++++++++++++++
+ mm/hmm.c | 379 +++++++++++++++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 533 insertions(+), 1 deletion(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -72,6 +72,11 @@
+
+ #if IS_ENABLED(CONFIG_HMM)
+
++#include <linux/migrate.h>
++#include <linux/memremap.h>
++#include <linux/completion.h>
++
++
+ struct hmm;
+
+ /*
+@@ -322,6 +327,156 @@ int hmm_vma_fault(struct vm_area_struct
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
++struct hmm_devmem;
++
++struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
++ unsigned long addr);
++
++/*
++ * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
++ *
++ * @free: called when a page's refcount reaches 1 and it is no longer used
++ * @fault: called when there is a page fault to unaddressable memory
++ *
++ * Both callback happens from page_free() and page_fault() callback of struct
++ * dev_pagemap respectively. See include/linux/memremap.h for more details on
++ * those.
++ *
++ * The hmm_devmem_ops callbacks are just here to provide a coherent and
++ * unique API to device drivers, and device drivers should not register
++ * their own page_free() or page_fault() but instead rely on the
++ * hmm_devmem_ops callbacks.
++ */
++struct hmm_devmem_ops {
++ /*
++ * free() - free a device page
++ * @devmem: device memory structure (see struct hmm_devmem)
++ * @page: pointer to struct page being freed
++ *
++ * The callback occurs whenever a device page's refcount reaches 1, which
++ * means that no one is holding any reference on the page anymore
++ * (ZONE_DEVICE pages have an elevated refcount of 1 by default, so
++ * that they are not released to the general page allocator).
++ *
++ * Note that callback has exclusive ownership of the page (as no
++ * one is holding any reference).
++ */
++ void (*free)(struct hmm_devmem *devmem, struct page *page);
++ /*
++ * fault() - CPU page fault or get user page (GUP)
++ * @devmem: device memory structure (see struct hmm_devmem)
++ * @vma: virtual memory area containing the virtual address
++ * @addr: virtual address that faulted or for which there is a GUP
++ * @page: pointer to struct page backing virtual address (unreliable)
++ * @flags: FAULT_FLAG_* (see include/linux/mm.h)
++ * @pmdp: page middle directory
++ * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
++ * on error
++ *
++ * The callback occurs whenever there is a CPU page fault or GUP on a
++ * virtual address. This means that the device driver must migrate the
++ * page back to regular memory (CPU accessible).
++ *
++ * The device driver is free to migrate more than one page from the
++ * fault() callback as an optimization. However, if the device decides to
++ * migrate more than one page it must always prioritize the faulting
++ * address over the others.
++ *
++ * The struct page pointer is only given as a hint to allow quick
++ * lookup of internal device driver data. A concurrent migration
++ * might have already freed that page and the virtual address might
++ * no longer be backed by it. So it should not be modified by the
++ * callback.
++ *
++ * Note that mmap semaphore is held in read mode at least when this
++ * callback occurs, hence the vma is valid upon callback entry.
++ */
++ int (*fault)(struct hmm_devmem *devmem,
++ struct vm_area_struct *vma,
++ unsigned long addr,
++ const struct page *page,
++ unsigned int flags,
++ pmd_t *pmdp);
++};
++
++/*
++ * struct hmm_devmem - track device memory
++ *
++ * @completion: completion object for device memory
++ * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
++ * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
++ * @resource: IO resource reserved for this chunk of memory
++ * @pagemap: device page map for that chunk
++ * @device: device to bind resource to
++ * @ops: memory operations callback
++ * @ref: per CPU refcount
++ *
++ * This is a helper structure for device drivers that do not wish to implement
++ * the gory details related to hotplugging new memory and allocating struct
++ * pages.
++ *
++ * Device drivers can directly use ZONE_DEVICE memory on their own if they
++ * wish to do so.
++ */
++struct hmm_devmem {
++ struct completion completion;
++ unsigned long pfn_first;
++ unsigned long pfn_last;
++ struct resource *resource;
++ struct device *device;
++ struct dev_pagemap pagemap;
++ const struct hmm_devmem_ops *ops;
++ struct percpu_ref ref;
++};
++
++/*
++ * To add (hotplug) device memory, HMM assumes that there is no real resource
++ * that reserves a range in the physical address space (this is intended to be
++ * used by unaddressable device memory). It will reserve a physical range big
++ * enough and allocate struct page for it.
++ *
++ * The device driver can wrap the hmm_devmem struct inside a private device
++ * driver struct. The device driver must call hmm_devmem_remove() before the
++ * device goes away and before freeing the hmm_devmem struct memory.
++ */
++struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
++ struct device *device,
++ unsigned long size);
++void hmm_devmem_remove(struct hmm_devmem *devmem);
++
++/*
++ * hmm_devmem_page_set_drvdata - set per-page driver data field
++ *
++ * @page: pointer to struct page
++ * @data: driver data value to set
++ *
++ * Because the page cannot be on an lru, drivers get an unsigned long they
++ * can use to store a per-page field. This is just a simple helper to do that.
++ */
++static inline void hmm_devmem_page_set_drvdata(struct page *page,
++ unsigned long data)
++{
++ unsigned long *drvdata = (unsigned long *)&page->pgmap;
++
++ drvdata[1] = data;
++}
++
++/*
++ * hmm_devmem_page_get_drvdata - get per page driver data field
++ *
++ * @page: pointer to struct page
++ * Return: driver data value
++ */
++static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page)
++{
++ unsigned long *drvdata = (unsigned long *)&page->pgmap;
++
++ return drvdata[1];
++}
++#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
++
++
+ /* Below are for HMM internal use only! Not to be used by device driver! */
+ void hmm_mm_destroy(struct mm_struct *mm);
+
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -23,10 +23,16 @@
+ #include <linux/swap.h>
+ #include <linux/slab.h>
+ #include <linux/sched.h>
++#include <linux/mmzone.h>
++#include <linux/pagemap.h>
+ #include <linux/swapops.h>
+ #include <linux/hugetlb.h>
++#include <linux/memremap.h>
+ #include <linux/jump_label.h>
+ #include <linux/mmu_notifier.h>
++#include <linux/memory_hotplug.h>
++
++#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
+
+ /*
+@@ -426,7 +432,15 @@ again:
+ * This is a special swap entry, ignore migration, use
+ * device and report anything else as error.
+ */
+- if (is_migration_entry(entry)) {
++ if (is_device_private_entry(entry)) {
++ pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
++ if (is_write_device_private_entry(entry)) {
++ pfns[i] |= HMM_PFN_WRITE;
++ } else if (write_fault)
++ goto fault;
++ pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
++ pfns[i] |= flag;
++ } else if (is_migration_entry(entry)) {
+ if (hmm_vma_walk->fault) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+@@ -720,3 +734,366 @@ int hmm_vma_fault(struct vm_area_struct
+ }
+ EXPORT_SYMBOL(hmm_vma_fault);
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
++
++
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
++struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
++ unsigned long addr)
++{
++ struct page *page;
++
++ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
++ if (!page)
++ return NULL;
++ lock_page(page);
++ return page;
++}
++EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
++
++
++static void hmm_devmem_ref_release(struct percpu_ref *ref)
++{
++ struct hmm_devmem *devmem;
++
++ devmem = container_of(ref, struct hmm_devmem, ref);
++ complete(&devmem->completion);
++}
++
++static void hmm_devmem_ref_exit(void *data)
++{
++ struct percpu_ref *ref = data;
++ struct hmm_devmem *devmem;
++
++ devmem = container_of(ref, struct hmm_devmem, ref);
++ percpu_ref_exit(ref);
++ devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
++}
++
++static void hmm_devmem_ref_kill(void *data)
++{
++ struct percpu_ref *ref = data;
++ struct hmm_devmem *devmem;
++
++ devmem = container_of(ref, struct hmm_devmem, ref);
++ percpu_ref_kill(ref);
++ wait_for_completion(&devmem->completion);
++ devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
++}
++
++static int hmm_devmem_fault(struct vm_area_struct *vma,
++ unsigned long addr,
++ const struct page *page,
++ unsigned int flags,
++ pmd_t *pmdp)
++{
++ struct hmm_devmem *devmem = page->pgmap->data;
++
++ return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
++}
++
++static void hmm_devmem_free(struct page *page, void *data)
++{
++ struct hmm_devmem *devmem = data;
++
++ devmem->ops->free(devmem, page);
++}
++
++static DEFINE_MUTEX(hmm_devmem_lock);
++static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
++
++static void hmm_devmem_radix_release(struct resource *resource)
++{
++ resource_size_t key, align_start, align_size, align_end;
++
++ align_start = resource->start & ~(PA_SECTION_SIZE - 1);
++ align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
++ align_end = align_start + align_size - 1;
++
++ mutex_lock(&hmm_devmem_lock);
++ for (key = resource->start;
++ key <= resource->end;
++ key += PA_SECTION_SIZE)
++ radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
++ mutex_unlock(&hmm_devmem_lock);
++}
++
++static void hmm_devmem_release(struct device *dev, void *data)
++{
++ struct hmm_devmem *devmem = data;
++ struct resource *resource = devmem->resource;
++ unsigned long start_pfn, npages;
++ struct zone *zone;
++ struct page *page;
++
++ if (percpu_ref_tryget_live(&devmem->ref)) {
++ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
++ percpu_ref_put(&devmem->ref);
++ }
++
++ /* pages are dead and unused, undo the arch mapping */
++ start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
++ npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
++
++ page = pfn_to_page(start_pfn);
++ zone = page_zone(page);
++
++ mem_hotplug_begin();
++ __remove_pages(zone, start_pfn, npages);
++ mem_hotplug_done();
++
++ hmm_devmem_radix_release(resource);
++}
++
++static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
++{
++ WARN_ON_ONCE(!rcu_read_lock_held());
++
++ return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
++}
++
++static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
++{
++ resource_size_t key, align_start, align_size, align_end;
++ struct device *device = devmem->device;
++ int ret, nid, is_ram;
++ unsigned long pfn;
++
++ align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
++ align_size = ALIGN(devmem->resource->start +
++ resource_size(devmem->resource),
++ PA_SECTION_SIZE) - align_start;
++
++ is_ram = region_intersects(align_start, align_size,
++ IORESOURCE_SYSTEM_RAM,
++ IORES_DESC_NONE);
++ if (is_ram == REGION_MIXED) {
++ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
++ __func__, devmem->resource);
++ return -ENXIO;
++ }
++ if (is_ram == REGION_INTERSECTS)
++ return -ENXIO;
++
++ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
++ devmem->pagemap.res = devmem->resource;
++ devmem->pagemap.page_fault = hmm_devmem_fault;
++ devmem->pagemap.page_free = hmm_devmem_free;
++ devmem->pagemap.dev = devmem->device;
++ devmem->pagemap.ref = &devmem->ref;
++ devmem->pagemap.data = devmem;
++
++ mutex_lock(&hmm_devmem_lock);
++ align_end = align_start + align_size - 1;
++ for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
++ struct hmm_devmem *dup;
++
++ rcu_read_lock();
++ dup = hmm_devmem_find(key);
++ rcu_read_unlock();
++ if (dup) {
++ dev_err(device, "%s: collides with mapping for %s\n",
++ __func__, dev_name(dup->device));
++ mutex_unlock(&hmm_devmem_lock);
++ ret = -EBUSY;
++ goto error;
++ }
++ ret = radix_tree_insert(&hmm_devmem_radix,
++ key >> PA_SECTION_SHIFT,
++ devmem);
++ if (ret) {
++ dev_err(device, "%s: failed: %d\n", __func__, ret);
++ mutex_unlock(&hmm_devmem_lock);
++ goto error_radix;
++ }
++ }
++ mutex_unlock(&hmm_devmem_lock);
++
++ nid = dev_to_node(device);
++ if (nid < 0)
++ nid = numa_mem_id();
++
++ mem_hotplug_begin();
++ /*
++ * For device private memory we call add_pages() as we only need to
++ * allocate and initialize struct page for the device memory. More-
++ * over the device memory is un-accessible thus we do not want to
++ * create a linear mapping for the memory like arch_add_memory()
++ * would do.
++ */
++ ret = add_pages(nid, align_start >> PAGE_SHIFT,
++ align_size >> PAGE_SHIFT, false);
++ if (ret) {
++ mem_hotplug_done();
++ goto error_add_memory;
++ }
++ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
++ align_start >> PAGE_SHIFT,
++ align_size >> PAGE_SHIFT);
++ mem_hotplug_done();
++
++ for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
++ struct page *page = pfn_to_page(pfn);
++
++ page->pgmap = &devmem->pagemap;
++ }
++ return 0;
++
++error_add_memory:
++ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
++error_radix:
++ hmm_devmem_radix_release(devmem->resource);
++error:
++ return ret;
++}
++
++static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
++{
++ struct hmm_devmem *devmem = data;
++
++ return devmem->resource == match_data;
++}
++
++static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
++{
++ devres_release(devmem->device, &hmm_devmem_release,
++ &hmm_devmem_match, devmem->resource);
++}
++
++/*
++ * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
++ *
++ * @ops: memory event device driver callback (see struct hmm_devmem_ops)
++ * @device: device struct to bind the resource too
++ * @size: size in bytes of the device memory to add
++ * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
++ *
++ * This function first finds an empty range of physical address big enough to
++ * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
++ * in turn allocates struct pages. It does not do anything beyond that; all
++ * events affecting the memory will go through the various callbacks provided
++ * by hmm_devmem_ops struct.
++ *
++ * Device driver should call this function during device initialization and
++ * is then responsible for memory management. HMM only provides helpers.
++ */
++struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
++ struct device *device,
++ unsigned long size)
++{
++ struct hmm_devmem *devmem;
++ resource_size_t addr;
++ int ret;
++
++ static_branch_enable(&device_private_key);
++
++ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
++ GFP_KERNEL, dev_to_node(device));
++ if (!devmem)
++ return ERR_PTR(-ENOMEM);
++
++ init_completion(&devmem->completion);
++ devmem->pfn_first = -1UL;
++ devmem->pfn_last = -1UL;
++ devmem->resource = NULL;
++ devmem->device = device;
++ devmem->ops = ops;
++
++ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
++ 0, GFP_KERNEL);
++ if (ret)
++ goto error_percpu_ref;
++
++ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
++ if (ret)
++ goto error_devm_add_action;
++
++ size = ALIGN(size, PA_SECTION_SIZE);
++ addr = min((unsigned long)iomem_resource.end,
++ (1UL << MAX_PHYSMEM_BITS) - 1);
++ addr = addr - size + 1UL;
++
++ /*
++ * FIXME add a new helper to quickly walk resource tree and find free
++ * range
++ *
++ * FIXME what about ioport_resource resource ?
++ */
++ for (; addr > size && addr >= iomem_resource.start; addr -= size) {
++ ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
++ if (ret != REGION_DISJOINT)
++ continue;
++
++ devmem->resource = devm_request_mem_region(device, addr, size,
++ dev_name(device));
++ if (!devmem->resource) {
++ ret = -ENOMEM;
++ goto error_no_resource;
++ }
++ break;
++ }
++ if (!devmem->resource) {
++ ret = -ERANGE;
++ goto error_no_resource;
++ }
++
++ devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
++ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
++ devmem->pfn_last = devmem->pfn_first +
++ (resource_size(devmem->resource) >> PAGE_SHIFT);
++
++ ret = hmm_devmem_pages_create(devmem);
++ if (ret)
++ goto error_pages;
++
++ devres_add(device, devmem);
++
++ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
++ if (ret) {
++ hmm_devmem_remove(devmem);
++ return ERR_PTR(ret);
++ }
++
++ return devmem;
++
++error_pages:
++ devm_release_mem_region(device, devmem->resource->start,
++ resource_size(devmem->resource));
++error_no_resource:
++error_devm_add_action:
++ hmm_devmem_ref_kill(&devmem->ref);
++ hmm_devmem_ref_exit(&devmem->ref);
++error_percpu_ref:
++ devres_free(devmem);
++ return ERR_PTR(ret);
++}
++EXPORT_SYMBOL(hmm_devmem_add);
++
++/*
++ * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
++ *
++ * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
++ *
++ * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
++ * of the device driver. It will free struct page and remove the resource that
++ * reserved the physical address range for this device memory.
++ */
++void hmm_devmem_remove(struct hmm_devmem *devmem)
++{
++ resource_size_t start, size;
++ struct device *device;
++
++ if (!devmem)
++ return;
++
++ device = devmem->device;
++ start = devmem->resource->start;
++ size = resource_size(devmem->resource);
++
++ hmm_devmem_ref_kill(&devmem->ref);
++ hmm_devmem_ref_exit(&devmem->ref);
++ hmm_devmem_pages_remove(devmem);
++
++ devm_release_mem_region(device, start, size);
++}
++EXPORT_SYMBOL(hmm_devmem_remove);
++#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
diff --git a/patches.suse/mm-hmm-devmem-dummy-hmm-device-for-zone_device-memory.patch b/patches.suse/mm-hmm-devmem-dummy-hmm-device-for-zone_device-memory.patch
new file mode 100644
index 0000000000..54ea9de98b
--- /dev/null
+++ b/patches.suse/mm-hmm-devmem-dummy-hmm-device-for-zone_device-memory.patch
@@ -0,0 +1,179 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:02 -0700
+Subject: mm/hmm/devmem: dummy HMM device for ZONE_DEVICE memory
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 858b54dabf4363daa3a97b9a722130a8e7cea8c9
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This introduces a dummy HMM device class so that a device driver can use it
+to create an hmm_device for the sole purpose of registering device memory.
+It is useful for device drivers that want to manage multiple physical device
+memories under the same struct device umbrella.
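+
+For example (sketch only; foo_devmem_ops and size are driver-side
+assumptions):
+
+    hmm_device = hmm_device_new(drvdata);
+    if (IS_ERR(hmm_device))
+        return PTR_ERR(hmm_device);
+
+    devmem = hmm_devmem_add(&foo_devmem_ops, &hmm_device->device, size);
+
+    /* on teardown, once the memory has been removed: */
+    hmm_device_put(hmm_device);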
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-13-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 22 +++++++++++++-
+ mm/hmm.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 102 insertions(+), 1 deletion(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -72,11 +72,11 @@
+
+ #if IS_ENABLED(CONFIG_HMM)
+
++#include <linux/device.h>
+ #include <linux/migrate.h>
+ #include <linux/memremap.h>
+ #include <linux/completion.h>
+
+-
+ struct hmm;
+
+ /*
+@@ -474,6 +474,26 @@ static inline unsigned long hmm_devmem_p
+
+ return drvdata[1];
+ }
++
++
++/*
++ * struct hmm_device - fake device to hang device memory onto
++ *
++ * @device: device struct
++ * @minor: device minor number
++ */
++struct hmm_device {
++ struct device device;
++ unsigned int minor;
++};
++
++/*
++ * A device driver that wants to handle multiple devices memory through a
++ * single fake device can use hmm_device to do so. This is purely a helper and
++ * it is not strictly needed, in order to make use of any HMM functionality.
++ */
++struct hmm_device *hmm_device_new(void *drvdata);
++void hmm_device_put(struct hmm_device *hmm_device);
+ #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+
+
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -19,6 +19,7 @@
+ */
+ #include <linux/mm.h>
+ #include <linux/hmm.h>
++#include <linux/init.h>
+ #include <linux/rmap.h>
+ #include <linux/swap.h>
+ #include <linux/slab.h>
+@@ -1096,4 +1097,84 @@ void hmm_devmem_remove(struct hmm_devmem
+ devm_release_mem_region(device, start, size);
+ }
+ EXPORT_SYMBOL(hmm_devmem_remove);
++
++/*
++ * A device driver that wants to handle multiple devices memory through a
++ * single fake device can use hmm_device to do so. This is purely a helper
++ * and it is not needed to make use of any HMM functionality.
++ */
++#define HMM_DEVICE_MAX 256
++
++static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
++static DEFINE_SPINLOCK(hmm_device_lock);
++static struct class *hmm_device_class;
++static dev_t hmm_device_devt;
++
++static void hmm_device_release(struct device *device)
++{
++ struct hmm_device *hmm_device;
++
++ hmm_device = container_of(device, struct hmm_device, device);
++ spin_lock(&hmm_device_lock);
++ clear_bit(hmm_device->minor, hmm_device_mask);
++ spin_unlock(&hmm_device_lock);
++
++ kfree(hmm_device);
++}
++
++struct hmm_device *hmm_device_new(void *drvdata)
++{
++ struct hmm_device *hmm_device;
++
++ hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
++ if (!hmm_device)
++ return ERR_PTR(-ENOMEM);
++
++ spin_lock(&hmm_device_lock);
++ hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
++ if (hmm_device->minor >= HMM_DEVICE_MAX) {
++ spin_unlock(&hmm_device_lock);
++ kfree(hmm_device);
++ return ERR_PTR(-EBUSY);
++ }
++ set_bit(hmm_device->minor, hmm_device_mask);
++ spin_unlock(&hmm_device_lock);
++
++ dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
++ hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
++ hmm_device->minor);
++ hmm_device->device.release = hmm_device_release;
++ dev_set_drvdata(&hmm_device->device, drvdata);
++ hmm_device->device.class = hmm_device_class;
++ device_initialize(&hmm_device->device);
++
++ return hmm_device;
++}
++EXPORT_SYMBOL(hmm_device_new);
++
++void hmm_device_put(struct hmm_device *hmm_device)
++{
++ put_device(&hmm_device->device);
++}
++EXPORT_SYMBOL(hmm_device_put);
++
++static int __init hmm_init(void)
++{
++ int ret;
++
++ ret = alloc_chrdev_region(&hmm_device_devt, 0,
++ HMM_DEVICE_MAX,
++ "hmm_device");
++ if (ret)
++ return ret;
++
++ hmm_device_class = class_create(THIS_MODULE, "hmm_device");
++ if (IS_ERR(hmm_device_class)) {
++ unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
++ return PTR_ERR(hmm_device_class);
++ }
++ return 0;
++}
++
++device_initcall(hmm_init);
+ #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
diff --git a/patches.suse/mm-hmm-fix-build-when-hmm-is-disabled.patch b/patches.suse/mm-hmm-fix-build-when-hmm-is-disabled.patch
new file mode 100644
index 0000000000..fb0e1747ac
--- /dev/null
+++ b/patches.suse/mm-hmm-fix-build-when-hmm-is-disabled.patch
@@ -0,0 +1,68 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:35 -0700
+Subject: mm/hmm: fix build when HMM is disabled
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: de540a9763efcd5b6339158ac2e5932fb3e691b9
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+Combinatorial Kconfig is painful. With this patch, all of the combinations
+below build:
+
+1)
+
+2)
+CONFIG_HMM_MIRROR=y
+
+3)
+CONFIG_DEVICE_PRIVATE=y
+
+4)
+CONFIG_DEVICE_PUBLIC=y
+
+5)
+CONFIG_HMM_MIRROR=y
+CONFIG_DEVICE_PUBLIC=y
+
+6)
+CONFIG_HMM_MIRROR=y
+CONFIG_DEVICE_PRIVATE=y
+
+7)
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_DEVICE_PUBLIC=y
+
+8)
+CONFIG_HMM_MIRROR=y
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_DEVICE_PUBLIC=y
+
+Link: http://lkml.kernel.org/r/20170826002149.20919-1-jglisse@redhat.com
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -498,7 +498,7 @@ struct hmm_device {
+ struct hmm_device *hmm_device_new(void *drvdata);
+ void hmm_device_put(struct hmm_device *hmm_device);
+ #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+-
++#endif /* IS_ENABLED(CONFIG_HMM) */
+
+ /* Below are for HMM internal use only! Not to be used by device driver! */
+ #if IS_ENABLED(CONFIG_HMM_MIRROR)
+@@ -517,5 +517,4 @@ static inline void hmm_mm_init(struct mm
+ #else /* IS_ENABLED(CONFIG_HMM) */
+ static inline void hmm_mm_destroy(struct mm_struct *mm) {}
+ static inline void hmm_mm_init(struct mm_struct *mm) {}
+-#endif /* IS_ENABLED(CONFIG_HMM) */
+ #endif /* LINUX_HMM_H */
diff --git a/patches.suse/mm-hmm-heterogeneous-memory-management-hmm-for-short.patch b/patches.suse/mm-hmm-heterogeneous-memory-management-hmm-for-short.patch
new file mode 100644
index 0000000000..266462405c
--- /dev/null
+++ b/patches.suse/mm-hmm-heterogeneous-memory-management-hmm-for-short.patch
@@ -0,0 +1,362 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:23 -0700
+Subject: mm/hmm: heterogeneous memory management (HMM for short)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 133ff0eac95b7dc6edf89dc51bd139a0630bbae7
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+HMM provides 3 separate types of functionality:
+ - Mirroring: synchronize CPU page table and device page table
+ - Device memory: allocating struct page for device memory
+ - Migration: migrating regular memory to device memory
+
+This patch introduces the common helpers and definitions shared by all
+three of those functionalities.
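+
+For example (sketch only), the hmm_pfn_t helpers introduced here convert
+between pfn/struct page values and the flag-carrying HMM representation:
+
+    hmm_pfn_t hpfn = hmm_pfn_t_from_page(page);  /* sets HMM_PFN_VALID */
+    struct page *p = hmm_pfn_t_to_page(hpfn);    /* NULL if not valid */
+    unsigned long pfn = hmm_pfn_t_to_pfn(hpfn);  /* -1UL if not valid */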
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-3-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 152 +++++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/mm_types.h | 6 +
+ kernel/fork.c | 3
+ mm/Kconfig | 13 ++++
+ mm/Makefile | 2
+ mm/hmm.c | 74 ++++++++++++++++++++++
+ 6 files changed, 249 insertions(+), 1 deletion(-)
+
+--- /dev/null
++++ b/include/linux/hmm.h
+@@ -0,0 +1,152 @@
++/*
++ * Copyright 2013 Red Hat Inc.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * Authors: Jérôme Glisse <jglisse@redhat.com>
++ */
++/*
++ * Heterogeneous Memory Management (HMM)
++ *
++ * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it
++ * is for. Here we focus on the HMM API description, with some explanation of
++ * the underlying implementation.
++ *
++ * Short description: HMM provides a set of helpers to share a virtual address
++ * space between CPU and a device, so that the device can access any valid
++ * address of the process (while still obeying memory protection). HMM also
++ * provides helpers to migrate process memory to device memory, and back. Each
++ * set of functionality (address space mirroring, and migration to and from
++ * device memory) can be used independently of the other.
++ *
++ *
++ * HMM address space mirroring API:
++ *
++ * Use HMM address space mirroring if you want to mirror range of the CPU page
++ * table of a process into a device page table. Here, "mirror" means "keep
++ * synchronized". Prerequisites: the device must provide the ability to write-
++ * protect its page tables (at PAGE_SIZE granularity), and must be able to
++ * recover from the resulting potential page faults.
++ *
++ * HMM guarantees that at any point in time, a given virtual address points to
++ * either the same memory in both CPU and device page tables (that is: CPU and
++ * device page tables each point to the same pages), or that one page table (CPU
++ * or device) points to no entry, while the other still points to the old page
++ * for the address. The latter case happens when the CPU page table update
++ * happens first, and then the update is mirrored over to the device page table.
++ * This does not cause any issue, because the CPU page table cannot start
++ * pointing to a new page until the device page table is invalidated.
++ *
++ * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
++ * updates to each device driver that has registered a mirror. It also provides
++ * some API calls to help with taking a snapshot of the CPU page table, and to
++ * synchronize with any updates that might happen concurrently.
++ *
++ *
++ * HMM migration to and from device memory:
++ *
++ * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
++ * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
++ * of the device memory, and allows the device driver to manage its memory
++ * using those struct pages. Having struct pages for device memory makes
++ * migration easier. Because that memory is not addressable by the CPU it must
++ * never be pinned to the device; in other words, any CPU page fault can always
++ * cause the device memory to be migrated (copied/moved) back to regular memory.
++ *
++ * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
++ * allows use of a device DMA engine to perform the copy operation between
++ * regular system memory and device memory.
++ */
++#ifndef LINUX_HMM_H
++#define LINUX_HMM_H
++
++#include <linux/kconfig.h>
++
++#if IS_ENABLED(CONFIG_HMM)
++
++
++/*
++ * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
++ *
++ * Flags:
++ * HMM_PFN_VALID: pfn is valid
++ * HMM_PFN_WRITE: CPU page table has write permission set
++ */
++typedef unsigned long hmm_pfn_t;
++
++#define HMM_PFN_VALID (1 << 0)
++#define HMM_PFN_WRITE (1 << 1)
++#define HMM_PFN_SHIFT 2
++
++/*
++ * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
++ * @pfn: hmm_pfn_t to convert to struct page
++ * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise
++ *
++ * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page
++ * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL.
++ */
++static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn)
++{
++ if (!(pfn & HMM_PFN_VALID))
++ return NULL;
++ return pfn_to_page(pfn >> HMM_PFN_SHIFT);
++}
++
++/*
++ * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t
++ * @pfn: hmm_pfn_t to extract pfn from
++ * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise
++ */
++static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn)
++{
++ if (!(pfn & HMM_PFN_VALID))
++ return -1UL;
++ return (pfn >> HMM_PFN_SHIFT);
++}
++
++/*
++ * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page
++ * @page: struct page pointer for which to create the hmm_pfn_t
++ * Returns: valid hmm_pfn_t for the page
++ */
++static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page)
++{
++ return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID;
++}
++
++/*
++ * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn
++ * @pfn: pfn value for which to create the hmm_pfn_t
++ * Returns: valid hmm_pfn_t for the pfn
++ */
++static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn)
++{
++ return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID;
++}
++
++
++/* Below are for HMM internal use only! Not to be used by device driver! */
++void hmm_mm_destroy(struct mm_struct *mm);
++
++static inline void hmm_mm_init(struct mm_struct *mm)
++{
++ mm->hmm = NULL;
++}
++
++#else /* IS_ENABLED(CONFIG_HMM) */
++
++/* Below are for HMM internal use only! Not to be used by device driver! */
++static inline void hmm_mm_destroy(struct mm_struct *mm) {}
++static inline void hmm_mm_init(struct mm_struct *mm) {}
++
++#endif /* IS_ENABLED(CONFIG_HMM) */
++#endif /* LINUX_HMM_H */
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -23,6 +23,7 @@
+
+ struct address_space;
+ struct mem_cgroup;
++struct hmm;
+
+ /*
+ * Each physical page in the system has a struct page associated with
+@@ -503,6 +504,11 @@ struct mm_struct {
+ atomic_long_t hugetlb_usage;
+ #endif
+ struct work_struct async_put_work;
++
++#if IS_ENABLED(CONFIG_HMM)
++ /* HMM needs to track a few things per mm */
++ struct hmm *hmm;
++#endif
+ };
+
+ extern struct mm_struct init_mm;
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -37,6 +37,7 @@
+ #include <linux/binfmts.h>
+ #include <linux/mman.h>
+ #include <linux/mmu_notifier.h>
++#include <linux/hmm.h>
+ #include <linux/fs.h>
+ #include <linux/mm.h>
+ #include <linux/vmacache.h>
+@@ -811,6 +812,7 @@ static struct mm_struct *mm_init(struct
+ mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
+ mmu_notifier_mm_init(mm);
++ hmm_mm_init(mm);
+ init_tlb_flush_pending(mm);
+ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+@@ -890,6 +892,7 @@ void __mmdrop(struct mm_struct *mm)
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
++ hmm_mm_destroy(mm);
+ mmu_notifier_mm_destroy(mm);
+ check_mm(mm);
+ put_user_ns(mm->user_ns);
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -700,6 +700,19 @@ config ZONE_DEVICE
+
+ If FS_DAX is enabled, then say Y.
+
++config ARCH_HAS_HMM
++ bool
++ default y
++ depends on (X86_64 || PPC64)
++ depends on ZONE_DEVICE
++ depends on MMU && 64BIT
++ depends on MEMORY_HOTPLUG
++ depends on MEMORY_HOTREMOVE
++ depends on SPARSEMEM_VMEMMAP
++
++config HMM
++ bool
++
+ config FRAME_VECTOR
+ bool
+
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+ compaction.o vmacache.o swap_slots.o \
+ interval_tree.o list_lru.o workingset.o \
+- debug.o $(mmu-y)
++ debug.o hmm.o $(mmu-y)
+
+ obj-y += init-mm.o
+
+--- /dev/null
++++ b/mm/hmm.c
+@@ -0,0 +1,74 @@
++/*
++ * Copyright 2013 Red Hat Inc.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * Authors: Jérôme Glisse <jglisse@redhat.com>
++ */
++/*
++ * Refer to include/linux/hmm.h for information about heterogeneous memory
++ * management or HMM for short.
++ */
++#include <linux/mm.h>
++#include <linux/hmm.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++
++
++#ifdef CONFIG_HMM
++/*
++ * struct hmm - HMM per mm struct
++ *
++ * @mm: mm struct this HMM struct is bound to
++ */
++struct hmm {
++ struct mm_struct *mm;
++};
++
++/*
++ * hmm_register - register HMM against an mm (HMM internal)
++ *
++ * @mm: mm struct to attach to
++ *
++ * This is not intended to be used directly by device drivers. It allocates an
++ * HMM struct if mm does not have one, and initializes it.
++ */
++static struct hmm *hmm_register(struct mm_struct *mm)
++{
++ if (!mm->hmm) {
++ struct hmm *hmm = NULL;
++
++ hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
++ if (!hmm)
++ return NULL;
++ hmm->mm = mm;
++
++ spin_lock(&mm->page_table_lock);
++ if (!mm->hmm)
++ mm->hmm = hmm;
++ else
++ kfree(hmm);
++ spin_unlock(&mm->page_table_lock);
++ }
++
++ /*
++ * The hmm struct can only be freed once the mm_struct goes away,
++ * hence we should always have pre-allocated an new hmm struct
++ * above.
++ */
++ return mm->hmm;
++}
++
++void hmm_mm_destroy(struct mm_struct *mm)
++{
++ kfree(mm->hmm);
++}
++#endif /* CONFIG_HMM */
diff --git a/patches.suse/mm-hmm-mirror-device-page-fault-handler.patch b/patches.suse/mm-hmm-mirror-device-page-fault-handler.patch
new file mode 100644
index 0000000000..4f48d11381
--- /dev/null
+++ b/patches.suse/mm-hmm-mirror-device-page-fault-handler.patch
@@ -0,0 +1,448 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:35 -0700
+Subject: mm/hmm/mirror: device page fault handler
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 74eee180b935fcb9b83a56dd7648fb75caf38f0e
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This handles page faults on behalf of a device driver; unlike
+handle_mm_fault(), it does not trigger migration of device memory back to
+system memory.
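+
+A mirroring driver would call it with mmap_sem held in read mode, roughly as
+follows (sketch only; setting up the hmm_range and the pfns array is the
+driver's responsibility):
+
+    down_read(&mm->mmap_sem);
+    ret = hmm_vma_fault(vma, &range, start, end, pfns, true, false);
+    if (ret == -EAGAIN)
+        goto again;    /* mmap_sem was dropped, retake it and retry */
+    /* inspect pfns[] for the per-address fault result */
+    hmm_vma_range_done(vma, &range);
+    up_read(&mm->mmap_sem);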
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-6-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 27 +++++
+ mm/hmm.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 271 insertions(+), 12 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -292,6 +292,33 @@ int hmm_vma_get_pfns(struct vm_area_stru
+ unsigned long end,
+ hmm_pfn_t *pfns);
+ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
++
++
++/*
++ * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
++ * not migrate any device memory back to system memory. The hmm_pfn_t array will
++ * be updated with the fault result and current snapshot of the CPU page table
++ * for the range.
++ *
++ * The mmap_sem must be taken in read mode before entering and it might be
++ * dropped by the function if the block argument is false. In that case, the
++ * function returns -EAGAIN.
++ *
++ * Return value does not reflect if the fault was successful for every single
++ * address or not. Therefore, the caller must inspect the hmm_pfn_t array to
++ * determine fault status for each address.
++ *
++ * Trying to fault inside an invalid vma will result in -EINVAL.
++ *
++ * See the function description in mm/hmm.c for further documentation.
++ */
++int hmm_vma_fault(struct vm_area_struct *vma,
++ struct hmm_range *range,
++ unsigned long start,
++ unsigned long end,
++ hmm_pfn_t *pfns,
++ bool write,
++ bool block);
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -221,6 +221,36 @@ void hmm_mirror_unregister(struct hmm_mi
+ }
+ EXPORT_SYMBOL(hmm_mirror_unregister);
+
++struct hmm_vma_walk {
++ struct hmm_range *range;
++ unsigned long last;
++ bool fault;
++ bool block;
++ bool write;
++};
++
++static int hmm_vma_do_fault(struct mm_walk *walk,
++ unsigned long addr,
++ hmm_pfn_t *pfn)
++{
++ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
++ struct hmm_vma_walk *hmm_vma_walk = walk->private;
++ struct vm_area_struct *vma = walk->vma;
++ int r;
++
++ flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
++ flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
++ r = handle_mm_fault(vma, addr, flags);
++ if (r & VM_FAULT_RETRY)
++ return -EBUSY;
++ if (r & VM_FAULT_ERROR) {
++ *pfn = HMM_PFN_ERROR;
++ return -EFAULT;
++ }
++
++ return -EAGAIN;
++}
++
+ static void hmm_pfns_special(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+@@ -244,34 +274,62 @@ static int hmm_pfns_bad(unsigned long ad
+ return 0;
+ }
+
++static void hmm_pfns_clear(hmm_pfn_t *pfns,
++ unsigned long addr,
++ unsigned long end)
++{
++ for (; addr < end; addr += PAGE_SIZE, pfns++)
++ *pfns = 0;
++}
++
+ static int hmm_vma_walk_hole(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+ {
+- struct hmm_range *range = walk->private;
++ struct hmm_vma_walk *hmm_vma_walk = walk->private;
++ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
++ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+- for (; addr < end; addr += PAGE_SIZE, i++)
++ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = HMM_PFN_EMPTY;
++ if (hmm_vma_walk->fault) {
++ int ret;
+
+- return 0;
++ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
++ if (ret != -EAGAIN)
++ return ret;
++ }
++ }
++
++ return hmm_vma_walk->fault ? -EAGAIN : 0;
+ }
+
+ static int hmm_vma_walk_clear(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+ {
+- struct hmm_range *range = walk->private;
++ struct hmm_vma_walk *hmm_vma_walk = walk->private;
++ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
++ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+- for (; addr < end; addr += PAGE_SIZE, i++)
++ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = 0;
++ if (hmm_vma_walk->fault) {
++ int ret;
+
+- return 0;
++ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
++ if (ret != -EAGAIN)
++ return ret;
++ }
++ }
++
++ return hmm_vma_walk->fault ? -EAGAIN : 0;
+ }
+
+ static int hmm_vma_walk_pmd(pmd_t *pmdp,
+@@ -279,15 +337,18 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
+ unsigned long end,
+ struct mm_walk *walk)
+ {
+- struct hmm_range *range = walk->private;
++ struct hmm_vma_walk *hmm_vma_walk = walk->private;
++ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long addr = start, i;
++ bool write_fault;
+ hmm_pfn_t flag;
+ pte_t *ptep;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
++ write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;
+
+ again:
+ if (pmd_none(*pmdp))
+@@ -316,6 +377,9 @@ again:
+ if (pmd_protnone(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
++ if (write_fault && !pmd_write(pmd))
++ return hmm_vma_walk_clear(start, end, walk);
++
+ pfn = pmd_pfn(pmd) + pte_index(addr);
+ flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
+@@ -332,13 +396,55 @@ again:
+
+ pfns[i] = 0;
+
+- if (pte_none(pte) || !pte_present(pte)) {
++ if (pte_none(pte)) {
+ pfns[i] = HMM_PFN_EMPTY;
++ if (hmm_vma_walk->fault)
++ goto fault;
+ continue;
+ }
+
++ if (!pte_present(pte)) {
++ swp_entry_t entry;
++
++ entry = pte_to_swp_entry(pte);
++
++ if (!non_swap_entry(entry)) {
++ if (hmm_vma_walk->fault)
++ goto fault;
++ continue;
++ }
++
++ /*
++ * This is a special swap entry, ignore migration, use
++ * device and report anything else as error.
++ */
++ if (is_migration_entry(entry)) {
++ if (hmm_vma_walk->fault) {
++ pte_unmap(ptep);
++ hmm_vma_walk->last = addr;
++ migration_entry_wait(vma->vm_mm,
++ pmdp, addr);
++ return -EAGAIN;
++ }
++ continue;
++ } else {
++ /* Report error for everything else */
++ pfns[i] = HMM_PFN_ERROR;
++ }
++ continue;
++ }
++
++ if (write_fault && !pte_write(pte))
++ goto fault;
++
+ pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
+ pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
++ continue;
++
++fault:
++ pte_unmap(ptep);
++ /* Fault all pages in range */
++ return hmm_vma_walk_clear(start, end, walk);
+ }
+ pte_unmap(ptep - 1);
+
+@@ -371,6 +477,7 @@ int hmm_vma_get_pfns(struct vm_area_stru
+ unsigned long end,
+ hmm_pfn_t *pfns)
+ {
++ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+
+@@ -402,9 +509,12 @@ int hmm_vma_get_pfns(struct vm_area_stru
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
++ hmm_vma_walk.fault = false;
++ hmm_vma_walk.range = range;
++ mm_walk.private = &hmm_vma_walk;
++
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+- mm_walk.private = range;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+@@ -412,7 +522,6 @@ int hmm_vma_get_pfns(struct vm_area_stru
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ walk_page_range(start, end, &mm_walk);
+-
+ return 0;
+ }
+ EXPORT_SYMBOL(hmm_vma_get_pfns);
+@@ -439,7 +548,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
+ *
+ * There are two ways to use this :
+ * again:
+- * hmm_vma_get_pfns(vma, range, start, end, pfns);
++ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * trans = device_build_page_table_update_transaction(pfns);
+ * device_page_table_lock();
+ * if (!hmm_vma_range_done(vma, range)) {
+@@ -450,7 +559,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns);
+ * device_page_table_unlock();
+ *
+ * Or:
+- * hmm_vma_get_pfns(vma, range, start, end, pfns);
++ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * device_page_table_lock();
+ * hmm_vma_range_done(vma, range);
+ * device_update_page_table(pfns);
+@@ -479,4 +588,127 @@ bool hmm_vma_range_done(struct vm_area_s
+ return range->valid;
+ }
+ EXPORT_SYMBOL(hmm_vma_range_done);
++
++/*
++ * hmm_vma_fault() - try to fault some address in a virtual address range
++ * @vma: virtual memory area containing the virtual address range
++ * @range: used to track pfns array content validity
++ * @start: fault range virtual start address (inclusive)
++ * @end: fault range virtual end address (exclusive)
++ * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted
++ * @write: is it a write fault
++ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
++ * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been dropped)
++ *
++ * This is similar to a regular CPU page fault except that it will not trigger
++ * any memory migration if the memory being faulted is not accessible by CPUs.
++ *
++ * On error, for one virtual address in the range, the function will set the
++ * hmm_pfn_t error flag for the corresponding pfn entry.
++ *
++ * Expected use pattern:
++ * retry:
++ * down_read(&mm->mmap_sem);
++ * // Find vma and address device wants to fault, initialize hmm_pfn_t
++ * // array accordingly
++ * ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
++ * switch (ret) {
++ * case -EAGAIN:
++ * hmm_vma_range_done(vma, range);
++ * // You might want to rate limit or yield to play nicely. You may
++ * // also commit any valid pfn in the array, assuming that you got
++ * // true back from hmm_vma_range_done()
++ * goto retry;
++ * case 0:
++ * break;
++ * default:
++ * // Handle error !
++ * up_read(&mm->mmap_sem)
++ * return;
++ * }
++ * // Take device driver lock that serialize device page table update
++ * driver_lock_device_page_table_update();
++ * hmm_vma_range_done(vma, range);
++ * // Commit pfns we got from hmm_vma_fault()
++ * driver_unlock_device_page_table_update();
++ * up_read(&mm->mmap_sem)
++ *
++ * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
++ * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
++ *
++ * YOU HAVE BEEN WARNED !
++ */
++int hmm_vma_fault(struct vm_area_struct *vma,
++ struct hmm_range *range,
++ unsigned long start,
++ unsigned long end,
++ hmm_pfn_t *pfns,
++ bool write,
++ bool block)
++{
++ struct hmm_vma_walk hmm_vma_walk;
++ struct mm_walk mm_walk;
++ struct hmm *hmm;
++ int ret;
++
++ /* Sanity check, this really should not happen ! */
++ if (start < vma->vm_start || start >= vma->vm_end)
++ return -EINVAL;
++ if (end < vma->vm_start || end > vma->vm_end)
++ return -EINVAL;
++
++ hmm = hmm_register(vma->vm_mm);
++ if (!hmm) {
++ hmm_pfns_clear(pfns, start, end);
++ return -ENOMEM;
++ }
++ /* Caller must have registered a mirror using hmm_mirror_register() */
++ if (!hmm->mmu_notifier.ops)
++ return -EINVAL;
++
++ /* Initialize range to track CPU page table update */
++ range->start = start;
++ range->pfns = pfns;
++ range->end = end;
++ spin_lock(&hmm->lock);
++ range->valid = true;
++ list_add_rcu(&range->list, &hmm->ranges);
++ spin_unlock(&hmm->lock);
++
++ /* FIXME support hugetlb fs */
++ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
++ hmm_pfns_special(pfns, start, end);
++ return 0;
++ }
++
++ hmm_vma_walk.fault = true;
++ hmm_vma_walk.write = write;
++ hmm_vma_walk.block = block;
++ hmm_vma_walk.range = range;
++ mm_walk.private = &hmm_vma_walk;
++ hmm_vma_walk.last = range->start;
++
++ mm_walk.vma = vma;
++ mm_walk.mm = vma->vm_mm;
++ mm_walk.pte_entry = NULL;
++ mm_walk.test_walk = NULL;
++ mm_walk.hugetlb_entry = NULL;
++ mm_walk.pmd_entry = hmm_vma_walk_pmd;
++ mm_walk.pte_hole = hmm_vma_walk_hole;
++
++ do {
++ ret = walk_page_range(start, end, &mm_walk);
++ start = hmm_vma_walk.last;
++ } while (ret == -EAGAIN);
++
++ if (ret) {
++ unsigned long i;
++
++ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
++ hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
++ hmm_vma_range_done(vma, range);
++ }
++ return ret;
++}
++EXPORT_SYMBOL(hmm_vma_fault);
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
diff --git a/patches.suse/mm-hmm-mirror-helper-to-snapshot-cpu-page-table.patch b/patches.suse/mm-hmm-mirror-helper-to-snapshot-cpu-page-table.patch
new file mode 100644
index 0000000000..fb1c1a4633
--- /dev/null
+++ b/patches.suse/mm-hmm-mirror-helper-to-snapshot-cpu-page-table.patch
@@ -0,0 +1,446 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:31 -0700
+Subject: mm/hmm/mirror: helper to snapshot CPU page table
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: da4c3c735ea4dcc2a0b0ff0bd4803c336361b6f5
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This does not use the existing page table walker because we want to share the
+same code with our page fault handler.
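+
+A minimal sketch of the intended snapshot sequence (the device_*() functions
+are illustrative placeholders, mirroring the usage pattern documented in the
+code below):
+
+	struct hmm_range range;
+	int ret;
+
+again:
+	ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
+	if (ret)
+		return ret;
+	trans = device_build_page_table_update_transaction(pfns);
+	device_page_table_lock();
+	if (!hmm_vma_range_done(vma, &range)) {
+		/* the CPU page table changed, the snapshot is stale */
+		device_page_table_unlock();
+		goto again;
+	}
+	device_commit_transaction(trans);
+	device_page_table_unlock();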
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-5-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 55 +++++++++-
+ mm/hmm.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 338 insertions(+), 2 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -79,13 +79,26 @@ struct hmm;
+ *
+ * Flags:
+ * HMM_PFN_VALID: pfn is valid
++ * HMM_PFN_READ: CPU page table has read permission set
+ * HMM_PFN_WRITE: CPU page table has write permission set
++ * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
++ * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
++ * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
++ * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
++ * be mirrored by a device, because the entry will never have HMM_PFN_VALID
++ * set and the pfn value is undefined.
++ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
+ */
+ typedef unsigned long hmm_pfn_t;
+
+ #define HMM_PFN_VALID (1 << 0)
+-#define HMM_PFN_WRITE (1 << 1)
+-#define HMM_PFN_SHIFT 2
++#define HMM_PFN_READ (1 << 1)
++#define HMM_PFN_WRITE (1 << 2)
++#define HMM_PFN_ERROR (1 << 3)
++#define HMM_PFN_EMPTY (1 << 4)
++#define HMM_PFN_SPECIAL (1 << 5)
++#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
++#define HMM_PFN_SHIFT 7
+
+ /*
+ * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
+@@ -241,6 +254,44 @@ struct hmm_mirror {
+
+ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
+ void hmm_mirror_unregister(struct hmm_mirror *mirror);
++
++
++/*
++ * struct hmm_range - track invalidation lock on virtual address range
++ *
++ * @list: all range lock are on a list
++ * @start: range virtual start address (inclusive)
++ * @end: range virtual end address (exclusive)
++ * @pfns: array of pfns (big enough for the range)
++ * @valid: pfns array did not change since it has been filled by an HMM function
++ */
++struct hmm_range {
++ struct list_head list;
++ unsigned long start;
++ unsigned long end;
++ hmm_pfn_t *pfns;
++ bool valid;
++};
++
++/*
++ * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
++ * driver lock that serializes device page table updates, then call
++ * hmm_vma_range_done(), to check if the snapshot is still valid. The same
++ * device driver page table update lock must also be used in the
++ * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
++ * table invalidation serializes on it.
++ *
++ * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
++ * hmm_vma_get_pfns() WITHOUT ERROR !
++ *
++ * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
++ */
++int hmm_vma_get_pfns(struct vm_area_struct *vma,
++ struct hmm_range *range,
++ unsigned long start,
++ unsigned long end,
++ hmm_pfn_t *pfns);
++bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -19,8 +19,12 @@
+ */
+ #include <linux/mm.h>
+ #include <linux/hmm.h>
++#include <linux/rmap.h>
++#include <linux/swap.h>
+ #include <linux/slab.h>
+ #include <linux/sched.h>
++#include <linux/swapops.h>
++#include <linux/hugetlb.h>
+ #include <linux/mmu_notifier.h>
+
+
+@@ -31,14 +35,18 @@ static const struct mmu_notifier_ops hmm
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
++ * @lock: lock protecting ranges list
+ * @sequence: we track updates to the CPU page table with a sequence number
++ * @ranges: list of ranges being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ */
+ struct hmm {
+ struct mm_struct *mm;
++ spinlock_t lock;
+ atomic_t sequence;
++ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+@@ -72,6 +80,8 @@ static struct hmm *hmm_register(struct m
+ init_rwsem(&hmm->mirrors_sem);
+ atomic_set(&hmm->sequence, 0);
+ hmm->mmu_notifier.ops = NULL;
++ INIT_LIST_HEAD(&hmm->ranges);
++ spin_lock_init(&hmm->lock);
+ hmm->mm = mm;
+
+ /*
+@@ -112,6 +122,22 @@ static void hmm_invalidate_range(struct
+ unsigned long end)
+ {
+ struct hmm_mirror *mirror;
++ struct hmm_range *range;
++
++ spin_lock(&hmm->lock);
++ list_for_each_entry(range, &hmm->ranges, list) {
++ unsigned long addr, idx, npages;
++
++ if (end < range->start || start >= range->end)
++ continue;
++
++ range->valid = false;
++ addr = max(start, range->start);
++ idx = (addr - range->start) >> PAGE_SHIFT;
++ npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
++ memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
++ }
++ spin_unlock(&hmm->lock);
+
+ down_read(&hmm->mirrors_sem);
+ list_for_each_entry(mirror, &hmm->mirrors, list)
+@@ -194,4 +220,263 @@ void hmm_mirror_unregister(struct hmm_mi
+ up_write(&hmm->mirrors_sem);
+ }
+ EXPORT_SYMBOL(hmm_mirror_unregister);
++
++static void hmm_pfns_special(hmm_pfn_t *pfns,
++ unsigned long addr,
++ unsigned long end)
++{
++ for (; addr < end; addr += PAGE_SIZE, pfns++)
++ *pfns = HMM_PFN_SPECIAL;
++}
++
++static int hmm_pfns_bad(unsigned long addr,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct hmm_range *range = walk->private;
++ hmm_pfn_t *pfns = range->pfns;
++ unsigned long i;
++
++ i = (addr - range->start) >> PAGE_SHIFT;
++ for (; addr < end; addr += PAGE_SIZE, i++)
++ pfns[i] = HMM_PFN_ERROR;
++
++ return 0;
++}
++
++static int hmm_vma_walk_hole(unsigned long addr,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct hmm_range *range = walk->private;
++ hmm_pfn_t *pfns = range->pfns;
++ unsigned long i;
++
++ i = (addr - range->start) >> PAGE_SHIFT;
++ for (; addr < end; addr += PAGE_SIZE, i++)
++ pfns[i] = HMM_PFN_EMPTY;
++
++ return 0;
++}
++
++static int hmm_vma_walk_clear(unsigned long addr,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct hmm_range *range = walk->private;
++ hmm_pfn_t *pfns = range->pfns;
++ unsigned long i;
++
++ i = (addr - range->start) >> PAGE_SHIFT;
++ for (; addr < end; addr += PAGE_SIZE, i++)
++ pfns[i] = 0;
++
++ return 0;
++}
++
++static int hmm_vma_walk_pmd(pmd_t *pmdp,
++ unsigned long start,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct hmm_range *range = walk->private;
++ struct vm_area_struct *vma = walk->vma;
++ hmm_pfn_t *pfns = range->pfns;
++ unsigned long addr = start, i;
++ hmm_pfn_t flag;
++ pte_t *ptep;
++
++ i = (addr - range->start) >> PAGE_SHIFT;
++ flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
++
++again:
++ if (pmd_none(*pmdp))
++ return hmm_vma_walk_hole(start, end, walk);
++
++ if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
++ return hmm_pfns_bad(start, end, walk);
++
++ if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
++ unsigned long pfn;
++ pmd_t pmd;
++
++ /*
++ * No need to take pmd_lock here, even if some other thread
++ * is splitting the huge pmd we will get that event through
++ * the mmu_notifier callback.
++ *
++ * So just read the pmd value and check again that it is a
++ * transparent huge or device mapping, and compute the
++ * corresponding pfn values.
++ */
++ pmd = pmd_read_atomic(pmdp);
++ barrier();
++ if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
++ goto again;
++ if (pmd_protnone(pmd))
++ return hmm_vma_walk_clear(start, end, walk);
++
++ pfn = pmd_pfn(pmd) + pte_index(addr);
++ flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
++ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
++ pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
++ return 0;
++ }
++
++ if (pmd_bad(*pmdp))
++ return hmm_pfns_bad(start, end, walk);
++
++ ptep = pte_offset_map(pmdp, addr);
++ for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
++ pte_t pte = *ptep;
++
++ pfns[i] = 0;
++
++ if (pte_none(pte) || !pte_present(pte)) {
++ pfns[i] = HMM_PFN_EMPTY;
++ continue;
++ }
++
++ pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
++ pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
++ }
++ pte_unmap(ptep - 1);
++
++ return 0;
++}
++
++/*
++ * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
++ * @vma: virtual memory area containing the virtual address range
++ * @range: used to track snapshot validity
++ * @start: range virtual start address (inclusive)
++ * @end: range virtual end address (exclusive)
++ * @entries: array of hmm_pfn_t: provided by the caller, filled in by function
++ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
++ *
++ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
++ * validity is tracked by range struct. See hmm_vma_range_done() for further
++ * information.
++ *
++ * The range struct is initialized here. It tracks the CPU page table, but only
++ * if the function returns success (0), in which case the caller must then call
++ * hmm_vma_range_done() to stop CPU page table update tracking on this range.
++ *
++ * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
++ * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
++ */
++int hmm_vma_get_pfns(struct vm_area_struct *vma,
++ struct hmm_range *range,
++ unsigned long start,
++ unsigned long end,
++ hmm_pfn_t *pfns)
++{
++ struct mm_walk mm_walk;
++ struct hmm *hmm;
++
++ /* FIXME support hugetlb fs */
++ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
++ hmm_pfns_special(pfns, start, end);
++ return -EINVAL;
++ }
++
++ /* Sanity check, this really should not happen ! */
++ if (start < vma->vm_start || start >= vma->vm_end)
++ return -EINVAL;
++ if (end < vma->vm_start || end > vma->vm_end)
++ return -EINVAL;
++
++ hmm = hmm_register(vma->vm_mm);
++ if (!hmm)
++ return -ENOMEM;
++ /* Caller must have registered a mirror, via hmm_mirror_register() ! */
++ if (!hmm->mmu_notifier.ops)
++ return -EINVAL;
++
++ /* Initialize range to track CPU page table update */
++ range->start = start;
++ range->pfns = pfns;
++ range->end = end;
++ spin_lock(&hmm->lock);
++ range->valid = true;
++ list_add_rcu(&range->list, &hmm->ranges);
++ spin_unlock(&hmm->lock);
++
++ mm_walk.vma = vma;
++ mm_walk.mm = vma->vm_mm;
++ mm_walk.private = range;
++ mm_walk.pte_entry = NULL;
++ mm_walk.test_walk = NULL;
++ mm_walk.hugetlb_entry = NULL;
++ mm_walk.pmd_entry = hmm_vma_walk_pmd;
++ mm_walk.pte_hole = hmm_vma_walk_hole;
++
++ walk_page_range(start, end, &mm_walk);
++
++ return 0;
++}
++EXPORT_SYMBOL(hmm_vma_get_pfns);
++
++/*
++ * hmm_vma_range_done() - stop tracking change to CPU page table over a range
++ * @vma: virtual memory area containing the virtual address range
++ * @range: range being tracked
++ * Returns: false if range data has been invalidated, true otherwise
++ *
++ * Range struct is used to track updates to the CPU page table after a call to
++ * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
++ * using the data, or wants to lock updates to the data it got from those
++ * functions, it must call the hmm_vma_range_done() function, which will then
++ * stop tracking CPU page table updates.
++ *
++ * Note that device driver must still implement general CPU page table update
++ * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
++ * the mmu_notifier API directly.
++ *
++ * CPU page table update tracking done through hmm_range is only temporary and
++ * to be used while trying to duplicate CPU page table contents for a range of
++ * virtual addresses.
++ *
++ * There are two ways to use this :
++ * again:
++ * hmm_vma_get_pfns(vma, range, start, end, pfns);
++ * trans = device_build_page_table_update_transaction(pfns);
++ * device_page_table_lock();
++ * if (!hmm_vma_range_done(vma, range)) {
++ * device_page_table_unlock();
++ * goto again;
++ * }
++ * device_commit_transaction(trans);
++ * device_page_table_unlock();
++ *
++ * Or:
++ * hmm_vma_get_pfns(vma, range, start, end, pfns);
++ * device_page_table_lock();
++ * hmm_vma_range_done(vma, range);
++ * device_update_page_table(pfns);
++ * device_page_table_unlock();
++ */
++bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
++{
++ unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
++ struct hmm *hmm;
++
++ if (range->end <= range->start) {
++ BUG();
++ return false;
++ }
++
++ hmm = hmm_register(vma->vm_mm);
++ if (!hmm) {
++ memset(range->pfns, 0, sizeof(*range->pfns) * npages);
++ return false;
++ }
++
++ spin_lock(&hmm->lock);
++ list_del_rcu(&range->list);
++ spin_unlock(&hmm->lock);
++
++ return range->valid;
++}
++EXPORT_SYMBOL(hmm_vma_range_done);
+ #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
diff --git a/patches.suse/mm-hmm-mirror-mirror-process-address-space-on-device-with-hmm-helpers.patch b/patches.suse/mm-hmm-mirror-mirror-process-address-space-on-device-with-hmm-helpers.patch
new file mode 100644
index 0000000000..d6b4e77285
--- /dev/null
+++ b/patches.suse/mm-hmm-mirror-mirror-process-address-space-on-device-with-hmm-helpers.patch
@@ -0,0 +1,389 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:27 -0700
+Subject: mm/hmm/mirror: mirror process address space on device with HMM helpers
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: c0b124054f9e42eb6da545a10fe9122a7d7c3f72
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This is heterogeneous memory management (HMM) process address space
+mirroring. In a nutshell it provides an API to mirror a process address
+space on a device. This boils down to keeping the CPU and device page
+tables synchronized (we assume that both device and CPU are cache
+coherent, as PCIe devices can be).
+
+This patch provides a simple API for device drivers to achieve address
+space mirroring, thus avoiding the need for each device driver to grow its
+own CPU page table walker and synchronization mechanism.
+
+This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more
+hardware in the future.
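+
+A minimal sketch of what a driver registration looks like with this API (the
+my_*() names and struct my_device_as are assumptions for illustration only):
+
+	static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
+						  enum hmm_update_type update,
+						  unsigned long start,
+						  unsigned long end)
+	{
+		/* invalidate the device page table for [start, end) and make
+		 * sure device TLBs are flushed before returning */
+	}
+
+	static const struct hmm_mirror_ops my_mirror_ops = {
+		.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
+	};
+
+	int my_bind_address_space(struct my_device_as *das, struct mm_struct *mm)
+	{
+		das->mirror.ops = &my_mirror_ops;
+		/* mm->mmap_sem must be held in write mode here */
+		return hmm_mirror_register(&das->mirror, mm);
+	}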
+
+[jglisse@redhat.com: fix hmm for "mmu_notifier kill invalidate_page callback"]
+ Link: http://lkml.kernel.org/r/20170830231955.GD9445@redhat.com
+Link: http://lkml.kernel.org/r/20170817000548.32038-4-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/hmm.h | 110 +++++++++++++++++++++++++++++++++++++
+ mm/Kconfig | 12 ++++
+ mm/hmm.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++------
+ 3 files changed, 260 insertions(+), 15 deletions(-)
+
+--- a/include/linux/hmm.h
++++ b/include/linux/hmm.h
+@@ -72,6 +72,7 @@
+
+ #if IS_ENABLED(CONFIG_HMM)
+
++struct hmm;
+
+ /*
+ * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
+@@ -134,6 +135,115 @@ static inline hmm_pfn_t hmm_pfn_t_from_p
+ }
+
+
++#if IS_ENABLED(CONFIG_HMM_MIRROR)
++/*
++ * Mirroring: how to synchronize device page table with CPU page table.
++ *
++ * A device driver that is participating in HMM mirroring must always
++ * synchronize with CPU page table updates. For this, device drivers can either
++ * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
++ * drivers can decide to register one mirror per device per process, or just
++ * one mirror per process for a group of devices. The pattern is:
++ *
++ * int device_bind_address_space(..., struct mm_struct *mm, ...)
++ * {
++ * struct device_address_space *das;
++ *
++ * // Device driver specific initialization, and allocation of das
++ * // which contains an hmm_mirror struct as one of its fields.
++ * ...
++ *
++ * ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops);
++ * if (ret) {
++ * // Cleanup on error
++ * return ret;
++ * }
++ *
++ * // Other device driver specific initialization
++ * ...
++ * }
++ *
++ * Once an hmm_mirror is registered for an address space, the device driver
++ * will get callbacks through sync_cpu_device_pagetables() operation (see
++ * hmm_mirror_ops struct).
++ *
++ * Device driver must not free the struct containing the hmm_mirror struct
++ * before calling hmm_mirror_unregister(). The expected usage is to do that when
++ * the device driver is unbinding from an address space.
++ *
++ *
++ * void device_unbind_address_space(struct device_address_space *das)
++ * {
++ * // Device driver specific cleanup
++ * ...
++ *
++ * hmm_mirror_unregister(&das->mirror);
++ *
++ * // Other device driver specific cleanup, and now das can be freed
++ * ...
++ * }
++ */
++
++struct hmm_mirror;
++
++/*
++ * enum hmm_update_type - type of update
++ * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
++ */
++enum hmm_update_type {
++ HMM_UPDATE_INVALIDATE,
++};
++
++/*
++ * struct hmm_mirror_ops - HMM mirror device operations callback
++ *
++ * @update: callback to update range on a device
++ */
++struct hmm_mirror_ops {
++ /* sync_cpu_device_pagetables() - synchronize page tables
++ *
++ * @mirror: pointer to struct hmm_mirror
++ * @update_type: type of update that occurred to the CPU page table
++ * @start: virtual start address of the range to update
++ * @end: virtual end address of the range to update
++ *
++ * This callback ultimately originates from mmu_notifiers when the CPU
++ * page table is updated. The device driver must update its page table
++ * in response to this callback. The update argument tells what action
++ * to perform.
++ *
++ * The device driver must not return from this callback until the device
++ * page tables are completely updated (TLBs flushed, etc); this is a
++ * synchronous call.
++ */
++ void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
++ enum hmm_update_type update_type,
++ unsigned long start,
++ unsigned long end);
++};
++
++/*
++ * struct hmm_mirror - mirror struct for a device driver
++ *
++ * @hmm: pointer to struct hmm (which is unique per mm_struct)
++ * @ops: device driver callback for HMM mirror operations
++ * @list: for list of mirrors of a given mm
++ *
++ * Each address space (mm_struct) being mirrored by a device must register one
++ * instance of an hmm_mirror struct with HMM. HMM will track the list of all
++ * mirrors for each mm_struct.
++ */
++struct hmm_mirror {
++ struct hmm *hmm;
++ const struct hmm_mirror_ops *ops;
++ struct list_head list;
++};
++
++int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
++void hmm_mirror_unregister(struct hmm_mirror *mirror);
++#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
++
++
+ /* Below are for HMM internal use only! Not to be used by device driver! */
+ void hmm_mm_destroy(struct mm_struct *mm);
+
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -713,6 +713,18 @@ config ARCH_HAS_HMM
+ config HMM
+ bool
+
++config HMM_MIRROR
++ bool "HMM mirror CPU page table into a device page table"
++ depends on ARCH_HAS_HMM
++ select MMU_NOTIFIER
++ select HMM
++ help
++ Select HMM_MIRROR if you want to mirror a range of the CPU page table of a
++ process into a device page table. Here, mirror means "keep synchronized".
++ Prerequisites: the device must provide the ability to write-protect its
++ page tables (at PAGE_SIZE granularity), and must be able to recover from
++ the resulting potential page faults.
++
+ config FRAME_VECTOR
+ bool
+
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -21,16 +21,27 @@
+ #include <linux/hmm.h>
+ #include <linux/slab.h>
+ #include <linux/sched.h>
++#include <linux/mmu_notifier.h>
+
+
+ #ifdef CONFIG_HMM
++static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
++
+ /*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
++ * @sequence: we track updates to the CPU page table with a sequence number
++ * @mirrors: list of mirrors for this mm
++ * @mmu_notifier: mmu notifier to track updates to CPU page table
++ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ */
+ struct hmm {
+ struct mm_struct *mm;
++ atomic_t sequence;
++ struct list_head mirrors;
++ struct mmu_notifier mmu_notifier;
++ struct rw_semaphore mirrors_sem;
+ };
+
+ /*
+@@ -43,27 +54,48 @@ struct hmm {
+ */
+ static struct hmm *hmm_register(struct mm_struct *mm)
+ {
+- if (!mm->hmm) {
+- struct hmm *hmm = NULL;
+-
+- hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+- if (!hmm)
+- return NULL;
+- hmm->mm = mm;
+-
+- spin_lock(&mm->page_table_lock);
+- if (!mm->hmm)
+- mm->hmm = hmm;
+- else
+- kfree(hmm);
+- spin_unlock(&mm->page_table_lock);
+- }
++ struct hmm *hmm = READ_ONCE(mm->hmm);
++ bool cleanup = false;
+
+ /*
+ * The hmm struct can only be freed once the mm_struct goes away,
+ * hence we should always have pre-allocated a new hmm struct
+ * above.
+ */
++ if (hmm)
++ return hmm;
++
++ hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
++ if (!hmm)
++ return NULL;
++ INIT_LIST_HEAD(&hmm->mirrors);
++ init_rwsem(&hmm->mirrors_sem);
++ atomic_set(&hmm->sequence, 0);
++ hmm->mmu_notifier.ops = NULL;
++ hmm->mm = mm;
++
++ /*
++ * We should only get here if we hold the mmap_sem in write mode, i.e. on
++ * registration of the first mirror through hmm_mirror_register()
++ */
++ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
++ if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
++ kfree(hmm);
++ return NULL;
++ }
++
++ spin_lock(&mm->page_table_lock);
++ if (!mm->hmm)
++ mm->hmm = hmm;
++ else
++ cleanup = true;
++ spin_unlock(&mm->page_table_lock);
++
++ if (cleanup) {
++ mmu_notifier_unregister(&hmm->mmu_notifier, mm);
++ kfree(hmm);
++ }
++
+ return mm->hmm;
+ }
+
+@@ -72,3 +104,94 @@ void hmm_mm_destroy(struct mm_struct *mm
+ kfree(mm->hmm);
+ }
+ #endif /* CONFIG_HMM */
++
++#if IS_ENABLED(CONFIG_HMM_MIRROR)
++static void hmm_invalidate_range(struct hmm *hmm,
++ enum hmm_update_type action,
++ unsigned long start,
++ unsigned long end)
++{
++ struct hmm_mirror *mirror;
++
++ down_read(&hmm->mirrors_sem);
++ list_for_each_entry(mirror, &hmm->mirrors, list)
++ mirror->ops->sync_cpu_device_pagetables(mirror, action,
++ start, end);
++ up_read(&hmm->mirrors_sem);
++}
++
++static void hmm_invalidate_range_start(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long start,
++ unsigned long end)
++{
++ struct hmm *hmm = mm->hmm;
++
++ VM_BUG_ON(!hmm);
++
++ atomic_inc(&hmm->sequence);
++}
++
++static void hmm_invalidate_range_end(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long start,
++ unsigned long end)
++{
++ struct hmm *hmm = mm->hmm;
++
++ VM_BUG_ON(!hmm);
++
++ hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
++}
++
++static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
++ .invalidate_range_start = hmm_invalidate_range_start,
++ .invalidate_range_end = hmm_invalidate_range_end,
++};
++
++/*
++ * hmm_mirror_register() - register a mirror against an mm
++ *
++ * @mirror: new mirror struct to register
++ * @mm: mm to register against
++ *
++ * To start mirroring a process address space, the device driver must register
++ * an HMM mirror struct.
++ *
++ * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
++ */
++int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
++{
++ /* Sanity check */
++ if (!mm || !mirror || !mirror->ops)
++ return -EINVAL;
++
++ mirror->hmm = hmm_register(mm);
++ if (!mirror->hmm)
++ return -ENOMEM;
++
++ down_write(&mirror->hmm->mirrors_sem);
++ list_add(&mirror->list, &mirror->hmm->mirrors);
++ up_write(&mirror->hmm->mirrors_sem);
++
++ return 0;
++}
++EXPORT_SYMBOL(hmm_mirror_register);
++
++/*
++ * hmm_mirror_unregister() - unregister a mirror
++ *
++ * @mirror: new mirror struct to register
++ *
++ * Stop mirroring a process address space, and cleanup.
++ */
++void hmm_mirror_unregister(struct hmm_mirror *mirror)
++{
++ struct hmm *hmm = mirror->hmm;
++
++ down_write(&hmm->mirrors_sem);
++ list_del(&mirror->list);
++ up_write(&hmm->mirrors_sem);
++}
++EXPORT_SYMBOL(hmm_mirror_unregister);
++#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
diff --git a/patches.suse/mm-memcontrol-allow-to-uncharge-page-without-using-page-lru-field.patch b/patches.suse/mm-memcontrol-allow-to-uncharge-page-without-using-page-lru-field.patch
new file mode 100644
index 0000000000..fc207a827e
--- /dev/null
+++ b/patches.suse/mm-memcontrol-allow-to-uncharge-page-without-using-page-lru-field.patch
@@ -0,0 +1,261 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:50 -0700
+Subject: mm/memcontrol: allow to uncharge page without using page->lru field
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: a9d5adeeb4b2c73c8972180b28d0e05e7d718d06
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+HMM pages (private or public device pages) are ZONE_DEVICE pages and
+thus the page->lru field of those pages cannot be used. This patch
+re-arranges the uncharge code so that a single page can be uncharged
+without modifying the lru field of the struct page.
+
+There is no change to the memcontrol logic; it is the same as it was
+before this patch.
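+
+After this patch, uncharging boils down to the following sketch; the gather
+struct batches per-memcg counters and never touches page->lru:
+
+	struct uncharge_gather ug;
+
+	uncharge_gather_clear(&ug);
+	uncharge_page(page, &ug);	/* accumulate per-memcg counters */
+	if (ug.memcg)
+		uncharge_batch(&ug);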
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-10-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ mm/memcontrol.c | 168 ++++++++++++++++++++++++++++++--------------------------
+ 1 file changed, 92 insertions(+), 76 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5510,48 +5510,102 @@ void mem_cgroup_cancel_charge(struct pag
+ cancel_charge(memcg, nr_pages);
+ }
+
+-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+- unsigned long nr_anon, unsigned long nr_file,
+- unsigned long nr_kmem, unsigned long nr_huge,
+- unsigned long nr_shmem, struct page *dummy_page)
++struct uncharge_gather {
++ struct mem_cgroup *memcg;
++ unsigned long pgpgout;
++ unsigned long nr_anon;
++ unsigned long nr_file;
++ unsigned long nr_kmem;
++ unsigned long nr_huge;
++ unsigned long nr_shmem;
++ struct page *dummy_page;
++};
++
++static inline void uncharge_gather_clear(struct uncharge_gather *ug)
++{
++ memset(ug, 0, sizeof(*ug));
++}
++
++static void uncharge_batch(const struct uncharge_gather *ug)
+ {
+- unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
++ unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
+ unsigned long flags;
+
+- if (!mem_cgroup_is_root(memcg)) {
+- page_counter_uncharge(&memcg->memory, nr_pages);
++ if (!mem_cgroup_is_root(ug->memcg)) {
++ page_counter_uncharge(&ug->memcg->memory, nr_pages);
+ if (do_memsw_account())
+- page_counter_uncharge(&memcg->memsw, nr_pages);
+- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
+- page_counter_uncharge(&memcg->kmem, nr_kmem);
+- memcg_oom_recover(memcg);
++ page_counter_uncharge(&ug->memcg->memsw, nr_pages);
++ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
++ page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
++ memcg_oom_recover(ug->memcg);
+ }
+
+ local_irq_save(flags);
+- __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
+- __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
+- __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
+- __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
+- __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
+- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+- memcg_check_events(memcg, dummy_page);
++ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
++ __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
++ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
++ __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
++ __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
++ __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
++ memcg_check_events(ug->memcg, ug->dummy_page);
+ local_irq_restore(flags);
+
+- if (!mem_cgroup_is_root(memcg))
+- css_put_many(&memcg->css, nr_pages);
++ if (!mem_cgroup_is_root(ug->memcg))
++ css_put_many(&ug->memcg->css, nr_pages);
++}
++
++static void uncharge_page(struct page *page, struct uncharge_gather *ug)
++{
++ VM_BUG_ON_PAGE(PageLRU(page), page);
++ VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
++
++ if (!page->mem_cgroup)
++ return;
++
++ /*
++ * Nobody should be changing or seriously looking at
++ * page->mem_cgroup at this point, we have fully
++ * exclusive access to the page.
++ */
++
++ if (ug->memcg != page->mem_cgroup) {
++ if (ug->memcg) {
++ uncharge_batch(ug);
++ uncharge_gather_clear(ug);
++ }
++ ug->memcg = page->mem_cgroup;
++ }
++
++ if (!PageKmemcg(page)) {
++ unsigned int nr_pages = 1;
++
++ if (PageTransHuge(page)) {
++ nr_pages <<= compound_order(page);
++ ug->nr_huge += nr_pages;
++ }
++ if (PageAnon(page))
++ ug->nr_anon += nr_pages;
++ else {
++ ug->nr_file += nr_pages;
++ if (PageSwapBacked(page))
++ ug->nr_shmem += nr_pages;
++ }
++ ug->pgpgout++;
++ } else {
++ ug->nr_kmem += 1 << compound_order(page);
++ __ClearPageKmemcg(page);
++ }
++
++ ug->dummy_page = page;
++ page->mem_cgroup = NULL;
+ }
+
+ static void uncharge_list(struct list_head *page_list)
+ {
+- struct mem_cgroup *memcg = NULL;
+- unsigned long nr_shmem = 0;
+- unsigned long nr_anon = 0;
+- unsigned long nr_file = 0;
+- unsigned long nr_huge = 0;
+- unsigned long nr_kmem = 0;
+- unsigned long pgpgout = 0;
++ struct uncharge_gather ug;
+ struct list_head *next;
+- struct page *page;
++
++ uncharge_gather_clear(&ug);
+
+ /*
+ * Note that the list can be a single page->lru; hence the
+@@ -5559,57 +5613,16 @@ static void uncharge_list(struct list_he
+ */
+ next = page_list->next;
+ do {
++ struct page *page;
++
+ page = list_entry(next, struct page, lru);
+ next = page->lru.next;
+
+- VM_BUG_ON_PAGE(PageLRU(page), page);
+- VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+-
+- if (!page->mem_cgroup)
+- continue;
+-
+- /*
+- * Nobody should be changing or seriously looking at
+- * page->mem_cgroup at this point, we have fully
+- * exclusive access to the page.
+- */
+-
+- if (memcg != page->mem_cgroup) {
+- if (memcg) {
+- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+- nr_kmem, nr_huge, nr_shmem, page);
+- pgpgout = nr_anon = nr_file = nr_kmem = 0;
+- nr_huge = nr_shmem = 0;
+- }
+- memcg = page->mem_cgroup;
+- }
+-
+- if (!PageKmemcg(page)) {
+- unsigned int nr_pages = 1;
+-
+- if (PageTransHuge(page)) {
+- nr_pages <<= compound_order(page);
+- nr_huge += nr_pages;
+- }
+- if (PageAnon(page))
+- nr_anon += nr_pages;
+- else {
+- nr_file += nr_pages;
+- if (PageSwapBacked(page))
+- nr_shmem += nr_pages;
+- }
+- pgpgout++;
+- } else {
+- nr_kmem += 1 << compound_order(page);
+- __ClearPageKmemcg(page);
+- }
+-
+- page->mem_cgroup = NULL;
++ uncharge_page(page, &ug);
+ } while (next != page_list);
+
+- if (memcg)
+- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+- nr_kmem, nr_huge, nr_shmem, page);
++ if (ug.memcg)
++ uncharge_batch(&ug);
+ }
+
+ /**
+@@ -5621,6 +5634,8 @@ static void uncharge_list(struct list_he
+ */
+ void mem_cgroup_uncharge(struct page *page)
+ {
++ struct uncharge_gather ug;
++
+ if (mem_cgroup_disabled())
+ return;
+
+@@ -5628,8 +5643,9 @@ void mem_cgroup_uncharge(struct page *pa
+ if (!page->mem_cgroup)
+ return;
+
+- INIT_LIST_HEAD(&page->lru);
+- uncharge_list(&page->lru);
++ uncharge_gather_clear(&ug);
++ uncharge_page(page, &ug);
++ uncharge_batch(&ug);
+ }
+
+ /**
diff --git a/patches.suse/mm-memcontrol-support-memory_device_private.patch b/patches.suse/mm-memcontrol-support-memory_device_private.patch
new file mode 100644
index 0000000000..bda11689b1
--- /dev/null
+++ b/patches.suse/mm-memcontrol-support-memory_device_private.patch
@@ -0,0 +1,189 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:54 -0700
+Subject: mm/memcontrol: support MEMORY_DEVICE_PRIVATE
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: c733a82874a79261866a4178edbb608847df4879
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+HMM pages (private or public device pages) are ZONE_DEVICE pages and thus
+need special handling when it comes to lru or refcount. This patch makes
+sure that memcontrol properly handles them when it faces them. Those pages
+are used like regular pages in a process address space, either as anonymous
+pages or as file-backed pages, so from the memcg point of view we want to
+handle them like regular pages, for now at least.
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-11-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Acked-by: Balbir Singh <bsingharora@gmail.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ kernel/memremap.c | 1 +
+ mm/memcontrol.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 49 insertions(+), 4 deletions(-)
+
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -518,6 +518,7 @@ void put_zone_device_private_page(struct
+ __ClearPageWaiters(page);
+
+ page->mapping = NULL;
++ mem_cgroup_uncharge(page);
+
+ page->pgmap->page_free(page, page->pgmap->data);
+ } else if (!count)
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4417,12 +4417,13 @@ enum mc_target_type {
+ MC_TARGET_NONE = 0,
+ MC_TARGET_PAGE,
+ MC_TARGET_SWAP,
++ MC_TARGET_DEVICE,
+ };
+
+ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent)
+ {
+- struct page *page = vm_normal_page(vma, addr, ptent);
++ struct page *page = _vm_normal_page(vma, addr, ptent, true);
+
+ if (!page || !page_mapped(page))
+ return NULL;
+@@ -4439,7 +4440,7 @@ static struct page *mc_handle_present_pt
+ return page;
+ }
+
+-#ifdef CONFIG_SWAP
++#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
+ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ pte_t ptent, swp_entry_t *entry)
+ {
+@@ -4448,6 +4449,23 @@ static struct page *mc_handle_swap_pte(s
+
+ if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
+ return NULL;
++
++ /*
++ * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE pages belonging to
++ * a device and, because they are not accessible by the CPU, are stored
++ * as special swap entries in the CPU page table.
++ */
++ if (is_device_private_entry(ent)) {
++ page = device_private_entry_to_page(ent);
++ /*
++ * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
++ * a refcount of 1 when free (unlike a normal page)
++ */
++ if (!page_ref_add_unless(page, 1, 1))
++ return NULL;
++ return page;
++ }
++
+ /*
+ * Because lookup_swap_cache() updates some statistics counter,
+ * we call find_get_page() with swapper_space directly.
+@@ -4608,6 +4626,12 @@ out:
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
++ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
++ * (so a ZONE_DEVICE page and thus not on the lru). For now such a page is
++ * charged like a regular page would be, as for all intents and purposes it
++ * is just special memory taking the place of a regular page.
++ *
++ * See Documentations/vm/hmm.txt and include/linux/hmm.h
+ *
+ * Called with pte lock held.
+ */
+@@ -4636,6 +4660,8 @@ static enum mc_target_type get_mctgt_typ
+ */
+ if (page->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
++ if (is_device_private_page(page))
++ ret = MC_TARGET_DEVICE;
+ if (target)
+ target->page = page;
+ }
+@@ -4695,6 +4721,11 @@ static int mem_cgroup_count_precharge_pt
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
++ /*
++ * Note there can not be MC_TARGET_DEVICE for now as we do not
++ * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
++ * MEMORY_DEVICE_PRIVATE, but this might change.
++ */
+ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+ mc.precharge += HPAGE_PMD_NR;
+ spin_unlock(ptl);
+@@ -4910,6 +4941,14 @@ static int mem_cgroup_move_charge_pte_ra
+ putback_lru_page(page);
+ }
+ put_page(page);
++ } else if (target_type == MC_TARGET_DEVICE) {
++ page = target.page;
++ if (!mem_cgroup_move_account(page, true,
++ mc.from, mc.to)) {
++ mc.precharge -= HPAGE_PMD_NR;
++ mc.moved_charge += HPAGE_PMD_NR;
++ }
++ put_page(page);
+ }
+ spin_unlock(ptl);
+ return 0;
+@@ -4921,12 +4960,16 @@ retry:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; addr += PAGE_SIZE) {
+ pte_t ptent = *(pte++);
++ bool device = false;
+ swp_entry_t ent;
+
+ if (!mc.precharge)
+ break;
+
+ switch (get_mctgt_type(vma, addr, ptent, &target)) {
++ case MC_TARGET_DEVICE:
++ device = true;
++ /* fall through */
+ case MC_TARGET_PAGE:
+ page = target.page;
+ /*
+@@ -4937,7 +4980,7 @@ retry:
+ */
+ if (PageTransCompound(page))
+ goto put;
+- if (isolate_lru_page(page))
++ if (!device && isolate_lru_page(page))
+ goto put;
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
+@@ -4945,7 +4988,8 @@ retry:
+ /* we uncharge from mc.from later. */
+ mc.moved_charge++;
+ }
+- putback_lru_page(page);
++ if (!device)
++ putback_lru_page(page);
+ put: /* get_mctgt_type() gets the page */
+ put_page(page);
+ break;
diff --git a/patches.suse/mm-memory_hotplug-introduce-add_pages.patch b/patches.suse/mm-memory_hotplug-introduce-add_pages.patch
new file mode 100644
index 0000000000..eaeaf241b2
--- /dev/null
+++ b/patches.suse/mm-memory_hotplug-introduce-add_pages.patch
@@ -0,0 +1,130 @@
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 8 Sep 2017 16:11:39 -0700
+Subject: mm/memory_hotplug: introduce add_pages
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 3072e413e305e353cd4654f8a57d953b66e85bf3
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+There are new users of memory hotplug emerging. Some of them require a
+different subset of arch_add_memory. There are some which only require
+allocation of struct pages without mapping those pages into the kernel
+address space. We currently have __add_pages for that purpose. But this
+is rather low-level and not very suitable for code outside of memory
+hotplug. E.g. x86_64 wants to update max_pfn, which should be done by
+the caller. Introduce add_pages(), which takes care of those details if
+they are needed. Each architecture should define its own implementation
+and select CONFIG_ARCH_HAS_ADD_PAGES. All others use the currently
+existing __add_pages.
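+
+An illustrative sketch of the resulting split (nid and res are placeholders
+for whatever the caller hotplugs; want_memblock is simply passed through to
+__add_pages()):
+
+	/* only allocate struct pages; do not map them into the kernel */
+	ret = add_pages(nid, res->start >> PAGE_SHIFT,
+			resource_size(res) >> PAGE_SHIFT, false);
+
+	/* arch_add_memory() keeps mapping the range first and then calls
+	 * add_pages() itself (see the x86_64 hunk below) */
+	ret = arch_add_memory(nid, res->start, resource_size(res), true);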
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-7-jglisse@redhat.com
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Acked-by: Balbir Singh <bsingharora@gmail.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ arch/x86/Kconfig | 4 ++++
+ arch/x86/mm/init_64.c | 22 +++++++++++++++-------
+ include/linux/memory_hotplug.h | 11 +++++++++++
+ 3 files changed, 30 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -2294,6 +2294,10 @@ source "kernel/livepatch/Kconfig"
+
+ endmenu
+
++config ARCH_HAS_ADD_PAGES
++ def_bool y
++ depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
++
+ config ARCH_ENABLE_MEMORY_HOTPLUG
+ def_bool y
+ depends on X86_64 || (X86_32 && HIGHMEM)
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -671,7 +671,7 @@ void __init paging_init(void)
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+-static void update_end_of_memory_vars(u64 start, u64 size)
++static void update_end_of_memory_vars(u64 start, u64 size)
+ {
+ unsigned long end_pfn = PFN_UP(start + size);
+
+@@ -682,22 +682,30 @@ static void update_end_of_memory_vars(u
+ }
+ }
+
+-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
++int add_pages(int nid, unsigned long start_pfn,
++ unsigned long nr_pages, bool want_memblock)
+ {
+- unsigned long start_pfn = start >> PAGE_SHIFT;
+- unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
+
+- init_memory_mapping(start, start + size);
+-
+ ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
+ WARN_ON_ONCE(ret);
+
+ /* update max_pfn, max_low_pfn and high_memory */
+- update_end_of_memory_vars(start, size);
++ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
++ nr_pages << PAGE_SHIFT);
+
+ return ret;
+ }
++
++int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
++{
++ unsigned long start_pfn = start >> PAGE_SHIFT;
++ unsigned long nr_pages = size >> PAGE_SHIFT;
++
++ init_memory_mapping(start, start + size);
++
++ return add_pages(nid, start_pfn, nr_pages, want_memblock);
++}
+ EXPORT_SYMBOL_GPL(arch_add_memory);
+
+ #define PAGE_INUSE 0xFD
+--- a/include/linux/memory_hotplug.h
++++ b/include/linux/memory_hotplug.h
+@@ -127,6 +127,17 @@ extern int __remove_pages(struct zone *z
+ extern int __add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock);
+
++#ifndef CONFIG_ARCH_HAS_ADD_PAGES
++static inline int add_pages(int nid, unsigned long start_pfn,
++ unsigned long nr_pages, bool want_memblock)
++{
++ return __add_pages(nid, start_pfn, nr_pages, want_memblock);
++}
++#else /* ARCH_HAS_ADD_PAGES */
++int add_pages(int nid, unsigned long start_pfn,
++ unsigned long nr_pages, bool want_memblock);
++#endif /* ARCH_HAS_ADD_PAGES */
++
+ #ifdef CONFIG_NUMA
+ extern int memory_add_physaddr_to_nid(u64 start);
+ #else
diff --git a/patches.suse/mm-migrate-allow-migrate_vma-to-alloc-new-page-on-empty-entry.patch b/patches.suse/mm-migrate-allow-migrate_vma-to-alloc-new-page-on-empty-entry.patch
new file mode 100644
index 0000000000..8d94f33014
--- /dev/null
+++ b/patches.suse/mm-migrate-allow-migrate_vma-to-alloc-new-page-on-empty-entry.patch
@@ -0,0 +1,361 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:21 -0700
+Subject: mm/migrate: allow migrate_vma() to alloc new page on empty entry
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 8315ada7f095bfa2cae0cd1e915b95bf6226897d
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This allows callers of migrate_vma() to allocate a new page for an empty
+CPU page table entry (pte_none, or one backed by the zero page). This is
+only done for anonymous memory, and no new page will be instantiated if
+userfaultfd is armed.
+
+This is useful for device drivers that want to migrate a range of
+virtual addresses and would rather allocate new memory up front than
+fault later on.
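+
+As a sketch only (not part of this patch), an alloc_and_copy() callback
+could take advantage of this: a src entry that has MIGRATE_PFN_MIGRATE
+set but no page behind it (migrate_pfn_to_page() returns NULL) is such
+an empty CPU entry, and the driver may supply a freshly allocated,
+zeroed destination page for it. The hypothetical callback below uses
+plain system pages; a device driver would typically allocate from its
+own memory pool instead:
+
+	static void foo_alloc_and_copy(struct vm_area_struct *vma,
+				       const unsigned long *src,
+				       unsigned long *dst,
+				       unsigned long start,
+				       unsigned long end,
+				       void *private)
+	{
+		unsigned long addr, i;
+
+		for (i = 0, addr = start; addr < end; addr += PAGE_SIZE, i++) {
+			struct page *spage = migrate_pfn_to_page(src[i]);
+			struct page *dpage;
+
+			if (!(src[i] & MIGRATE_PFN_MIGRATE))
+				continue;
+
+			dpage = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO,
+					       vma, addr);
+			if (!dpage)
+				continue;
+			/* empty CPU entry: dpage stays zeroed */
+			if (spage)
+				copy_highpage(dpage, spage);
+
+			lock_page(dpage);
+			dst[i] = migrate_pfn(page_to_pfn(dpage)) |
+				 MIGRATE_PFN_LOCKED;
+		}
+	}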
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-18-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/migrate.h | 9 ++
+ mm/migrate.c | 205 +++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 205 insertions(+), 9 deletions(-)
+
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -207,6 +207,15 @@ static inline unsigned long migrate_pfn(
+ * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
+ * unrecoverable state.
+ *
++ * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we
++ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
++ * allowing device driver to allocate device memory for those unback virtual
++ * address. For this the device driver simply have to allocate device memory
++ * and properly set the destination entry like for regular migration. Note that
++ * this can still fails and thus inside the device driver must check if the
++ * migration was successful for those entry inside the finalize_and_map()
++ * callback just like for regular migration.
++ *
+ * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
+ * OR BAD THINGS WILL HAPPEN !
+ *
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -37,6 +37,7 @@
+ #include <linux/hugetlb_cgroup.h>
+ #include <linux/gfp.h>
+ #include <linux/memremap.h>
++#include <linux/userfaultfd_k.h>
+ #include <linux/balloon_compaction.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/page_idle.h>
+@@ -2100,6 +2101,22 @@ static int migrate_vma_collect_hole(unsi
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
++ migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
++ migrate->dst[migrate->npages] = 0;
++ migrate->cpages++;
++ }
++
++ return 0;
++}
++
++static int migrate_vma_collect_skip(unsigned long start,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct migrate_vma *migrate = walk->private;
++ unsigned long addr;
++
++ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+@@ -2137,7 +2154,7 @@ again:
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+- return migrate_vma_collect_hole(start, end,
++ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+@@ -2145,19 +2162,22 @@ again:
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+- return migrate_vma_collect_hole(start, end,
++ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+- if (ret || pmd_none(*pmdp))
++ if (ret)
++ return migrate_vma_collect_skip(start, end,
++ walk);
++ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+- return migrate_vma_collect_hole(start, end, walk);
++ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+@@ -2172,7 +2192,9 @@ again:
+ pfn = pte_pfn(pte);
+
+ if (pte_none(pte)) {
+- mpfn = pfn = 0;
++ mpfn = MIGRATE_PFN_MIGRATE;
++ migrate->cpages++;
++ pfn = 0;
+ goto next;
+ }
+
+@@ -2194,6 +2216,12 @@ again:
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
++ if (is_zero_pfn(pfn)) {
++ mpfn = MIGRATE_PFN_MIGRATE;
++ migrate->cpages++;
++ pfn = 0;
++ goto next;
++ }
+ page = vm_normal_page(migrate->vma, addr, pte);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+@@ -2513,6 +2541,135 @@ restore:
+ }
+ }
+
++static void migrate_vma_insert_page(struct migrate_vma *migrate,
++ unsigned long addr,
++ struct page *page,
++ unsigned long *src,
++ unsigned long *dst)
++{
++ struct vm_area_struct *vma = migrate->vma;
++ struct mm_struct *mm = vma->vm_mm;
++ struct mem_cgroup *memcg;
++ bool flush = false;
++ spinlock_t *ptl;
++ pte_t entry;
++ pgd_t *pgdp;
++ p4d_t *p4dp;
++ pud_t *pudp;
++ pmd_t *pmdp;
++ pte_t *ptep;
++
++ /* Only allow populating anonymous memory */
++ if (!vma_is_anonymous(vma))
++ goto abort;
++
++ pgdp = pgd_offset(mm, addr);
++ p4dp = p4d_alloc(mm, pgdp, addr);
++ if (!p4dp)
++ goto abort;
++ pudp = pud_alloc(mm, p4dp, addr);
++ if (!pudp)
++ goto abort;
++ pmdp = pmd_alloc(mm, pudp, addr);
++ if (!pmdp)
++ goto abort;
++
++ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
++ goto abort;
++
++ /*
++ * Use pte_alloc() instead of pte_alloc_map(). We can't run
++ * pte_offset_map() on pmds where a huge pmd might be created
++ * from a different thread.
++ *
++ * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
++ * parallel threads are excluded by other means.
++ *
++ * Here we only have down_read(mmap_sem).
++ */
++ if (pte_alloc(mm, pmdp, addr))
++ goto abort;
++
++ /* See the comment in pte_alloc_one_map() */
++ if (unlikely(pmd_trans_unstable(pmdp)))
++ goto abort;
++
++ if (unlikely(anon_vma_prepare(vma)))
++ goto abort;
++ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
++ goto abort;
++
++ /*
++ * The memory barrier inside __SetPageUptodate makes sure that
++ * preceding stores to the page contents become visible before
++ * the set_pte_at() write.
++ */
++ __SetPageUptodate(page);
++
++ if (is_zone_device_page(page) && is_device_private_page(page)) {
++ swp_entry_t swp_entry;
++
++ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
++ entry = swp_entry_to_pte(swp_entry);
++ } else {
++ entry = mk_pte(page, vma->vm_page_prot);
++ if (vma->vm_flags & VM_WRITE)
++ entry = pte_mkwrite(pte_mkdirty(entry));
++ }
++
++ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
++
++ if (pte_present(*ptep)) {
++ unsigned long pfn = pte_pfn(*ptep);
++
++ if (!is_zero_pfn(pfn)) {
++ pte_unmap_unlock(ptep, ptl);
++ mem_cgroup_cancel_charge(page, memcg, false);
++ goto abort;
++ }
++ flush = true;
++ } else if (!pte_none(*ptep)) {
++ pte_unmap_unlock(ptep, ptl);
++ mem_cgroup_cancel_charge(page, memcg, false);
++ goto abort;
++ }
++
++ /*
++ * Check for usefaultfd but do not deliver the fault. Instead,
++ * just back off.
++ */
++ if (userfaultfd_missing(vma)) {
++ pte_unmap_unlock(ptep, ptl);
++ mem_cgroup_cancel_charge(page, memcg, false);
++ goto abort;
++ }
++
++ inc_mm_counter(mm, MM_ANONPAGES);
++ page_add_new_anon_rmap(page, vma, addr, false);
++ mem_cgroup_commit_charge(page, memcg, false, false);
++ if (!is_zone_device_page(page))
++ lru_cache_add_active_or_unevictable(page, vma);
++ get_page(page);
++
++ if (flush) {
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush_notify(vma, addr, ptep);
++ set_pte_at_notify(mm, addr, ptep, entry);
++ update_mmu_cache(vma, addr, ptep);
++ } else {
++ /* No need to invalidate - it was non-present before */
++ set_pte_at(mm, addr, ptep, entry);
++ update_mmu_cache(vma, addr, ptep);
++ }
++
++ pte_unmap_unlock(ptep, ptl);
++ *src = MIGRATE_PFN_MIGRATE;
++ return;
++
++abort:
++ *src &= ~MIGRATE_PFN_MIGRATE;
++}
++
+ /*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+@@ -2525,7 +2682,10 @@ static void migrate_vma_pages(struct mig
+ {
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+- unsigned long addr, i;
++ struct vm_area_struct *vma = migrate->vma;
++ struct mm_struct *mm = vma->vm_mm;
++ unsigned long addr, i, mmu_start;
++ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+@@ -2533,10 +2693,27 @@ static void migrate_vma_pages(struct mig
+ struct address_space *mapping;
+ int r;
+
+- if (!page || !newpage)
++ if (!newpage) {
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
++ }
++
++ if (!page) {
++ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
++ continue;
++ }
++ if (!notified) {
++ mmu_start = addr;
++ notified = true;
++ mmu_notifier_invalidate_range_start(mm,
++ mmu_start,
++ migrate->end);
++ }
++ migrate_vma_insert_page(migrate, addr, newpage,
++ &migrate->src[i],
++ &migrate->dst[i]);
+ continue;
++ }
+
+ mapping = page_mapping(page);
+
+@@ -2564,6 +2741,10 @@ static void migrate_vma_pages(struct mig
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
++
++ if (notified)
++ mmu_notifier_invalidate_range_end(mm, mmu_start,
++ migrate->end);
+ }
+
+ /*
+@@ -2586,8 +2767,14 @@ static void migrate_vma_finalize(struct
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+- if (!page)
++ if (!page) {
++ if (newpage) {
++ unlock_page(newpage);
++ put_page(newpage);
++ }
+ continue;
++ }
++
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
diff --git a/patches.suse/mm-migrate-fix-indexing-bug-off-by-one-and-avoid-out-of-bound-access.patch b/patches.suse/mm-migrate-fix-indexing-bug-off-by-one-and-avoid-out-of-bound-access.patch
new file mode 100644
index 0000000000..3095f518b7
--- /dev/null
+++ b/patches.suse/mm-migrate-fix-indexing-bug-off-by-one-and-avoid-out-of-bound-access.patch
@@ -0,0 +1,40 @@
+From: Mark Hairgrove <mhairgrove@nvidia.com>
+Date: Fri, 13 Oct 2017 15:57:30 -0700
+Subject: mm/migrate: fix indexing bug (off by one) and avoid out of bound
+ access
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: e20d103b6c37038ca27409f746f0b3351bcd0c44
+Patch-mainline: v4.14-rc5
+References: VM Functionality, FATE#323338, bsc#1047814
+
+The index was incremented before its last use, so the second array
+access could dereference an invalid address (and it also failed to clear
+the entry we intended to clear): migrate->src[migrate->npages++] bumps
+npages, so the following migrate->dst[migrate->npages] store lands one
+element past the intended entry and, on the last iteration, out of
+bounds.
+
+Link: http://lkml.kernel.org/r/1506973525-16491-1-git-send-email-jglisse@redhat.com
+Fixes: 8315ada7f095bf ("mm/migrate: allow migrate_vma() to alloc new page on empty entry")
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ mm/migrate.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -2105,8 +2105,9 @@ static int migrate_vma_collect_hole(unsi
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+- migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
++ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
++ migrate->npages++;
+ migrate->cpages++;
+ }
+
diff --git a/patches.suse/mm-migrate-migrate_vma-unmap-page-from-vma-while-collecting-pages.patch b/patches.suse/mm-migrate-migrate_vma-unmap-page-from-vma-while-collecting-pages.patch
new file mode 100644
index 0000000000..6da8d6145d
--- /dev/null
+++ b/patches.suse/mm-migrate-migrate_vma-unmap-page-from-vma-while-collecting-pages.patch
@@ -0,0 +1,261 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:13 -0700
+Subject: mm/migrate: migrate_vma() unmap page from vma while collecting pages
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 8c3328f1f36a5efe817ad4e06497af601936a460
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+The common case when migrating a virtual address range is that each page
+is mapped only once, inside the vma in which the migration is taking
+place. Because we already walk the CPU page table for that range, we can
+do the unmap directly there and set up the special migration swap entry,
+as sketched below.
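+
+The core of that change, distilled from the hunk below (sketch only):
+
+	if (trylock_page(page)) {
+		mpfn |= MIGRATE_PFN_LOCKED;
+		ptep_get_and_clear(mm, addr, ptep);
+
+		/* replace the pte with a special migration entry */
+		entry = make_migration_entry(page, pte_write(pte));
+		set_pte_at(mm, addr, ptep, swp_entry_to_pte(entry));
+
+		/* like a regular unmap: drop rmap and the pte's refcount */
+		page_remove_rmap(page, false);
+		put_page(page);
+	}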
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-16-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ mm/migrate.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 112 insertions(+), 29 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -2108,7 +2108,7 @@ static int migrate_vma_collect_pmd(pmd_t
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+- unsigned long addr = start;
++ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+@@ -2153,9 +2153,12 @@ again:
+ return migrate_vma_collect_hole(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
++ arch_enter_lazy_mmu_mode();
++
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn, pfn;
+ struct page *page;
++ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+@@ -2187,11 +2190,44 @@ again:
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+
++ /*
++ * Optimize for the common case where page is only mapped once
++ * in one process. If we can lock the page, then we can safely
++ * set up a special migration page table entry now.
++ */
++ if (trylock_page(page)) {
++ pte_t swp_pte;
++
++ mpfn |= MIGRATE_PFN_LOCKED;
++ ptep_get_and_clear(mm, addr, ptep);
++
++ /* Setup special migration page table entry */
++ entry = make_migration_entry(page, pte_write(pte));
++ swp_pte = swp_entry_to_pte(entry);
++ if (pte_soft_dirty(pte))
++ swp_pte = pte_swp_mksoft_dirty(swp_pte);
++ set_pte_at(mm, addr, ptep, swp_pte);
++
++ /*
++ * This is like regular unmap: we remove the rmap and
++ * drop page refcount. Page won't be freed, as we took
++ * a reference just above.
++ */
++ page_remove_rmap(page, false);
++ put_page(page);
++ unmapped++;
++ }
++
+ next:
+ migrate->src[migrate->npages++] = mpfn;
+ }
++ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
++ /* Only flush the TLB if we actually modified any entries */
++ if (unmapped)
++ flush_tlb_range(walk->vma, start, end);
++
+ return 0;
+ }
+
+@@ -2216,7 +2252,13 @@ static void migrate_vma_collect(struct m
+ mm_walk.mm = migrate->vma->vm_mm;
+ mm_walk.private = migrate;
+
++ mmu_notifier_invalidate_range_start(mm_walk.mm,
++ migrate->start,
++ migrate->end);
+ walk_page_range(migrate->start, migrate->end, &mm_walk);
++ mmu_notifier_invalidate_range_end(mm_walk.mm,
++ migrate->start,
++ migrate->end);
+
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+ }
+@@ -2264,32 +2306,37 @@ static bool migrate_vma_check_page(struc
+ static void migrate_vma_prepare(struct migrate_vma *migrate)
+ {
+ const unsigned long npages = migrate->npages;
++ const unsigned long start = migrate->start;
++ unsigned long addr, i, restore = 0;
+ bool allow_drain = true;
+- unsigned long i;
+
+ lru_add_drain();
+
+ for (i = 0; (i < npages) && migrate->cpages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++ bool remap = true;
+
+ if (!page)
+ continue;
+
+- /*
+- * Because we are migrating several pages there can be
+- * a deadlock between 2 concurrent migration where each
+- * are waiting on each other page lock.
+- *
+- * Make migrate_vma() a best effort thing and backoff
+- * for any page we can not lock right away.
+- */
+- if (!trylock_page(page)) {
+- migrate->src[i] = 0;
+- migrate->cpages--;
+- put_page(page);
+- continue;
++ if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
++ /*
++ * Because we are migrating several pages there can be
++ * a deadlock between 2 concurrent migration where each
++ * are waiting on each other page lock.
++ *
++ * Make migrate_vma() a best effort thing and backoff
++ * for any page we can not lock right away.
++ */
++ if (!trylock_page(page)) {
++ migrate->src[i] = 0;
++ migrate->cpages--;
++ put_page(page);
++ continue;
++ }
++ remap = false;
++ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+- migrate->src[i] |= MIGRATE_PFN_LOCKED;
+
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+@@ -2298,21 +2345,50 @@ static void migrate_vma_prepare(struct m
+ }
+
+ if (isolate_lru_page(page)) {
+- migrate->src[i] = 0;
+- unlock_page(page);
+- migrate->cpages--;
+- put_page(page);
++ if (remap) {
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ migrate->cpages--;
++ restore++;
++ } else {
++ migrate->src[i] = 0;
++ unlock_page(page);
++ migrate->cpages--;
++ put_page(page);
++ }
+ continue;
+ }
+
+ if (!migrate_vma_check_page(page)) {
+- migrate->src[i] = 0;
+- unlock_page(page);
+- migrate->cpages--;
++ if (remap) {
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ migrate->cpages--;
++ restore++;
++
++ get_page(page);
++ putback_lru_page(page);
++ } else {
++ migrate->src[i] = 0;
++ unlock_page(page);
++ migrate->cpages--;
+
+- putback_lru_page(page);
++ putback_lru_page(page);
++ }
+ }
+ }
++
++ for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
++ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++
++ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
++ continue;
++
++ remove_migration_pte(page, migrate->vma, addr, page);
++
++ migrate->src[i] = 0;
++ unlock_page(page);
++ put_page(page);
++ restore--;
++ }
+ }
+
+ /*
+@@ -2339,12 +2415,19 @@ static void migrate_vma_unmap(struct mig
+ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+- try_to_unmap(page, flags);
+- if (page_mapped(page) || !migrate_vma_check_page(page)) {
+- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+- migrate->cpages--;
+- restore++;
++ if (page_mapped(page)) {
++ try_to_unmap(page, flags);
++ if (page_mapped(page))
++ goto restore;
+ }
++
++ if (migrate_vma_check_page(page))
++ continue;
++
++restore:
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ migrate->cpages--;
++ restore++;
+ }
+
+ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
diff --git a/patches.suse/mm-migrate-new-memory-migration-helper-for-use-with-device-memory.patch b/patches.suse/mm-migrate-new-memory-migration-helper-for-use-with-device-memory.patch
new file mode 100644
index 0000000000..a37e7e3857
--- /dev/null
+++ b/patches.suse/mm-migrate-new-memory-migration-helper-for-use-with-device-memory.patch
@@ -0,0 +1,671 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:09 -0700
+Subject: mm/migrate: new memory migration helper for use with device memory
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+This patch adds a new memory migration helper, which migrates the memory
+backing a range of virtual addresses of a process to different memory
+(which can be allocated through a special allocator). It differs from
+NUMA migration by working on a range of virtual addresses, and thus by
+doing migration in chunks that can be large enough to use a DMA engine
+or a special copy-offloading engine.
+
+Expected users are anyone with heterogeneous memory where different
+memories have different characteristics (latency, bandwidth, ...). As an
+example, IBM platforms with a CAPI bus can make use of this feature to
+migrate between regular memory and CAPI device memory. New CPU
+architectures with a pool of high-performance memory that is not managed
+as a cache but presented as regular memory (while being faster and lower
+latency than DDR) will also be prime users of this patch.
+
+Migration to private device memory will be useful for devices that have
+a large pool of such memory, like GPUs; NVIDIA plans to use HMM for
+that.
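+
+A minimal usage sketch from a device driver's point of view (the foo_*
+names are hypothetical, not part of this patch); the caller holds
+mmap_sem for read, and start/end are page aligned and within vma:
+
+	static const struct migrate_vma_ops foo_migrate_ops = {
+		.alloc_and_copy		= foo_alloc_and_copy,
+		.finalize_and_map	= foo_finalize_and_map,
+	};
+
+	/* src/dst must each hold (end - start) >> PAGE_SHIFT entries */
+	ret = migrate_vma(&foo_migrate_ops, vma, start, end,
+			  src_pfns, dst_pfns, foo_private);
+	if (ret)
+		/* only invalid arguments return an error */
+		return ret;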
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-15-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
+Signed-off-by: John Hubbard <jhubbard@nvidia.com>
+Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
+Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
+Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/migrate.h | 104 ++++++++++
+ mm/migrate.c | 492 ++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 596 insertions(+)
+
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -145,4 +145,108 @@ static inline int migrate_misplaced_tran
+ }
+ #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
+
++
++#ifdef CONFIG_MIGRATION
++
++#define MIGRATE_PFN_VALID (1UL << 0)
++#define MIGRATE_PFN_MIGRATE (1UL << 1)
++#define MIGRATE_PFN_LOCKED (1UL << 2)
++#define MIGRATE_PFN_WRITE (1UL << 3)
++#define MIGRATE_PFN_ERROR (1UL << 4)
++#define MIGRATE_PFN_SHIFT 5
++
++static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
++{
++ if (!(mpfn & MIGRATE_PFN_VALID))
++ return NULL;
++ return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT);
++}
++
++static inline unsigned long migrate_pfn(unsigned long pfn)
++{
++ return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
++}
++
++/*
++ * struct migrate_vma_ops - migrate operation callback
++ *
++ * @alloc_and_copy: alloc destination memory and copy source memory to it
++ * @finalize_and_map: allow caller to map the successfully migrated pages
++ *
++ *
++ * The alloc_and_copy() callback happens once all source pages have been locked,
++ * unmapped and checked (checked whether pinned or not). All pages that can be
++ * migrated will have an entry in the src array set with the pfn value of the
++ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other
++ * flags might be set but should be ignored by the callback).
++ *
++ * The alloc_and_copy() callback can then allocate destination memory and copy
++ * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and
++ * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the
++ * callback must update each corresponding entry in the dst array with the pfn
++ * value of the destination page and with the MIGRATE_PFN_VALID and
++ * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
++ * locked, via lock_page()).
++ *
++ * At this point the alloc_and_copy() callback is done and returns.
++ *
++ * Note that the callback does not have to migrate all the pages that are
++ * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration
++ * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also
++ * set in the src array entry). If the device driver cannot migrate a device
++ * page back to system memory, then it must set the corresponding dst array
++ * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to
++ * access any of the virtual addresses originally backed by this page. Because
++ * a SIGBUS is such a severe result for the userspace process, the device
++ * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
++ * unrecoverable state.
++ *
++ * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
++ * OR BAD THINGS WILL HAPPEN !
++ *
++ *
++ * The finalize_and_map() callback happens after struct page migration from
++ * source to destination (destination struct pages are the struct pages for the
++ * memory allocated by the alloc_and_copy() callback). Migration can fail, and
++ * thus the finalize_and_map() allows the driver to inspect which pages were
++ * successfully migrated, and which were not. Successfully migrated pages will
++ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
++ *
++ * It is safe to update device page table from within the finalize_and_map()
++ * callback because both destination and source page are still locked, and the
++ * mmap_sem is held in read mode (hence no one can unmap the range being
++ * migrated).
++ *
++ * Once callback is done cleaning up things and updating its page table (if it
++ * chose to do so, this is not an obligation) then it returns. At this point,
++ * the HMM core will finish up the final steps, and the migration is complete.
++ *
++ * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
++ * ENTRIES OR BAD THINGS WILL HAPPEN !
++ */
++struct migrate_vma_ops {
++ void (*alloc_and_copy)(struct vm_area_struct *vma,
++ const unsigned long *src,
++ unsigned long *dst,
++ unsigned long start,
++ unsigned long end,
++ void *private);
++ void (*finalize_and_map)(struct vm_area_struct *vma,
++ const unsigned long *src,
++ const unsigned long *dst,
++ unsigned long start,
++ unsigned long end,
++ void *private);
++};
++
++int migrate_vma(const struct migrate_vma_ops *ops,
++ struct vm_area_struct *vma,
++ unsigned long start,
++ unsigned long end,
++ unsigned long *src,
++ unsigned long *dst,
++ void *private);
++
++#endif /* CONFIG_MIGRATION */
++
+ #endif /* _LINUX_MIGRATE_H */
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -397,6 +397,14 @@ int migrate_page_move_mapping(struct add
+ int expected_count = 1 + extra_count;
+ void **pslot;
+
++ /*
++ * ZONE_DEVICE pages have 1 refcount always held by their device
++ *
++ * Note that DAX memory will never reach that point as it does not have
++ * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
++ */
++ expected_count += is_zone_device_page(page);
++
+ if (!mapping) {
+ /* Anonymous page without mapping */
+ if (page_count(page) != expected_count)
+@@ -2065,3 +2073,487 @@ out_unlock:
+ #endif /* CONFIG_NUMA_BALANCING */
+
+ #endif /* CONFIG_NUMA */
++
++
++struct migrate_vma {
++ struct vm_area_struct *vma;
++ unsigned long *dst;
++ unsigned long *src;
++ unsigned long cpages;
++ unsigned long npages;
++ unsigned long start;
++ unsigned long end;
++};
++
++static int migrate_vma_collect_hole(unsigned long start,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct migrate_vma *migrate = walk->private;
++ unsigned long addr;
++
++ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
++ migrate->dst[migrate->npages] = 0;
++ migrate->src[migrate->npages++] = 0;
++ }
++
++ return 0;
++}
++
++static int migrate_vma_collect_pmd(pmd_t *pmdp,
++ unsigned long start,
++ unsigned long end,
++ struct mm_walk *walk)
++{
++ struct migrate_vma *migrate = walk->private;
++ struct vm_area_struct *vma = walk->vma;
++ struct mm_struct *mm = vma->vm_mm;
++ unsigned long addr = start;
++ spinlock_t *ptl;
++ pte_t *ptep;
++
++again:
++ if (pmd_none(*pmdp))
++ return migrate_vma_collect_hole(start, end, walk);
++
++ if (pmd_trans_huge(*pmdp)) {
++ struct page *page;
++
++ ptl = pmd_lock(mm, pmdp);
++ if (unlikely(!pmd_trans_huge(*pmdp))) {
++ spin_unlock(ptl);
++ goto again;
++ }
++
++ page = pmd_page(*pmdp);
++ if (is_huge_zero_page(page)) {
++ spin_unlock(ptl);
++ split_huge_pmd(vma, pmdp, addr);
++ if (pmd_trans_unstable(pmdp))
++ return migrate_vma_collect_hole(start, end,
++ walk);
++ } else {
++ int ret;
++
++ get_page(page);
++ spin_unlock(ptl);
++ if (unlikely(!trylock_page(page)))
++ return migrate_vma_collect_hole(start, end,
++ walk);
++ ret = split_huge_page(page);
++ unlock_page(page);
++ put_page(page);
++ if (ret || pmd_none(*pmdp))
++ return migrate_vma_collect_hole(start, end,
++ walk);
++ }
++ }
++
++ if (unlikely(pmd_bad(*pmdp)))
++ return migrate_vma_collect_hole(start, end, walk);
++
++ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
++ for (; addr < end; addr += PAGE_SIZE, ptep++) {
++ unsigned long mpfn, pfn;
++ struct page *page;
++ pte_t pte;
++
++ pte = *ptep;
++ pfn = pte_pfn(pte);
++
++ if (!pte_present(pte)) {
++ mpfn = pfn = 0;
++ goto next;
++ }
++
++ /* FIXME support THP */
++ page = vm_normal_page(migrate->vma, addr, pte);
++ if (!page || !page->mapping || PageTransCompound(page)) {
++ mpfn = pfn = 0;
++ goto next;
++ }
++
++ /*
++ * By getting a reference on the page we pin it and that blocks
++ * any kind of migration. Side effect is that it "freezes" the
++ * pte.
++ *
++ * We drop this reference after isolating the page from the lru
++ * for non device page (device page are not on the lru and thus
++ * can't be dropped from it).
++ */
++ get_page(page);
++ migrate->cpages++;
++ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
++ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
++
++next:
++ migrate->src[migrate->npages++] = mpfn;
++ }
++ pte_unmap_unlock(ptep - 1, ptl);
++
++ return 0;
++}
++
++/*
++ * migrate_vma_collect() - collect pages over a range of virtual addresses
++ * @migrate: migrate struct containing all migration information
++ *
++ * This will walk the CPU page table. For each virtual address backed by a
++ * valid page, it updates the src array and takes a reference on the page, in
++ * order to pin the page until we lock it and unmap it.
++ */
++static void migrate_vma_collect(struct migrate_vma *migrate)
++{
++ struct mm_walk mm_walk;
++
++ mm_walk.pmd_entry = migrate_vma_collect_pmd;
++ mm_walk.pte_entry = NULL;
++ mm_walk.pte_hole = migrate_vma_collect_hole;
++ mm_walk.hugetlb_entry = NULL;
++ mm_walk.test_walk = NULL;
++ mm_walk.vma = migrate->vma;
++ mm_walk.mm = migrate->vma->vm_mm;
++ mm_walk.private = migrate;
++
++ walk_page_range(migrate->start, migrate->end, &mm_walk);
++
++ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
++}
++
++/*
++ * migrate_vma_check_page() - check if page is pinned or not
++ * @page: struct page to check
++ *
++ * Pinned pages cannot be migrated. This is the same test as in
++ * migrate_page_move_mapping(), except that here we allow migration of a
++ * ZONE_DEVICE page.
++ */
++static bool migrate_vma_check_page(struct page *page)
++{
++ /*
++ * One extra ref because caller holds an extra reference, either from
++ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
++ * a device page.
++ */
++ int extra = 1;
++
++ /*
++ * FIXME support THP (transparent huge page), it is bit more complex to
++ * check them than regular pages, because they can be mapped with a pmd
++ * or with a pte (split pte mapping).
++ */
++ if (PageCompound(page))
++ return false;
++
++ if ((page_count(page) - extra) > page_mapcount(page))
++ return false;
++
++ return true;
++}
++
++/*
++ * migrate_vma_prepare() - lock pages and isolate them from the lru
++ * @migrate: migrate struct containing all migration information
++ *
++ * This locks pages that have been collected by migrate_vma_collect(). Once each
++ * page is locked it is isolated from the lru (for non-device pages). Finally,
++ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
++ * migrated by concurrent kernel threads.
++ */
++static void migrate_vma_prepare(struct migrate_vma *migrate)
++{
++ const unsigned long npages = migrate->npages;
++ bool allow_drain = true;
++ unsigned long i;
++
++ lru_add_drain();
++
++ for (i = 0; (i < npages) && migrate->cpages; i++) {
++ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++
++ if (!page)
++ continue;
++
++ /*
++ * Because we are migrating several pages there can be
++ * a deadlock between 2 concurrent migration where each
++ * are waiting on each other page lock.
++ *
++ * Make migrate_vma() a best effort thing and backoff
++ * for any page we can not lock right away.
++ */
++ if (!trylock_page(page)) {
++ migrate->src[i] = 0;
++ migrate->cpages--;
++ put_page(page);
++ continue;
++ }
++ migrate->src[i] |= MIGRATE_PFN_LOCKED;
++
++ if (!PageLRU(page) && allow_drain) {
++ /* Drain CPU's pagevec */
++ lru_add_drain_all();
++ allow_drain = false;
++ }
++
++ if (isolate_lru_page(page)) {
++ migrate->src[i] = 0;
++ unlock_page(page);
++ migrate->cpages--;
++ put_page(page);
++ continue;
++ }
++
++ if (!migrate_vma_check_page(page)) {
++ migrate->src[i] = 0;
++ unlock_page(page);
++ migrate->cpages--;
++
++ putback_lru_page(page);
++ }
++ }
++}
++
++/*
++ * migrate_vma_unmap() - replace page mapping with special migration pte entry
++ * @migrate: migrate struct containing all migration information
++ *
++ * Replace page mapping (CPU page table pte) with a special migration pte entry
++ * and check again if it has been pinned. Pinned pages are restored because we
++ * cannot migrate them.
++ *
++ * This is the last step before we call the device driver callback to allocate
++ * destination memory and copy contents of original page over to new page.
++ */
++static void migrate_vma_unmap(struct migrate_vma *migrate)
++{
++ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
++ const unsigned long npages = migrate->npages;
++ const unsigned long start = migrate->start;
++ unsigned long addr, i, restore = 0;
++
++ for (i = 0; i < npages; i++) {
++ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++
++ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
++ continue;
++
++ try_to_unmap(page, flags);
++ if (page_mapped(page) || !migrate_vma_check_page(page)) {
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ migrate->cpages--;
++ restore++;
++ }
++ }
++
++ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
++ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++
++ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
++ continue;
++
++ remove_migration_ptes(page, page, false);
++
++ migrate->src[i] = 0;
++ unlock_page(page);
++ restore--;
++
++ putback_lru_page(page);
++ }
++}
++
++/*
++ * migrate_vma_pages() - migrate meta-data from src page to dst page
++ * @migrate: migrate struct containing all migration information
++ *
++ * This migrates struct page meta-data from source struct page to destination
++ * struct page. This effectively finishes the migration from source page to the
++ * destination page.
++ */
++static void migrate_vma_pages(struct migrate_vma *migrate)
++{
++ const unsigned long npages = migrate->npages;
++ const unsigned long start = migrate->start;
++ unsigned long addr, i;
++
++ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
++ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
++ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++ struct address_space *mapping;
++ int r;
++
++ if (!page || !newpage)
++ continue;
++ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
++ continue;
++
++ mapping = page_mapping(page);
++
++ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
++ if (r != MIGRATEPAGE_SUCCESS)
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ }
++}
++
++/*
++ * migrate_vma_finalize() - restore CPU page table entry
++ * @migrate: migrate struct containing all migration information
++ *
++ * This replaces the special migration pte entry with either a mapping to the
++ * new page if migration was successful for that page, or to the original page
++ * otherwise.
++ *
++ * This also unlocks the pages and puts them back on the lru, or drops the extra
++ * refcount, for device pages.
++ */
++static void migrate_vma_finalize(struct migrate_vma *migrate)
++{
++ const unsigned long npages = migrate->npages;
++ unsigned long i;
++
++ for (i = 0; i < npages; i++) {
++ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
++ struct page *page = migrate_pfn_to_page(migrate->src[i]);
++
++ if (!page)
++ continue;
++ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
++ if (newpage) {
++ unlock_page(newpage);
++ put_page(newpage);
++ }
++ newpage = page;
++ }
++
++ remove_migration_ptes(page, newpage, false);
++ unlock_page(page);
++ migrate->cpages--;
++
++ putback_lru_page(page);
++
++ if (newpage != page) {
++ unlock_page(newpage);
++ putback_lru_page(newpage);
++ }
++ }
++}
++
++/*
++ * migrate_vma() - migrate a range of memory inside vma
++ *
++ * @ops: migration callback for allocating destination memory and copying
++ * @vma: virtual memory area containing the range to be migrated
++ * @start: start address of the range to migrate (inclusive)
++ * @end: end address of the range to migrate (exclusive)
++ * @src: array of hmm_pfn_t containing source pfns
++ * @dst: array of hmm_pfn_t containing destination pfns
++ * @private: pointer passed back to each of the callback
++ * Returns: 0 on success, error code otherwise
++ *
++ * This function tries to migrate a range of memory virtual address range, using
++ * callbacks to allocate and copy memory from source to destination. First it
++ * collects all the pages backing each virtual address in the range, saving this
++ * inside the src array. Then it locks those pages and unmaps them. Once the pages
++ * are locked and unmapped, it checks whether each page is pinned or not. Pages
++ * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
++ * in the corresponding src array entry. It then restores any pages that are
++ * pinned, by remapping and unlocking those pages.
++ *
++ * At this point it calls the alloc_and_copy() callback. For documentation on
++ * what is expected from that callback, see struct migrate_vma_ops comments in
++ * include/linux/migrate.h
++ *
++ * After the alloc_and_copy() callback, this function goes over each entry in
++ * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
++ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
++ * then the function tries to migrate struct page information from the source
++ * struct page to the destination struct page. If it fails to migrate the struct
++ * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
++ * array.
++ *
++ * At this point all successfully migrated pages have an entry in the src
++ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
++ * array entry with MIGRATE_PFN_VALID flag set.
++ *
++ * It then calls the finalize_and_map() callback. See comments for "struct
++ * migrate_vma_ops", in include/linux/migrate.h for details about
++ * finalize_and_map() behavior.
++ *
++ * After the finalize_and_map() callback, for successfully migrated pages, this
++ * function updates the CPU page table to point to new pages, otherwise it
++ * restores the CPU page table to point to the original source pages.
++ *
++ * Function returns 0 after the above steps, even if no pages were migrated
++ * (The function only returns an error if any of the arguments are invalid.)
++ *
++ * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
++ * unsigned long entries.
++ */
++int migrate_vma(const struct migrate_vma_ops *ops,
++ struct vm_area_struct *vma,
++ unsigned long start,
++ unsigned long end,
++ unsigned long *src,
++ unsigned long *dst,
++ void *private)
++{
++ struct migrate_vma migrate;
++
++ /* Sanity check the arguments */
++ start &= PAGE_MASK;
++ end &= PAGE_MASK;
++ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
++ return -EINVAL;
++ if (start < vma->vm_start || start >= vma->vm_end)
++ return -EINVAL;
++ if (end <= vma->vm_start || end > vma->vm_end)
++ return -EINVAL;
++ if (!ops || !src || !dst || start >= end)
++ return -EINVAL;
++
++ memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
++ migrate.src = src;
++ migrate.dst = dst;
++ migrate.start = start;
++ migrate.npages = 0;
++ migrate.cpages = 0;
++ migrate.end = end;
++ migrate.vma = vma;
++
++ /* Collect, and try to unmap source pages */
++ migrate_vma_collect(&migrate);
++ if (!migrate.cpages)
++ return 0;
++
++ /* Lock and isolate page */
++ migrate_vma_prepare(&migrate);
++ if (!migrate.cpages)
++ return 0;
++
++ /* Unmap pages */
++ migrate_vma_unmap(&migrate);
++ if (!migrate.cpages)
++ return 0;
++
++ /*
++ * At this point pages are locked and unmapped, and thus they have
++ * stable content and can safely be copied to destination memory that
++ * is allocated by the callback.
++ *
++ * Note that migration can fail in migrate_vma_struct_page() for each
++ * individual page.
++ */
++ ops->alloc_and_copy(vma, src, dst, start, end, private);
++
++ /* This does the real migration of struct page */
++ migrate_vma_pages(&migrate);
++
++ ops->finalize_and_map(vma, src, dst, start, end, private);
++
++ /* Unlock and remap pages */
++ migrate_vma_finalize(&migrate);
++
++ return 0;
++}
++EXPORT_SYMBOL(migrate_vma);
diff --git a/patches.suse/mm-migrate-new-migrate-mode-migrate_sync_no_copy.patch b/patches.suse/mm-migrate-new-migrate-mode-migrate_sync_no_copy.patch
new file mode 100644
index 0000000000..c68ff88c4e
--- /dev/null
+++ b/patches.suse/mm-migrate-new-migrate-mode-migrate_sync_no_copy.patch
@@ -0,0 +1,340 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:06 -0700
+Subject: mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 2916ecc0f9d435d849c98f4da50e453124c87531
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+Introduce a new migration mode that allows offloading the copy to a
+device DMA engine. This changes the workflow of migration, and not all
+address_space migratepage callbacks can support it.
+
+This is intended to be used by migrate_vma(), which itself is used for
+things like HMM (see include/linux/hmm.h).
+
+No additional per-filesystem migratepage testing is needed. I disabled
+MIGRATE_SYNC_NO_COPY in all problematic migratepage() callbacks and
+added a comment in each of them to explain why (part of this patch). Any
+callback that wishes to support this new mode needs to be aware that its
+migration flow differs from the other modes; a sketch of such a callback
+follows this description.
+
+Some of these callbacks take extra locking while copying (aio, zsmalloc,
+balloon, ...), and for DMA to be effective you want to copy multiple
+pages in one DMA operation. But in the problematic cases you cannot
+easily hold the extra lock across multiple calls to this callback.
+
+The usual flow is:
+
+For each page {
+ 1 - lock page
+ 2 - call migratepage() callback
+ 3 - (extra locking in some migratepage() callback)
+ 4 - migrate page state (freeze refcount, update page cache, buffer
+ head, ...)
+ 5 - copy page
+ 6 - (unlock any extra lock of migratepage() callback)
+ 7 - return from migratepage() callback
+ 8 - unlock page
+}
+
+The new mode MIGRATE_SYNC_NO_COPY:
+ 1 - lock multiple pages
+For each page {
+ 2 - call migratepage() callback
+ 3 - abort in all problematic migratepage() callback
+ 4 - migrate page state (freeze refcount, update page cache, buffer
+ head, ...)
+} // finished all calls to migratepage() callback
+ 5 - DMA copy multiple pages
+ 6 - unlock all the pages
+
+To support MIGRATE_SYNC_NO_COPY in the problematic cases we would need a
+new callback, migratepages() for instance, that deals with multiple
+pages in one transaction.
+
+Because the problematic cases are not important for current usage, I did
+not want to complicate this patchset even more for no good reason.
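+
+As a sketch, a migratepage() callback that has no such locking
+constraint can support the new mode by skipping only the CPU copy; this
+mirrors the pattern used in the hunks below (the callback name here is
+generic, not from this patch):
+
+	static int foo_migratepage(struct address_space *mapping,
+				   struct page *newpage, struct page *page,
+				   enum migrate_mode mode)
+	{
+		int rc;
+
+		rc = migrate_page_move_mapping(mapping, newpage, page,
+					       NULL, mode, 0);
+		if (rc != MIGRATEPAGE_SUCCESS)
+			return rc;
+
+		if (mode != MIGRATE_SYNC_NO_COPY)
+			migrate_page_copy(newpage, page);
+		else
+			/* copy page state only; data is copied by DMA later */
+			migrate_page_states(newpage, page);
+		return MIGRATEPAGE_SUCCESS;
+	}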
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-14-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ fs/aio.c | 8 ++++++
+ fs/f2fs/data.c | 5 +++-
+ fs/hugetlbfs/inode.c | 5 +++-
+ fs/ubifs/file.c | 5 +++-
+ include/linux/migrate.h | 5 ++++
+ include/linux/migrate_mode.h | 5 ++++
+ mm/balloon_compaction.c | 8 ++++++
+ mm/migrate.c | 52 +++++++++++++++++++++++++++++++++----------
+ mm/zsmalloc.c | 8 ++++++
+ 9 files changed, 86 insertions(+), 15 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -373,6 +373,14 @@ static int aio_migratepage(struct addres
+ pgoff_t idx;
+ int rc;
+
++ /*
++ * We cannot support the _NO_COPY case here, because copy needs to
++ * happen under the ctx->completion_lock. That does not work with the
++ * migration workflow of MIGRATE_SYNC_NO_COPY.
++ */
++ if (mode == MIGRATE_SYNC_NO_COPY)
++ return -EINVAL;
++
+ rc = 0;
+
+ /* mapping->private_lock here protects against the kioctx teardown. */
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -2186,7 +2186,10 @@ int f2fs_migrate_page(struct address_spa
+ SetPagePrivate(newpage);
+ set_page_private(newpage, page_private(page));
+
+- migrate_page_copy(newpage, page);
++ if (mode != MIGRATE_SYNC_NO_COPY)
++ migrate_page_copy(newpage, page);
++ else
++ migrate_page_states(newpage, page);
+
+ return MIGRATEPAGE_SUCCESS;
+ }
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -846,7 +846,10 @@ static int hugetlbfs_migrate_page(struct
+ rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+- migrate_page_copy(newpage, page);
++ if (mode != MIGRATE_SYNC_NO_COPY)
++ migrate_page_copy(newpage, page);
++ else
++ migrate_page_states(newpage, page);
+
+ return MIGRATEPAGE_SUCCESS;
+ }
+--- a/fs/ubifs/file.c
++++ b/fs/ubifs/file.c
+@@ -1482,7 +1482,10 @@ static int ubifs_migrate_page(struct add
+ SetPagePrivate(newpage);
+ }
+
+- migrate_page_copy(newpage, page);
++ if (mode != MIGRATE_SYNC_NO_COPY)
++ migrate_page_copy(newpage, page);
++ else
++ migrate_page_states(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+ }
+ #endif
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -61,6 +61,7 @@ extern void putback_movable_page(struct
+
+ extern int migrate_prep(void);
+ extern int migrate_prep_local(void);
++extern void migrate_page_states(struct page *newpage, struct page *page);
+ extern void migrate_page_copy(struct page *newpage, struct page *page);
+ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page);
+@@ -81,6 +82,10 @@ static inline int isolate_movable_page(s
+ static inline int migrate_prep(void) { return -ENOSYS; }
+ static inline int migrate_prep_local(void) { return -ENOSYS; }
+
++static inline void migrate_page_states(struct page *newpage, struct page *page)
++{
++}
++
+ static inline void migrate_page_copy(struct page *newpage,
+ struct page *page) {}
+
+--- a/include/linux/migrate_mode.h
++++ b/include/linux/migrate_mode.h
+@@ -6,11 +6,16 @@
+ * on most operations but not ->writepage as the potential stall time
+ * is too significant
+ * MIGRATE_SYNC will block when migrating pages
++ * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages
++ * with the CPU. Instead, page copy happens outside the migratepage()
++ * callback and is likely using a DMA engine. See migrate_vma() and HMM
++ * (mm/hmm.c) for users of this mode.
+ */
+ enum migrate_mode {
+ MIGRATE_ASYNC,
+ MIGRATE_SYNC_LIGHT,
+ MIGRATE_SYNC,
++ MIGRATE_SYNC_NO_COPY,
+ };
+
+ #endif /* MIGRATE_MODE_H_INCLUDED */
+--- a/mm/balloon_compaction.c
++++ b/mm/balloon_compaction.c
+@@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_
+ {
+ struct balloon_dev_info *balloon = balloon_page_device(page);
+
++ /*
++ * We can not easily support the no copy case here so ignore it as it
++ * is unlikely to be use with ballon pages. See include/linux/hmm.h for
++ * user of the MIGRATE_SYNC_NO_COPY mode.
++ */
++ if (mode == MIGRATE_SYNC_NO_COPY)
++ return -EINVAL;
++
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -603,15 +603,10 @@ static void copy_huge_page(struct page *
+ /*
+ * Copy the page to its new location
+ */
+-void migrate_page_copy(struct page *newpage, struct page *page)
++void migrate_page_states(struct page *newpage, struct page *page)
+ {
+ int cpupid;
+
+- if (PageHuge(page) || PageTransHuge(page))
+- copy_huge_page(newpage, page);
+- else
+- copy_highpage(newpage, page);
+-
+ if (PageError(page))
+ SetPageError(newpage);
+ if (PageReferenced(page))
+@@ -665,6 +660,17 @@ void migrate_page_copy(struct page *newp
+
+ mem_cgroup_migrate(page, newpage);
+ }
++EXPORT_SYMBOL(migrate_page_states);
++
++void migrate_page_copy(struct page *newpage, struct page *page)
++{
++ if (PageHuge(page) || PageTransHuge(page))
++ copy_huge_page(newpage, page);
++ else
++ copy_highpage(newpage, page);
++
++ migrate_page_states(newpage, page);
++}
+ EXPORT_SYMBOL(migrate_page_copy);
+
+ /************************************************************
+@@ -690,7 +696,10 @@ int migrate_page(struct address_space *m
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+- migrate_page_copy(newpage, page);
++ if (mode != MIGRATE_SYNC_NO_COPY)
++ migrate_page_copy(newpage, page);
++ else
++ migrate_page_states(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+ }
+ EXPORT_SYMBOL(migrate_page);
+@@ -740,12 +749,15 @@ int buffer_migrate_page(struct address_s
+
+ SetPagePrivate(newpage);
+
+- migrate_page_copy(newpage, page);
++ if (mode != MIGRATE_SYNC_NO_COPY)
++ migrate_page_copy(newpage, page);
++ else
++ migrate_page_states(newpage, page);
+
+ bh = head;
+ do {
+ unlock_buffer(bh);
+- put_bh(bh);
++ put_bh(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+@@ -804,8 +816,13 @@ static int fallback_migrate_page(struct
+ {
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+- if (mode != MIGRATE_SYNC)
++ switch (mode) {
++ case MIGRATE_SYNC:
++ case MIGRATE_SYNC_NO_COPY:
++ break;
++ default:
+ return -EBUSY;
++ }
+ return writeout(mapping, page);
+ }
+
+@@ -942,7 +959,11 @@ static int __unmap_and_move(struct page
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
+ */
+- if (mode != MIGRATE_SYNC) {
++ switch (mode) {
++ case MIGRATE_SYNC:
++ case MIGRATE_SYNC_NO_COPY:
++ break;
++ default:
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+@@ -1212,8 +1233,15 @@ static int unmap_and_move_huge_page(new_
+ return -ENOMEM;
+
+ if (!trylock_page(hpage)) {
+- if (!force || mode != MIGRATE_SYNC)
++ if (!force)
+ goto out;
++ switch (mode) {
++ case MIGRATE_SYNC:
++ case MIGRATE_SYNC_NO_COPY:
++ break;
++ default:
++ goto out;
++ }
+ lock_page(hpage);
+ }
+
+--- a/mm/zsmalloc.c
++++ b/mm/zsmalloc.c
+@@ -1983,6 +1983,14 @@ int zs_page_migrate(struct address_space
+ unsigned int obj_idx;
+ int ret = -EAGAIN;
+
++ /*
++ * We cannot support the _NO_COPY case here, because copy needs to
++ * happen under the zs lock, which does not work with
++ * MIGRATE_SYNC_NO_COPY workflow.
++ */
++ if (mode == MIGRATE_SYNC_NO_COPY)
++ return -EINVAL;
++
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
diff --git a/patches.suse/mm-migrate-support-un-addressable-zone_device-page-in-migration.patch b/patches.suse/mm-migrate-support-un-addressable-zone_device-page-in-migration.patch
new file mode 100644
index 0000000000..3df7183a12
--- /dev/null
+++ b/patches.suse/mm-migrate-support-un-addressable-zone_device-page-in-migration.patch
@@ -0,0 +1,396 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:12:17 -0700
+Subject: mm/migrate: support un-addressable ZONE_DEVICE page in migration
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: a5430dda8a3a1cdd532e37270e6f36436241b6e7
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+Allow unmapping and restoring the special swap entries of un-addressable
+ZONE_DEVICE memory, as sketched below.
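+
+Distilled from the remove_migration_pte() hunk below (sketch only): an
+un-addressable device page cannot be mapped with a present pte, so when
+the migration entry is restored a device-private swap entry is installed
+instead:
+
+	if (is_zone_device_page(new) && is_device_private_page(new)) {
+		entry = make_device_private_entry(new, pte_write(pte));
+		pte = swp_entry_to_pte(entry);
+	} else
+		flush_dcache_page(new);
+	set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);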
+
+Link: http://lkml.kernel.org/r/20170817000548.32038-17-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/migrate.h | 10 ++-
+ mm/migrate.c | 149 ++++++++++++++++++++++++++++++++++++++----------
+ mm/page_vma_mapped.c | 10 +++
+ mm/rmap.c | 26 ++++++++
+ 4 files changed, 165 insertions(+), 30 deletions(-)
+
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -148,12 +148,18 @@ static inline int migrate_misplaced_tran
+
+ #ifdef CONFIG_MIGRATION
+
++/*
++ * Watch out for 32-bit PAE architectures: an unsigned long might not have
++ * enough bits to store both the full physical address and the flags. So far
++ * we have enough room for all our flags.
++ */
+ #define MIGRATE_PFN_VALID (1UL << 0)
+ #define MIGRATE_PFN_MIGRATE (1UL << 1)
+ #define MIGRATE_PFN_LOCKED (1UL << 2)
+ #define MIGRATE_PFN_WRITE (1UL << 3)
+-#define MIGRATE_PFN_ERROR (1UL << 4)
+-#define MIGRATE_PFN_SHIFT 5
++#define MIGRATE_PFN_DEVICE (1UL << 4)
++#define MIGRATE_PFN_ERROR (1UL << 5)
++#define MIGRATE_PFN_SHIFT 6
+
+ static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
+ {
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -36,6 +36,7 @@
+ #include <linux/hugetlb.h>
+ #include <linux/hugetlb_cgroup.h>
+ #include <linux/gfp.h>
++#include <linux/memremap.h>
+ #include <linux/balloon_compaction.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/page_idle.h>
+@@ -228,13 +229,19 @@ static bool remove_migration_pte(struct
+ if (is_write_migration_entry(entry))
+ pte = maybe_mkwrite(pte, vma);
+
++ if (unlikely(is_zone_device_page(new)) &&
++ is_device_private_page(new)) {
++ entry = make_device_private_entry(new, pte_write(pte));
++ pte = swp_entry_to_pte(entry);
++ } else
++ flush_dcache_page(new);
++
+ #ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(new)) {
+ pte = pte_mkhuge(pte);
+ pte = arch_make_huge_pte(pte, vma, new, 0);
+ }
+ #endif
+- flush_dcache_page(new);
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+
+ if (PageHuge(new)) {
+@@ -2164,17 +2171,40 @@ again:
+ pte = *ptep;
+ pfn = pte_pfn(pte);
+
+- if (!pte_present(pte)) {
++ if (pte_none(pte)) {
+ mpfn = pfn = 0;
+ goto next;
+ }
+
++ if (!pte_present(pte)) {
++ mpfn = pfn = 0;
++
++ /*
++		 * Only care about the special page table entries of
++		 * unaddressable device pages. Other special swap entries
++		 * are not migratable, and we ignore regular swapped pages.
++ */
++ entry = pte_to_swp_entry(pte);
++ if (!is_device_private_entry(entry))
++ goto next;
++
++ page = device_private_entry_to_page(entry);
++ mpfn = migrate_pfn(page_to_pfn(page))|
++ MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
++ if (is_write_device_private_entry(entry))
++ mpfn |= MIGRATE_PFN_WRITE;
++ } else {
++ page = vm_normal_page(migrate->vma, addr, pte);
++ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
++ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
++ }
++
+ /* FIXME support THP */
+- page = vm_normal_page(migrate->vma, addr, pte);
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = pfn = 0;
+ goto next;
+ }
++ pfn = page_to_pfn(page);
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+@@ -2187,8 +2217,6 @@ again:
+ */
+ get_page(page);
+ migrate->cpages++;
+- mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+- mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+
+ /*
+ * Optimize for the common case where page is only mapped once
+@@ -2215,10 +2243,13 @@ again:
+ */
+ page_remove_rmap(page, false);
+ put_page(page);
+- unmapped++;
++
++ if (pte_present(pte))
++ unmapped++;
+ }
+
+ next:
++ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+@@ -2288,6 +2319,28 @@ static bool migrate_vma_check_page(struc
+ if (PageCompound(page))
+ return false;
+
++	/* Pages from ZONE_DEVICE have one extra reference */
++ if (is_zone_device_page(page)) {
++ /*
++		 * Private pages can never be pinned as they have no valid pte
++		 * and GUP will fail for them. Yet if there is a pending
++		 * migration, a thread might try to wait on the pte migration
++		 * entry and will bump the page reference count. Sadly there is
++		 * no way to differentiate a regular pin from a migration wait.
++		 * Hence, to avoid two racing threads (one stopping the
++		 * migration because the other is waiting on the pte migration
++		 * entry) entering an infinite loop, we always return true here.
++ *
++ * FIXME proper solution is to rework migration_entry_wait() so
++ * it does not need to take a reference on page.
++ */
++ if (is_device_private_page(page))
++ return true;
++
++ /* Other ZONE_DEVICE memory type are not supported */
++ return false;
++ }
++
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+@@ -2338,24 +2391,30 @@ static void migrate_vma_prepare(struct m
+ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+
+- if (!PageLRU(page) && allow_drain) {
+- /* Drain CPU's pagevec */
+- lru_add_drain_all();
+- allow_drain = false;
+- }
++ /* ZONE_DEVICE pages are not on LRU */
++ if (!is_zone_device_page(page)) {
++ if (!PageLRU(page) && allow_drain) {
++ /* Drain CPU's pagevec */
++ lru_add_drain_all();
++ allow_drain = false;
++ }
+
+- if (isolate_lru_page(page)) {
+- if (remap) {
+- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+- migrate->cpages--;
+- restore++;
+- } else {
+- migrate->src[i] = 0;
+- unlock_page(page);
+- migrate->cpages--;
+- put_page(page);
++ if (isolate_lru_page(page)) {
++ if (remap) {
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ migrate->cpages--;
++ restore++;
++ } else {
++ migrate->src[i] = 0;
++ unlock_page(page);
++ migrate->cpages--;
++ put_page(page);
++ }
++ continue;
+ }
+- continue;
++
++ /* Drop the reference we took in collect */
++ put_page(page);
+ }
+
+ if (!migrate_vma_check_page(page)) {
+@@ -2364,14 +2423,19 @@ static void migrate_vma_prepare(struct m
+ migrate->cpages--;
+ restore++;
+
+- get_page(page);
+- putback_lru_page(page);
++ if (!is_zone_device_page(page)) {
++ get_page(page);
++ putback_lru_page(page);
++ }
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+
+- putback_lru_page(page);
++ if (!is_zone_device_page(page))
++ putback_lru_page(page);
++ else
++ put_page(page);
+ }
+ }
+ }
+@@ -2442,7 +2506,10 @@ restore:
+ unlock_page(page);
+ restore--;
+
+- putback_lru_page(page);
++ if (is_zone_device_page(page))
++ put_page(page);
++ else
++ putback_lru_page(page);
+ }
+ }
+
+@@ -2473,6 +2540,26 @@ static void migrate_vma_pages(struct mig
+
+ mapping = page_mapping(page);
+
++ if (is_zone_device_page(newpage)) {
++ if (is_device_private_page(newpage)) {
++ /*
++ * For now only support private anonymous when
++ * migrating to un-addressable device memory.
++ */
++ if (mapping) {
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ continue;
++ }
++ } else {
++ /*
++ * Other types of ZONE_DEVICE page are not
++ * supported.
++ */
++ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
++ continue;
++ }
++ }
++
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+@@ -2513,11 +2600,17 @@ static void migrate_vma_finalize(struct
+ unlock_page(page);
+ migrate->cpages--;
+
+- putback_lru_page(page);
++ if (is_zone_device_page(page))
++ put_page(page);
++ else
++ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+- putback_lru_page(newpage);
++ if (is_zone_device_page(newpage))
++ put_page(newpage);
++ else
++ putback_lru_page(newpage);
+ }
+ }
+ }
+--- a/mm/page_vma_mapped.c
++++ b/mm/page_vma_mapped.c
+@@ -37,6 +37,7 @@ static bool check_pte(struct page_vma_ma
+ if (!is_swap_pte(*pvmw->pte))
+ return false;
+ entry = pte_to_swp_entry(*pvmw->pte);
++
+ if (!is_migration_entry(entry))
+ return false;
+ if (migration_entry_to_page(entry) - pvmw->page >=
+@@ -49,6 +50,15 @@ static bool check_pte(struct page_vma_ma
+ WARN_ON_ONCE(1);
+ #endif
+ } else {
++ if (is_swap_pte(*pvmw->pte)) {
++ swp_entry_t entry;
++
++ entry = pte_to_swp_entry(*pvmw->pte);
++ if (is_device_private_entry(entry) &&
++ device_private_entry_to_page(entry) == pvmw->page)
++ return true;
++ }
++
+ if (!pte_present(*pvmw->pte))
+ return false;
+
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -63,6 +63,7 @@
+ #include <linux/hugetlb.h>
+ #include <linux/backing-dev.h>
+ #include <linux/page_idle.h>
++#include <linux/memremap.h>
+
+ #include <asm/tlbflush.h>
+
+@@ -1344,6 +1345,10 @@ static bool try_to_unmap_one(struct page
+ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+ return true;
+
++ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
++ is_zone_device_page(page) && !is_device_private_page(page))
++ return true;
++
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_address(vma, address,
+ flags & TTU_MIGRATION, page);
+@@ -1380,6 +1385,27 @@ static bool try_to_unmap_one(struct page
+ address = pvmw.address;
+
+
++ if (IS_ENABLED(CONFIG_MIGRATION) &&
++ (flags & TTU_MIGRATION) &&
++ is_zone_device_page(page)) {
++ swp_entry_t entry;
++ pte_t swp_pte;
++
++ pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
++
++ /*
++ * Store the pfn of the page in a special migration
++ * pte. do_swap_page() will wait until the migration
++ * pte is removed and then restart fault handling.
++ */
++ entry = make_migration_entry(page, 0);
++ swp_pte = swp_entry_to_pte(entry);
++ if (pte_soft_dirty(pteval))
++ swp_pte = pte_swp_mksoft_dirty(swp_pte);
++ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
++ goto discard;
++ }
++
+ if (!(flags & TTU_IGNORE_ACCESS)) {
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte)) {
diff --git a/patches.suse/mm-zone_device-new-type-of-zone_device-for-unaddressable-memory.patch b/patches.suse/mm-zone_device-new-type-of-zone_device-for-unaddressable-memory.patch
new file mode 100644
index 0000000000..202937a4d0
--- /dev/null
+++ b/patches.suse/mm-zone_device-new-type-of-zone_device-for-unaddressable-memory.patch
@@ -0,0 +1,598 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:43 -0700
+Subject: mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 5042db43cc26f51eed51c56192e2c2317e44315f
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+HMM (heterogeneous memory management) needs struct page to support
+migration from system main memory to device memory.  The reasons for HMM
+and for migration to device memory are explained in the HMM core patch.
+
+This patch deals with device memory that is un-addressable (ie the CPU
+cannot access it).  Hence we do not want those struct pages to be managed
+like regular memory.  That is why we extend ZONE_DEVICE to support
+different types of memory.
+
+A persistent memory type is defined for the existing users of ZONE_DEVICE
+and a new device un-addressable type is added for the un-addressable
+memory.  There is a clear separation between what is expected from each
+memory type, and existing users of ZONE_DEVICE are unaffected by the new
+requirements and by the new use of the un-addressable type.  All type
+specific code paths are protected by a test against the memory type.
+
+Because the memory is un-addressable we use a new special swap type for
+pages that are migrated to device memory (this reduces the maximum number
+of swap files).
+
+The two main additions to ZONE_DEVICE, beside the memory type, are two
+callbacks.  The first one, page_free(), is called whenever the page
+refcount reaches 1 (which means the page is free, as a ZONE_DEVICE page
+never reaches a refcount of 0).  This allows the device driver to manage
+its memory and the associated struct pages.
+
+The second callback, page_fault(), is invoked when the CPU accesses an
+address that is backed by a device page (which is un-addressable by the
+CPU).  This callback is responsible for migrating the page back to system
+main memory.  The device driver cannot block this migration back to system
+memory; HMM makes sure that such pages cannot be pinned in device memory.
+
+If the device is in an error condition and cannot migrate the memory back,
+a CPU page fault on device memory should end with SIGBUS.
+
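+As a rough, illustrative sketch (not part of this patch): the two callbacks
+a driver supplies for MEMORY_DEVICE_PRIVATE memory have the shapes of the
+dev_page_fault_t and dev_page_free_t typedefs added to
+include/linux/memremap.h below.  How they end up wired into a struct
+dev_pagemap is handled by the HMM devmem helpers added later in this
+series; the function names here are made up.
+
+    #include <linux/memremap.h>
+    #include <linux/mm.h>
+
+    /* Called on a CPU fault on an un-addressable device page. */
+    static int example_devmem_fault(struct vm_area_struct *vma,
+                                    unsigned long addr,
+                                    const struct page *page,
+                                    unsigned int flags,
+                                    pmd_t *pmdp)
+    {
+        /*
+         * A real driver would migrate the page back to system memory and
+         * return 0 on success.  If migration back is impossible it must
+         * return VM_FAULT_SIGBUS (or VM_FAULT_OOM when a system page
+         * cannot be allocated), as described above.
+         */
+        return VM_FAULT_SIGBUS;
+    }
+
+    /* Called when the refcount of a device page drops to 1 (page is free). */
+    static void example_devmem_free(struct page *page, void *data)
+    {
+        /* Return the backing device memory to the driver's allocator. */
+    }
+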
+[arnd@arndb.de: fix warning]
+ Link: http://lkml.kernel.org/r/20170823133213.712917-1-arnd@arndb.de
+Link: http://lkml.kernel.org/r/20170817000548.32038-8-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Acked-by: Dan Williams <dan.j.williams@intel.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ fs/proc/task_mmu.c | 7 ++++
+ include/linux/ioport.h | 1
+ include/linux/memremap.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/mm.h | 12 +++++++
+ include/linux/swap.h | 24 +++++++++++++--
+ include/linux/swapops.h | 68 +++++++++++++++++++++++++++++++++++++++++++
+ kernel/memremap.c | 34 +++++++++++++++++++++
+ mm/Kconfig | 11 ++++++-
+ mm/memory.c | 61 +++++++++++++++++++++++++++++++++++++++
+ mm/memory_hotplug.c | 10 +++++-
+ mm/mprotect.c | 14 +++++++++
+ 11 files changed, 309 insertions(+), 6 deletions(-)
+
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -539,6 +539,8 @@ static void smaps_pte_entry(pte_t *pte,
+ }
+ } else if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
++ else if (is_device_private_entry(swpent))
++ page = device_private_entry_to_page(swpent);
+ } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+ && pte_none(*pte))) {
+ page = find_get_entry(vma->vm_file->f_mapping,
+@@ -701,6 +703,8 @@ static int smaps_hugetlb_range(pte_t *pt
+
+ if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
++ else if (is_device_private_entry(swpent))
++ page = device_private_entry_to_page(swpent);
+ }
+ if (page) {
+ int mapcount = page_mapcount(page);
+@@ -1195,6 +1199,9 @@ static pagemap_entry_t pte_to_pagemap_en
+ flags |= PM_SWAP;
+ if (is_migration_entry(entry))
+ page = migration_entry_to_page(entry);
++
++ if (is_device_private_entry(entry))
++ page = device_private_entry_to_page(entry);
+ }
+
+ if (page && !PageAnon(page))
+--- a/include/linux/ioport.h
++++ b/include/linux/ioport.h
+@@ -130,6 +130,7 @@ enum {
+ IORES_DESC_ACPI_NV_STORAGE = 3,
+ IORES_DESC_PERSISTENT_MEMORY = 4,
+ IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
++ IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
+ };
+
+ /* helpers to define resources */
+--- a/include/linux/memremap.h
++++ b/include/linux/memremap.h
+@@ -4,6 +4,8 @@
+ #include <linux/ioport.h>
+ #include <linux/percpu-refcount.h>
+
++#include <asm/pgtable.h>
++
+ struct resource;
+ struct device;
+
+@@ -35,18 +37,89 @@ static inline struct vmem_altmap *to_vme
+ }
+ #endif
+
++/*
++ * Specialize ZONE_DEVICE memory into multiple types, each having a
++ * different usage.
++ *
++ * MEMORY_DEVICE_HOST:
++ * Persistent device memory (pmem): struct page might be allocated in different
++ * memory and architecture might want to perform special actions. It is similar
++ * to regular memory, in that the CPU can access it transparently. However,
++ * it is likely to have different bandwidth and latency than regular memory.
++ * See Documentation/nvdimm/nvdimm.txt for more information.
++ *
++ * MEMORY_DEVICE_PRIVATE:
++ * Device memory that is not directly addressable by the CPU: CPU can neither
++ * read nor write private memory. In this case, we do still have struct pages
++ * backing the device memory. Doing so simplifies the implementation, but it is
++ * important to remember that there are certain points at which the struct page
++ * must be treated as an opaque object, rather than a "normal" struct page.
++ *
++ * A more complete discussion of unaddressable memory may be found in
++ * include/linux/hmm.h and Documentation/vm/hmm.txt.
++ */
++enum memory_type {
++ MEMORY_DEVICE_HOST = 0,
++ MEMORY_DEVICE_PRIVATE,
++};
++
++/*
++ * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two
++ * callbacks:
++ * page_fault()
++ * page_free()
++ *
++ * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
++ * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
++ * explanation in include/linux/memory_hotplug.h.
++ *
++ * The page_fault() callback must migrate page back, from device memory to
++ * system memory, so that the CPU can access it. This might fail for various
++ * reasons (device issues, device have been unplugged, ...). When such error
++ * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
++ * set the CPU page table entry to "poisoned".
++ *
++ * Note that because memory cgroup charges are transferred to the device memory,
++ * this should never fail due to memory restrictions. However, allocation
++ * of a regular system page might still fail because we are out of memory. If
++ * that happens, the page_fault() callback must return VM_FAULT_OOM.
++ *
++ * The page_fault() callback can also try to migrate back multiple pages in one
++ * chunk, as an optimization. It must, however, prioritize the faulting address
++ * over all the others.
++ *
++ *
++ * The page_free() callback is called once the page refcount reaches 1
++ * (ZONE_DEVICE pages never reach a refcount of 0 unless there is a refcount
++ * bug); this allows the device driver to implement its own memory management.
++ */
++typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
++ unsigned long addr,
++ const struct page *page,
++ unsigned int flags,
++ pmd_t *pmdp);
++typedef void (*dev_page_free_t)(struct page *page, void *data);
++
+ /**
+ * struct dev_pagemap - metadata for ZONE_DEVICE mappings
++ * @page_fault: callback when CPU fault on an unaddressable device page
++ * @page_free: free page callback when page refcount reaches 1
+ * @altmap: pre-allocated/reserved memory for vmemmap allocations
+ * @res: physical address range covered by @ref
+ * @ref: reference count that pins the devm_memremap_pages() mapping
+ * @dev: host device of the mapping for debug
++ * @data: private data pointer for page_free()
++ * @type: memory type: see MEMORY_* in memory_hotplug.h
+ */
+ struct dev_pagemap {
++ dev_page_fault_t page_fault;
++ dev_page_free_t page_free;
+ struct vmem_altmap *altmap;
+ const struct resource *res;
+ struct percpu_ref *ref;
+ struct device *dev;
++ void *data;
++ enum memory_type type;
+ };
+
+ #ifdef CONFIG_ZONE_DEVICE
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -788,11 +788,23 @@ static inline bool is_zone_device_page(c
+ {
+ return page_zonenum(page) == ZONE_DEVICE;
+ }
++
++static inline bool is_device_private_page(const struct page *page)
++{
++ /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */
++ return ((page_zonenum(page) == ZONE_DEVICE) &&
++ (page->pgmap->type == MEMORY_DEVICE_PRIVATE));
++}
+ #else
+ static inline bool is_zone_device_page(const struct page *page)
+ {
+ return false;
+ }
++
++static inline bool is_device_private_page(const struct page *page)
++{
++ return false;
++}
+ #endif
+
+ static inline void get_page(struct page *page)
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -51,6 +51,23 @@ static inline int current_is_kswapd(void
+ */
+
+ /*
++ * Unaddressable device memory support. See include/linux/hmm.h and
++ * Documentation/vm/hmm.txt. Short description is we need struct pages for
++ * device memory that is unaddressable (inaccessible) by the CPU, so that we
++ * can migrate part of a process's memory to device memory.
++ *
++ * When a page is migrated from CPU to device, we set the CPU page table entry
++ * to a special SWP_DEVICE_* entry.
++ */
++#ifdef CONFIG_DEVICE_PRIVATE
++#define SWP_DEVICE_NUM 2
++#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
++#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
++#else
++#define SWP_DEVICE_NUM 0
++#endif
++
++/*
+ * NUMA node memory migration support
+ */
+ #ifdef CONFIG_MIGRATION
+@@ -72,7 +89,8 @@ static inline int current_is_kswapd(void
+ #endif
+
+ #define MAX_SWAPFILES \
+- ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
++ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
++ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+
+ /*
+ * Magic header for a swap area. The first part of the union is
+@@ -443,8 +461,8 @@ static inline void show_swap_cache_info(
+ {
+ }
+
+-#define free_swap_and_cache(swp) is_migration_entry(swp)
+-#define swapcache_prepare(swp) is_migration_entry(swp)
++#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
++#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+
+ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
+ {
+--- a/include/linux/swapops.h
++++ b/include/linux/swapops.h
+@@ -100,6 +100,74 @@ static inline void *swp_to_radix_entry(s
+ return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ }
+
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
++static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
++{
++ return swp_entry(write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ,
++ page_to_pfn(page));
++}
++
++static inline bool is_device_private_entry(swp_entry_t entry)
++{
++ int type = swp_type(entry);
++ return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
++}
++
++static inline void make_device_private_entry_read(swp_entry_t *entry)
++{
++ *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
++}
++
++static inline bool is_write_device_private_entry(swp_entry_t entry)
++{
++ return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
++}
++
++static inline struct page *device_private_entry_to_page(swp_entry_t entry)
++{
++ return pfn_to_page(swp_offset(entry));
++}
++
++int device_private_entry_fault(struct vm_area_struct *vma,
++ unsigned long addr,
++ swp_entry_t entry,
++ unsigned int flags,
++ pmd_t *pmdp);
++#else /* CONFIG_DEVICE_PRIVATE */
++static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
++{
++ return swp_entry(0, 0);
++}
++
++static inline void make_device_private_entry_read(swp_entry_t *entry)
++{
++}
++
++static inline bool is_device_private_entry(swp_entry_t entry)
++{
++ return false;
++}
++
++static inline bool is_write_device_private_entry(swp_entry_t entry)
++{
++ return false;
++}
++
++static inline struct page *device_private_entry_to_page(swp_entry_t entry)
++{
++ return NULL;
++}
++
++static inline int device_private_entry_fault(struct vm_area_struct *vma,
++ unsigned long addr,
++ swp_entry_t entry,
++ unsigned int flags,
++ pmd_t *pmdp)
++{
++ return VM_FAULT_SIGBUS;
++}
++#endif /* CONFIG_DEVICE_PRIVATE */
++
+ #ifdef CONFIG_MIGRATION
+ static inline swp_entry_t make_migration_entry(struct page *page, int write)
+ {
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -18,6 +18,8 @@
+ #include <linux/io.h>
+ #include <linux/mm.h>
+ #include <linux/memory_hotplug.h>
++#include <linux/swap.h>
++#include <linux/swapops.h>
+
+ #ifndef ioremap_cache
+ /* temporary while we convert existing ioremap_cache users to memremap */
+@@ -219,6 +221,34 @@ static unsigned long order_at(struct res
+ for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+ pgoff += 1UL << order, order = order_at((res), pgoff))
+
++#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
++int device_private_entry_fault(struct vm_area_struct *vma,
++ unsigned long addr,
++ swp_entry_t entry,
++ unsigned int flags,
++ pmd_t *pmdp)
++{
++ struct page *page = device_private_entry_to_page(entry);
++
++ /*
++	 * The page_fault() callback must migrate the page back to system memory
++	 * so that the CPU can access it. This might fail for various reasons
++ * (device issue, device was unsafely unplugged, ...). When such
++ * error conditions happen, the callback must return VM_FAULT_SIGBUS.
++ *
++ * Note that because memory cgroup charges are accounted to the device
++ * memory, this should never fail because of memory restrictions (but
++ * allocation of regular system page might still fail because we are
++ * out of memory).
++ *
++ * There is a more in-depth description of what that callback can and
++ * cannot do, in include/linux/memremap.h
++ */
++ return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
++}
++EXPORT_SYMBOL(device_private_entry_fault);
++#endif /* CONFIG_DEVICE_PRIVATE */
++
+ static void pgmap_radix_release(struct resource *res)
+ {
+ unsigned long pgoff, order;
+@@ -356,6 +386,10 @@ void *devm_memremap_pages(struct device
+ }
+ pgmap->ref = ref;
+ pgmap->res = &page_map->res;
++ pgmap->type = MEMORY_DEVICE_HOST;
++ pgmap->page_fault = NULL;
++ pgmap->page_free = NULL;
++ pgmap->data = NULL;
+
+ mutex_lock(&pgmap_lock);
+ error = 0;
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -684,7 +684,7 @@ config IDLE_PAGE_TRACKING
+ See Documentation/vm/idle_page_tracking.txt for more details.
+
+ config ZONE_DEVICE
+- bool "Device memory (pmem, etc...) hotplug support"
++ bool "Device memory (pmem, HMM, etc...) hotplug support"
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+@@ -725,6 +725,15 @@ config HMM_MIRROR
+ page tables (at PAGE_SIZE granularity), and must be able to recover from
+ the resulting potential page faults.
+
++config DEVICE_PRIVATE
++ bool "Unaddressable device memory (GPU memory, ...)"
++ depends on ARCH_HAS_HMM
++
++ help
++ Allows creation of struct pages to represent unaddressable device
++ memory; i.e., memory that is only accessible from the device (or
++ group of devices). You likely also want to select HMM_MIRROR.
++
+ config FRAME_VECTOR
+ bool
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -49,6 +49,7 @@
+ #include <linux/swap.h>
+ #include <linux/highmem.h>
+ #include <linux/pagemap.h>
++#include <linux/memremap.h>
+ #include <linux/ksm.h>
+ #include <linux/rmap.h>
+ #include <linux/export.h>
+@@ -956,6 +957,35 @@ copy_one_pte(struct mm_struct *dst_mm, s
+ pte = pte_swp_mksoft_dirty(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
++ } else if (is_device_private_entry(entry)) {
++ page = device_private_entry_to_page(entry);
++
++ /*
++ * Update rss count even for unaddressable pages, as
++			 * they should be treated just like normal pages in this
++ * respect.
++ *
++ * We will likely want to have some new rss counters
++ * for unaddressable pages, at some point. But for now
++ * keep things as they are.
++ */
++ get_page(page);
++ rss[mm_counter(page)]++;
++ page_dup_rmap(page, false);
++
++ /*
++ * We do not preserve soft-dirty information, because so
++ * far, checkpoint/restore is the only feature that
++ * requires that. And checkpoint/restore does not work
++ * when a device driver is involved (you cannot easily
++ * save and restore device driver state).
++ */
++ if (is_write_device_private_entry(entry) &&
++ is_cow_mapping(vm_flags)) {
++ make_device_private_entry_read(&entry);
++ pte = swp_entry_to_pte(entry);
++ set_pte_at(src_mm, addr, src_pte, pte);
++ }
+ }
+ goto out_set_pte;
+ }
+@@ -1273,6 +1303,29 @@ again:
+ }
+ continue;
+ }
++
++ entry = pte_to_swp_entry(ptent);
++ if (non_swap_entry(entry) && is_device_private_entry(entry)) {
++ struct page *page = device_private_entry_to_page(entry);
++
++ if (unlikely(details && details->check_mapping)) {
++ /*
++ * unmap_shared_mapping_pages() wants to
++ * invalidate cache without truncating:
++ * unmap shared but keep private pages.
++ */
++ if (details->check_mapping !=
++ page_rmapping(page))
++ continue;
++ }
++
++ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
++ rss[mm_counter(page)]--;
++ page_remove_rmap(page, false);
++ put_page(page);
++ continue;
++ }
++
+ /* If details->check_mapping, we leave swap entries. */
+ if (unlikely(details))
+ continue;
+@@ -2756,6 +2809,14 @@ int do_swap_page(struct vm_fault *vmf)
+ if (is_migration_entry(entry)) {
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
++ } else if (is_device_private_entry(entry)) {
++ /*
++ * For un-addressable device memory we call the pgmap
++ * fault handler callback. The callback must migrate
++ * the page back to some CPU accessible page.
++ */
++ ret = device_private_entry_fault(vma, vmf->address, entry,
++ vmf->flags, vmf->pmd);
+ } else if (is_hwpoison_entry(entry)) {
+ ret = VM_FAULT_HWPOISON;
+ } else {
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -156,7 +156,7 @@ void mem_hotplug_done(void)
+ /* add this memory to iomem resource */
+ static struct resource *register_memory_resource(u64 start, u64 size)
+ {
+- struct resource *res;
++ struct resource *res, *conflict;
+ res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
+@@ -165,7 +165,13 @@ static struct resource *register_memory_
+ res->start = start;
+ res->end = start + size - 1;
+ res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+- if (request_resource(&iomem_resource, res) < 0) {
++ conflict = request_resource_conflict(&iomem_resource, res);
++ if (conflict) {
++ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
++ pr_debug("Device unaddressable memory block "
++ "memory hotplug at %#010llx !\n",
++ (unsigned long long)start);
++ }
+ pr_debug("System RAM resource %pR cannot be added\n", res);
+ kfree(res);
+ return ERR_PTR(-EEXIST);
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -127,6 +127,20 @@ static unsigned long change_pte_range(st
+
+ pages++;
+ }
++
++ if (is_write_device_private_entry(entry)) {
++ pte_t newpte;
++
++ /*
++ * We do not preserve soft-dirtiness. See
++ * copy_one_pte() for explanation.
++ */
++ make_device_private_entry_read(&entry);
++ newpte = swp_entry_to_pte(entry);
++ set_pte_at(mm, addr, pte, newpte);
++
++ pages++;
++ }
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
diff --git a/patches.suse/mm-zone_device-special-case-put_page-for-device-private-pages.patch b/patches.suse/mm-zone_device-special-case-put_page-for-device-private-pages.patch
new file mode 100644
index 0000000000..1e7d124a0e
--- /dev/null
+++ b/patches.suse/mm-zone_device-special-case-put_page-for-device-private-pages.patch
@@ -0,0 +1,194 @@
+From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
+Date: Fri, 8 Sep 2017 16:11:46 -0700
+Subject: mm/ZONE_DEVICE: special case put_page() for device private pages
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Git-commit: 7b2d55d2c8961ae9d456d3133f4ae2f0fbd3e14f
+Patch-mainline: v4.14-rc1
+References: VM Functionality, FATE#323338, bsc#1047814
+
+A ZONE_DEVICE page that reaches a refcount of 1 is free, ie it no longer
+has any user.  For device private pages it is important to catch this
+transition, and thus we need to special case put_page() for them.
+
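+As a rough, illustrative sketch (not part of this patch): with the special
+case in place, dropping the last user reference on a device private page
+leaves its refcount at 1 and invokes the pgmap->page_free() callback
+instead of handing the page to the page allocator.  The free-list
+bookkeeping below is hypothetical.
+
+    #include <linux/mm.h>
+    #include <linux/memremap.h>
+    #include <linux/list.h>
+    #include <linux/spinlock.h>
+
+    struct example_devmem {
+        spinlock_t lock;
+        struct list_head free_pages;
+    };
+
+    /* Hooked up as pgmap->page_free, with pgmap->data pointing at the pool. */
+    static void example_devmem_page_free(struct page *page, void *data)
+    {
+        struct example_devmem *devmem = data;
+
+        spin_lock(&devmem->lock);
+        list_add(&page->lru, &devmem->free_pages);
+        spin_unlock(&devmem->lock);
+    }
+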
+Link: http://lkml.kernel.org/r/20170817000548.32038-9-jglisse@redhat.com
+Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: Balbir Singh <bsingharora@gmail.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: David Nellans <dnellans@nvidia.com>
+Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Mark Hairgrove <mhairgrove@nvidia.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Cc: Sherry Cheung <SCheung@nvidia.com>
+Cc: Subhash Gutti <sgutti@nvidia.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Bob Liu <liubo95@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+---
+ include/linux/memremap.h | 13 +++++++++++++
+ include/linux/mm.h | 31 ++++++++++++++++++++++---------
+ kernel/memremap.c | 25 ++++++++++++++++++++++++-
+ mm/hmm.c | 8 ++++++++
+ 4 files changed, 67 insertions(+), 10 deletions(-)
+
+--- a/include/linux/memremap.h
++++ b/include/linux/memremap.h
+@@ -126,6 +126,14 @@ struct dev_pagemap {
+ void *devm_memremap_pages(struct device *dev, struct resource *res,
+ struct percpu_ref *ref, struct vmem_altmap *altmap);
+ struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
++
++static inline bool is_zone_device_page(const struct page *page);
++
++static inline bool is_device_private_page(const struct page *page)
++{
++ return is_zone_device_page(page) &&
++ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
++}
+ #else
+ static inline void *devm_memremap_pages(struct device *dev,
+ struct resource *res, struct percpu_ref *ref,
+@@ -144,6 +152,11 @@ static inline struct dev_pagemap *find_d
+ {
+ return NULL;
+ }
++
++static inline bool is_device_private_page(const struct page *page)
++{
++ return false;
++}
+ #endif
+
+ /**
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -23,6 +23,7 @@
+ #include <linux/page_ext.h>
+ #include <linux/err.h>
+ #include <linux/page_ref.h>
++#include <linux/memremap.h>
+
+ struct mempolicy;
+ struct anon_vma;
+@@ -788,25 +789,25 @@ static inline bool is_zone_device_page(c
+ {
+ return page_zonenum(page) == ZONE_DEVICE;
+ }
+-
+-static inline bool is_device_private_page(const struct page *page)
+-{
+- /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */
+- return ((page_zonenum(page) == ZONE_DEVICE) &&
+- (page->pgmap->type == MEMORY_DEVICE_PRIVATE));
+-}
+ #else
+ static inline bool is_zone_device_page(const struct page *page)
+ {
+ return false;
+ }
++#endif
+
+-static inline bool is_device_private_page(const struct page *page)
++#ifdef CONFIG_DEVICE_PRIVATE
++void put_zone_device_private_page(struct page *page);
++#else
++static inline void put_zone_device_private_page(struct page *page)
+ {
+- return false;
+ }
+ #endif
+
++static inline bool is_device_private_page(const struct page *page);
++
++DECLARE_STATIC_KEY_FALSE(device_private_key);
++
+ static inline void get_page(struct page *page)
+ {
+ page = compound_head(page);
+@@ -822,6 +823,18 @@ static inline void put_page(struct page
+ {
+ page = compound_head(page);
+
++ /*
++	 * For private device pages we need to catch the refcount transition
++	 * from 2 to 1: when the refcount reaches 1 it means the private device
++	 * page is free and we need to inform the device driver through a
++	 * callback. See include/linux/memremap.h and HMM for details.
++ */
++ if (static_branch_unlikely(&device_private_key) &&
++ unlikely(is_device_private_page(page))) {
++ put_zone_device_private_page(page);
++ return;
++ }
++
+ if (put_page_testzero(page))
+ __put_page(page);
+ }
+--- a/kernel/memremap.c
++++ b/kernel/memremap.c
+@@ -11,7 +11,6 @@
+ * General Public License for more details.
+ */
+ #include <linux/radix-tree.h>
+-#include <linux/memremap.h>
+ #include <linux/device.h>
+ #include <linux/types.h>
+ #include <linux/pfn_t.h>
+@@ -502,3 +501,27 @@ struct vmem_altmap *to_vmem_altmap(unsig
+ return pgmap ? pgmap->altmap : NULL;
+ }
+ #endif /* CONFIG_ZONE_DEVICE */
++
++
++#ifdef CONFIG_DEVICE_PRIVATE
++void put_zone_device_private_page(struct page *page)
++{
++ int count = page_ref_dec_return(page);
++
++ /*
++	 * If the refcount is 1 then the page is free and the refcount is
++	 * stable as nobody holds a reference on the page.
++ */
++ if (count == 1) {
++ /* Clear Active bit in case of parallel mark_page_accessed */
++ __ClearPageActive(page);
++ __ClearPageWaiters(page);
++
++ page->mapping = NULL;
++
++ page->pgmap->page_free(page, page->pgmap->data);
++ } else if (!count)
++ __put_page(page);
++}
++EXPORT_SYMBOL(put_zone_device_private_page);
++#endif /* CONFIG_DEVICE_PRIVATE */
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -25,9 +25,17 @@
+ #include <linux/sched.h>
+ #include <linux/swapops.h>
+ #include <linux/hugetlb.h>
++#include <linux/jump_label.h>
+ #include <linux/mmu_notifier.h>
+
+
++/*
++ * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h
++ */
++DEFINE_STATIC_KEY_FALSE(device_private_key);
++EXPORT_SYMBOL(device_private_key);
++
++
+ #ifdef CONFIG_HMM
+ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
diff --git a/series.conf b/series.conf
index 33c665a1a5..b523416868 100644
--- a/series.conf
+++ b/series.conf
@@ -5367,6 +5367,27 @@
patches.arch/s390-sles15-00-09-KVM-s390-Support-Configuration-z-Architecture-Mode.patch
patches.drivers/KVM-s390-expose-no-DAT-to-guest-and-migration-suppor.patch
patches.drivers/PCI-vmd-Free-up-IRQs-on-suspend-path.patch
+ patches.suse/hmm-heterogeneous-memory-management-documentation.patch
+ patches.suse/mm-hmm-heterogeneous-memory-management-hmm-for-short.patch
+ patches.suse/mm-hmm-mirror-mirror-process-address-space-on-device-with-hmm-helpers.patch
+ patches.suse/mm-hmm-mirror-helper-to-snapshot-cpu-page-table.patch
+ patches.suse/mm-hmm-mirror-device-page-fault-handler.patch
+ patches.suse/mm-memory_hotplug-introduce-add_pages.patch
+ patches.suse/mm-zone_device-new-type-of-zone_device-for-unaddressable-memory.patch
+ patches.suse/mm-zone_device-special-case-put_page-for-device-private-pages.patch
+ patches.suse/mm-memcontrol-allow-to-uncharge-page-without-using-page-lru-field.patch
+ patches.suse/mm-memcontrol-support-memory_device_private.patch
+ patches.suse/mm-hmm-devmem-device-memory-hotplug-using-zone_device.patch
+ patches.suse/mm-hmm-devmem-dummy-hmm-device-for-zone_device-memory.patch
+ patches.suse/mm-migrate-new-migrate-mode-migrate_sync_no_copy.patch
+ patches.suse/mm-migrate-new-memory-migration-helper-for-use-with-device-memory.patch
+ patches.suse/mm-migrate-migrate_vma-unmap-page-from-vma-while-collecting-pages.patch
+ patches.suse/mm-migrate-support-un-addressable-zone_device-page-in-migration.patch
+ patches.suse/mm-migrate-allow-migrate_vma-to-alloc-new-page-on-empty-entry.patch
+ patches.suse/mm-device-public-memory-device-memory-cache-coherent-with-cpu.patch
+ patches.suse/mm-hmm-add-new-helper-to-hotplug-cdm-memory-region.patch
+ patches.suse/mm-hmm-avoid-bloating-arch-that-do-not-make-use-of-hmm.patch
+ patches.suse/mm-hmm-fix-build-when-hmm-is-disabled.patch
patches.drivers/mac80211-agg-tx-call-drv_wake_tx_queue-in-proper-con.patch
patches.drivers/nl80211-look-for-HT-VHT-capabilities-in-beacon-s-tai.patch
patches.drivers/cfg80211-honor-NL80211_RRF_NO_HT40-MINUS-PLUS.patch
@@ -5562,6 +5583,7 @@
patches.drivers/scsi-aacraid-Add-a-small-delay-after-IOP-reset.patch
patches.drivers/staging-rtl8723bs-avoid-null-pointer-dereference-on-
patches.drivers/iio-trigger-stm32-timer-fix-a-corner-case-to-write-p
+ patches.suse/mm-device-public-memory-fix-edge-case-in-vm_normal_page.patch
patches.drivers/0001-iwlwifi-mvm-send-all-non-bufferable-frames-on-the-pr.patch
patches.drivers/0001-iwlwifi-mvm-change-state-when-queueing-agg-start-wor.patch
patches.drivers/0001-iwlwifi-mvm-wake-the-correct-mac80211-queue.patch
@@ -5619,6 +5641,7 @@
patches.drivers/ALSA-usb-audio-Kill-stray-URB-at-exiting
patches.fixes/ALSA-seq-Fix-use-after-free-at-creating-a-port
patches.drivers/ALSA-caiaq-Fix-stray-URB-at-probe-error-path
+ patches.suse/mm-migrate-fix-indexing-bug-off-by-one-and-avoid-out-of-bound-access.patch
patches.fixes/fix-mpage_writepage-for-pages-with-buffers.patch
patches.drivers/drm-i915-Use-crtc_state_is_legacy_gamma-in-intel_col
patches.drivers/drm-i915-Order-two-completing-nop_submit_request