summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--0000_README4
-rw-r--r--1001_linux-5.1.2.patch2955
2 files changed, 2959 insertions, 0 deletions
diff --git a/0000_README b/0000_README
index 72fa25f2..b65b94af 100644
--- a/0000_README
+++ b/0000_README
@@ -47,6 +47,10 @@ Patch: 1000_linux-5.1.1.patch
From: http://www.kernel.org
Desc: Linux 5.1.1
+Patch: 1001_linux-5.1.2.patch
+From: http://www.kernel.org
+Desc: Linux 5.1.2
+
Patch: 1500_XATTR_USER_PREFIX.patch
From: https://bugs.gentoo.org/show_bug.cgi?id=470644
Desc: Support for namespace user.pax.* on tmpfs.
diff --git a/1001_linux-5.1.2.patch b/1001_linux-5.1.2.patch
new file mode 100644
index 00000000..a8d72597
--- /dev/null
+++ b/1001_linux-5.1.2.patch
@@ -0,0 +1,2955 @@
+diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
+index 9605dbd4b5b5..141a7bb58b80 100644
+--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
+@@ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities
+ /sys/devices/system/cpu/vulnerabilities/spectre_v2
+ /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
+ /sys/devices/system/cpu/vulnerabilities/l1tf
++ /sys/devices/system/cpu/vulnerabilities/mds
+ Date: January 2018
+ Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+ Description: Information about CPU vulnerabilities
+@@ -496,8 +497,7 @@ Description: Information about CPU vulnerabilities
+ "Vulnerable" CPU is affected and no mitigation in effect
+ "Mitigation: $M" CPU is affected and mitigation $M is in effect
+
+- Details about the l1tf file can be found in
+- Documentation/admin-guide/l1tf.rst
++ See also: Documentation/admin-guide/hw-vuln/index.rst
+
+ What: /sys/devices/system/cpu/smt
+ /sys/devices/system/cpu/smt/active
+diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
+new file mode 100644
+index 000000000000..ffc064c1ec68
+--- /dev/null
++++ b/Documentation/admin-guide/hw-vuln/index.rst
+@@ -0,0 +1,13 @@
++========================
++Hardware vulnerabilities
++========================
++
++This section describes CPU vulnerabilities and provides an overview of the
++possible mitigations along with guidance for selecting mitigations if they
++are configurable at compile, boot or run time.
++
++.. toctree::
++ :maxdepth: 1
++
++ l1tf
++ mds
+diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
+new file mode 100644
+index 000000000000..31653a9f0e1b
+--- /dev/null
++++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
+@@ -0,0 +1,615 @@
++L1TF - L1 Terminal Fault
++========================
++
++L1 Terminal Fault is a hardware vulnerability which allows unprivileged
++speculative access to data which is available in the Level 1 Data Cache
++when the page table entry controlling the virtual address, which is used
++for the access, has the Present bit cleared or other reserved bits set.
++
++Affected processors
++-------------------
++
++This vulnerability affects a wide range of Intel processors. The
++vulnerability is not present on:
++
++ - Processors from AMD, Centaur and other non Intel vendors
++
++ - Older processor models, where the CPU family is < 6
++
++ - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
++ Penwell, Pineview, Silvermont, Airmont, Merrifield)
++
++ - The Intel XEON PHI family
++
++ - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
++ IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
++ by the Meltdown vulnerability either. These CPUs should become
++ available by end of 2018.
++
++Whether a processor is affected or not can be read out from the L1TF
++vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
++
++Related CVEs
++------------
++
++The following CVE entries are related to the L1TF vulnerability:
++
++ ============= ================= ==============================
++ CVE-2018-3615 L1 Terminal Fault SGX related aspects
++ CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
++ CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
++ ============= ================= ==============================
++
++Problem
++-------
++
++If an instruction accesses a virtual address for which the relevant page
++table entry (PTE) has the Present bit cleared or other reserved bits set,
++then speculative execution ignores the invalid PTE and loads the referenced
++data if it is present in the Level 1 Data Cache, as if the page referenced
++by the address bits in the PTE was still present and accessible.
++
++While this is a purely speculative mechanism and the instruction will raise
++a page fault when it is retired eventually, the pure act of loading the
++data and making it available to other speculative instructions opens up the
++opportunity for side channel attacks to unprivileged malicious code,
++similar to the Meltdown attack.
++
++While Meltdown breaks the user space to kernel space protection, L1TF
++allows to attack any physical memory address in the system and the attack
++works across all protection domains. It allows an attack of SGX and also
++works from inside virtual machines because the speculation bypasses the
++extended page table (EPT) protection mechanism.
++
++
++Attack scenarios
++----------------
++
++1. Malicious user space
++^^^^^^^^^^^^^^^^^^^^^^^
++
++ Operating Systems store arbitrary information in the address bits of a
++ PTE which is marked non present. This allows a malicious user space
++ application to attack the physical memory to which these PTEs resolve.
++ In some cases user-space can maliciously influence the information
++ encoded in the address bits of the PTE, thus making attacks more
++ deterministic and more practical.
++
++ The Linux kernel contains a mitigation for this attack vector, PTE
++ inversion, which is permanently enabled and has no performance
++ impact. The kernel ensures that the address bits of PTEs, which are not
++ marked present, never point to cacheable physical memory space.
++
++ A system with an up to date kernel is protected against attacks from
++ malicious user space applications.
++
++2. Malicious guest in a virtual machine
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ The fact that L1TF breaks all domain protections allows malicious guest
++ OSes, which can control the PTEs directly, and malicious guest user
++ space applications, which run on an unprotected guest kernel lacking the
++ PTE inversion mitigation for L1TF, to attack physical host memory.
++
++ A special aspect of L1TF in the context of virtualization is symmetric
++ multi threading (SMT). The Intel implementation of SMT is called
++ HyperThreading. The fact that Hyperthreads on the affected processors
++ share the L1 Data Cache (L1D) is important for this. As the flaw allows
++ only to attack data which is present in L1D, a malicious guest running
++ on one Hyperthread can attack the data which is brought into the L1D by
++ the context which runs on the sibling Hyperthread of the same physical
++ core. This context can be host OS, host user space or a different guest.
++
++ If the processor does not support Extended Page Tables, the attack is
++ only possible, when the hypervisor does not sanitize the content of the
++ effective (shadow) page tables.
++
++ While solutions exist to mitigate these attack vectors fully, these
++ mitigations are not enabled by default in the Linux kernel because they
++ can affect performance significantly. The kernel provides several
++ mechanisms which can be utilized to address the problem depending on the
++ deployment scenario. The mitigations, their protection scope and impact
++ are described in the next sections.
++
++ The default mitigations and the rationale for choosing them are explained
++ at the end of this document. See :ref:`default_mitigations`.
++
++.. _l1tf_sys_info:
++
++L1TF system information
++-----------------------
++
++The Linux kernel provides a sysfs interface to enumerate the current L1TF
++status of the system: whether the system is vulnerable, and which
++mitigations are active. The relevant sysfs file is:
++
++/sys/devices/system/cpu/vulnerabilities/l1tf
++
++The possible values in this file are:
++
++ =========================== ===============================
++ 'Not affected' The processor is not vulnerable
++ 'Mitigation: PTE Inversion' The host protection is active
++ =========================== ===============================
++
++If KVM/VMX is enabled and the processor is vulnerable then the following
++information is appended to the 'Mitigation: PTE Inversion' part:
++
++ - SMT status:
++
++ ===================== ================
++ 'VMX: SMT vulnerable' SMT is enabled
++ 'VMX: SMT disabled' SMT is disabled
++ ===================== ================
++
++ - L1D Flush mode:
++
++ ================================ ====================================
++ 'L1D vulnerable' L1D flushing is disabled
++
++ 'L1D conditional cache flushes' L1D flush is conditionally enabled
++
++ 'L1D cache flushes' L1D flush is unconditionally enabled
++ ================================ ====================================
++
++The resulting grade of protection is discussed in the following sections.
++
++
++Host mitigation mechanism
++-------------------------
++
++The kernel is unconditionally protected against L1TF attacks from malicious
++user space running on the host.
++
++
++Guest mitigation mechanisms
++---------------------------
++
++.. _l1d_flush:
++
++1. L1D flush on VMENTER
++^^^^^^^^^^^^^^^^^^^^^^^
++
++ To make sure that a guest cannot attack data which is present in the L1D
++ the hypervisor flushes the L1D before entering the guest.
++
++ Flushing the L1D evicts not only the data which should not be accessed
++ by a potentially malicious guest, it also flushes the guest
++ data. Flushing the L1D has a performance impact as the processor has to
++ bring the flushed guest data back into the L1D. Depending on the
++ frequency of VMEXIT/VMENTER and the type of computations in the guest
++ performance degradation in the range of 1% to 50% has been observed. For
++ scenarios where guest VMEXIT/VMENTER are rare the performance impact is
++ minimal. Virtio and mechanisms like posted interrupts are designed to
++ confine the VMEXITs to a bare minimum, but specific configurations and
++ application scenarios might still suffer from a high VMEXIT rate.
++
++ The kernel provides two L1D flush modes:
++ - conditional ('cond')
++ - unconditional ('always')
++
++ The conditional mode avoids L1D flushing after VMEXITs which execute
++ only audited code paths before the corresponding VMENTER. These code
++ paths have been verified that they cannot expose secrets or other
++ interesting data to an attacker, but they can leak information about the
++ address space layout of the hypervisor.
++
++ Unconditional mode flushes L1D on all VMENTER invocations and provides
++ maximum protection. It has a higher overhead than the conditional
++ mode. The overhead cannot be quantified correctly as it depends on the
++ workload scenario and the resulting number of VMEXITs.
++
++ The general recommendation is to enable L1D flush on VMENTER. The kernel
++ defaults to conditional mode on affected processors.
++
++ **Note**, that L1D flush does not prevent the SMT problem because the
++ sibling thread will also bring back its data into the L1D which makes it
++ attackable again.
++
++ L1D flush can be controlled by the administrator via the kernel command
++ line and sysfs control files. See :ref:`mitigation_control_command_line`
++ and :ref:`mitigation_control_kvm`.
++
++.. _guest_confinement:
++
++2. Guest VCPU confinement to dedicated physical cores
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ To address the SMT problem, it is possible to make a guest or a group of
++ guests affine to one or more physical cores. The proper mechanism for
++ that is to utilize exclusive cpusets to ensure that no other guest or
++ host tasks can run on these cores.
++
++ If only a single guest or related guests run on sibling SMT threads on
++ the same physical core then they can only attack their own memory and
++ restricted parts of the host memory.
++
++ Host memory is attackable, when one of the sibling SMT threads runs in
++ host OS (hypervisor) context and the other in guest context. The amount
++ of valuable information from the host OS context depends on the context
++ which the host OS executes, i.e. interrupts, soft interrupts and kernel
++ threads. The amount of valuable data from these contexts cannot be
++ declared as non-interesting for an attacker without deep inspection of
++ the code.
++
++ **Note**, that assigning guests to a fixed set of physical cores affects
++ the ability of the scheduler to do load balancing and might have
++ negative effects on CPU utilization depending on the hosting
++ scenario. Disabling SMT might be a viable alternative for particular
++ scenarios.
++
++ For further information about confining guests to a single or to a group
++ of cores consult the cpusets documentation:
++
++ https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
++
++.. _interrupt_isolation:
++
++3. Interrupt affinity
++^^^^^^^^^^^^^^^^^^^^^
++
++ Interrupts can be made affine to logical CPUs. This is not universally
++ true because there are types of interrupts which are truly per CPU
++ interrupts, e.g. the local timer interrupt. Aside of that multi queue
++ devices affine their interrupts to single CPUs or groups of CPUs per
++ queue without allowing the administrator to control the affinities.
++
++ Moving the interrupts, which can be affinity controlled, away from CPUs
++ which run untrusted guests, reduces the attack vector space.
++
++ Whether the interrupts which are affine to CPUs, which run untrusted
++ guests, provide interesting data for an attacker depends on the system
++ configuration and the scenarios which run on the system. While for some
++ of the interrupts it can be assumed that they won't expose interesting
++ information beyond exposing hints about the host OS memory layout, there
++ is no way to make general assumptions.
++
++ Interrupt affinity can be controlled by the administrator via the
++ /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
++ available at:
++
++ https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
++
++.. _smt_control:
++
++4. SMT control
++^^^^^^^^^^^^^^
++
++ To prevent the SMT issues of L1TF it might be necessary to disable SMT
++ completely. Disabling SMT can have a significant performance impact, but
++ the impact depends on the hosting scenario and the type of workloads.
++ The impact of disabling SMT needs also to be weighed against the impact
++ of other mitigation solutions like confining guests to dedicated cores.
++
++ The kernel provides a sysfs interface to retrieve the status of SMT and
++ to control it. It also provides a kernel command line interface to
++ control SMT.
++
++ The kernel command line interface consists of the following options:
++
++ =========== ==========================================================
++ nosmt Affects the bring up of the secondary CPUs during boot. The
++ kernel tries to bring all present CPUs online during the
++ boot process. "nosmt" makes sure that from each physical
++ core only one - the so called primary (hyper) thread is
++ activated. Due to a design flaw of Intel processors related
++ to Machine Check Exceptions the non primary siblings have
++ to be brought up at least partially and are then shut down
++ again. "nosmt" can be undone via the sysfs interface.
++
++ nosmt=force Has the same effect as "nosmt" but it does not allow to
++ undo the SMT disable via the sysfs interface.
++ =========== ==========================================================
++
++ The sysfs interface provides two files:
++
++ - /sys/devices/system/cpu/smt/control
++ - /sys/devices/system/cpu/smt/active
++
++ /sys/devices/system/cpu/smt/control:
++
++ This file allows to read out the SMT control state and provides the
++ ability to disable or (re)enable SMT. The possible states are:
++
++ ============== ===================================================
++ on SMT is supported by the CPU and enabled. All
++ logical CPUs can be onlined and offlined without
++ restrictions.
++
++ off SMT is supported by the CPU and disabled. Only
++ the so called primary SMT threads can be onlined
++ and offlined without restrictions. An attempt to
++ online a non-primary sibling is rejected.
++
++ forceoff Same as 'off' but the state cannot be controlled.
++ Attempts to write to the control file are rejected.
++
++ notsupported The processor does not support SMT. It's therefore
++ not affected by the SMT implications of L1TF.
++ Attempts to write to the control file are rejected.
++ ============== ===================================================
++
++ The possible states which can be written into this file to control SMT
++ state are:
++
++ - on
++ - off
++ - forceoff
++
++ /sys/devices/system/cpu/smt/active:
++
++ This file reports whether SMT is enabled and active, i.e. if on any
++ physical core two or more sibling threads are online.
++
++ SMT control is also possible at boot time via the l1tf kernel command
++ line parameter in combination with L1D flush control. See
++ :ref:`mitigation_control_command_line`.
++
++5. Disabling EPT
++^^^^^^^^^^^^^^^^
++
++ Disabling EPT for virtual machines provides full mitigation for L1TF even
++ with SMT enabled, because the effective page tables for guests are
++ managed and sanitized by the hypervisor. Though disabling EPT has a
++ significant performance impact especially when the Meltdown mitigation
++ KPTI is enabled.
++
++ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
++
++There is ongoing research and development for new mitigation mechanisms to
++address the performance impact of disabling SMT or EPT.
++
++.. _mitigation_control_command_line:
++
++Mitigation control on the kernel command line
++---------------------------------------------
++
++The kernel command line allows to control the L1TF mitigations at boot
++time with the option "l1tf=". The valid arguments for this option are:
++
++ ============ =============================================================
++ full Provides all available mitigations for the L1TF
++ vulnerability. Disables SMT and enables all mitigations in
++ the hypervisors, i.e. unconditional L1D flushing
++
++ SMT control and L1D flush control via the sysfs interface
++ is still possible after boot. Hypervisors will issue a
++ warning when the first VM is started in a potentially
++ insecure configuration, i.e. SMT enabled or L1D flush
++ disabled.
++
++ full,force Same as 'full', but disables SMT and L1D flush runtime
++ control. Implies the 'nosmt=force' command line option.
++ (i.e. sysfs control of SMT is disabled.)
++
++ flush Leaves SMT enabled and enables the default hypervisor
++ mitigation, i.e. conditional L1D flushing
++
++ SMT control and L1D flush control via the sysfs interface
++ is still possible after boot. Hypervisors will issue a
++ warning when the first VM is started in a potentially
++ insecure configuration, i.e. SMT enabled or L1D flush
++ disabled.
++
++ flush,nosmt Disables SMT and enables the default hypervisor mitigation,
++ i.e. conditional L1D flushing.
++
++ SMT control and L1D flush control via the sysfs interface
++ is still possible after boot. Hypervisors will issue a
++ warning when the first VM is started in a potentially
++ insecure configuration, i.e. SMT enabled or L1D flush
++ disabled.
++
++ flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
++ started in a potentially insecure configuration.
++
++ off Disables hypervisor mitigations and doesn't emit any
++ warnings.
++ It also drops the swap size and available RAM limit restrictions
++ on both hypervisor and bare metal.
++
++ ============ =============================================================
++
++The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
++
++
++.. _mitigation_control_kvm:
++
++Mitigation control for KVM - module parameter
++-------------------------------------------------------------
++
++The KVM hypervisor mitigation mechanism, flushing the L1D cache when
++entering a guest, can be controlled with a module parameter.
++
++The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
++following arguments:
++
++ ============ ==============================================================
++ always L1D cache flush on every VMENTER.
++
++ cond Flush L1D on VMENTER only when the code between VMEXIT and
++ VMENTER can leak host memory which is considered
++ interesting for an attacker. This still can leak host memory
++ which allows e.g. to determine the host's address space layout.
++
++ never Disables the mitigation
++ ============ ==============================================================
++
++The parameter can be provided on the kernel command line, as a module
++parameter when loading the modules and at runtime modified via the sysfs
++file:
++
++/sys/module/kvm_intel/parameters/vmentry_l1d_flush
++
++The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
++line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
++module parameter is ignored and writes to the sysfs file are rejected.
++
++.. _mitigation_selection:
++
++Mitigation selection guide
++--------------------------
++
++1. No virtualization in use
++^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ The system is protected by the kernel unconditionally and no further
++ action is required.
++
++2. Virtualization with trusted guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ If the guest comes from a trusted source and the guest OS kernel is
++ guaranteed to have the L1TF mitigations in place the system is fully
++ protected against L1TF and no further action is required.
++
++ To avoid the overhead of the default L1D flushing on VMENTER the
++ administrator can disable the flushing via the kernel command line and
++ sysfs control files. See :ref:`mitigation_control_command_line` and
++ :ref:`mitigation_control_kvm`.
++
++
++3. Virtualization with untrusted guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++3.1. SMT not supported or disabled
++""""""""""""""""""""""""""""""""""
++
++ If SMT is not supported by the processor or disabled in the BIOS or by
++ the kernel, it's only required to enforce L1D flushing on VMENTER.
++
++ Conditional L1D flushing is the default behaviour and can be tuned. See
++ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
++
++3.2. EPT not supported or disabled
++""""""""""""""""""""""""""""""""""
++
++ If EPT is not supported by the processor or disabled in the hypervisor,
++ the system is fully protected. SMT can stay enabled and L1D flushing on
++ VMENTER is not required.
++
++ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
++
++3.3. SMT and EPT supported and active
++"""""""""""""""""""""""""""""""""""""
++
++ If SMT and EPT are supported and active then various degrees of
++ mitigations can be employed:
++
++ - L1D flushing on VMENTER:
++
++ L1D flushing on VMENTER is the minimal protection requirement, but it
++ is only potent in combination with other mitigation methods.
++
++ Conditional L1D flushing is the default behaviour and can be tuned. See
++ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
++
++ - Guest confinement:
++
++ Confinement of guests to a single or a group of physical cores which
++ are not running any other processes, can reduce the attack surface
++ significantly, but interrupts, soft interrupts and kernel threads can
++ still expose valuable data to a potential attacker. See
++ :ref:`guest_confinement`.
++
++ - Interrupt isolation:
++
++ Isolating the guest CPUs from interrupts can reduce the attack surface
++ further, but still allows a malicious guest to explore a limited amount
++ of host physical memory. This can at least be used to gain knowledge
++ about the host address space layout. The interrupts which have a fixed
++ affinity to the CPUs which run the untrusted guests can depending on
++ the scenario still trigger soft interrupts and schedule kernel threads
++ which might expose valuable information. See
++ :ref:`interrupt_isolation`.
++
++The above three mitigation methods combined can provide protection to a
++certain degree, but the risk of the remaining attack surface has to be
++carefully analyzed. For full protection the following methods are
++available:
++
++ - Disabling SMT:
++
++ Disabling SMT and enforcing the L1D flushing provides the maximum
++ amount of protection. This mitigation is not depending on any of the
++ above mitigation methods.
++
++ SMT control and L1D flushing can be tuned by the command line
++ parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
++ time with the matching sysfs control files. See :ref:`smt_control`,
++ :ref:`mitigation_control_command_line` and
++ :ref:`mitigation_control_kvm`.
++
++ - Disabling EPT:
++
++ Disabling EPT provides the maximum amount of protection as well. It is
++ not depending on any of the above mitigation methods. SMT can stay
++ enabled and L1D flushing is not required, but the performance impact is
++ significant.
++
++ EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
++ parameter.
++
++3.4. Nested virtual machines
++""""""""""""""""""""""""""""
++
++When nested virtualization is in use, three operating systems are involved:
++the bare metal hypervisor, the nested hypervisor and the nested virtual
++machine. VMENTER operations from the nested hypervisor into the nested
++guest will always be processed by the bare metal hypervisor. If KVM is the
++bare metal hypervisor it will:
++
++ - Flush the L1D cache on every switch from the nested hypervisor to the
++ nested virtual machine, so that the nested hypervisor's secrets are not
++ exposed to the nested virtual machine;
++
++ - Flush the L1D cache on every switch from the nested virtual machine to
++ the nested hypervisor; this is a complex operation, and flushing the L1D
++ cache avoids that the bare metal hypervisor's secrets are exposed to the
++ nested virtual machine;
++
++ - Instruct the nested hypervisor to not perform any L1D cache flush. This
++ is an optimization to avoid double L1D flushing.
++
++
++.. _default_mitigations:
++
++Default mitigations
++-------------------
++
++ The kernel default mitigations for vulnerable processors are:
++
++ - PTE inversion to protect against malicious user space. This is done
++ unconditionally and cannot be controlled. The swap storage is limited
++ to ~16TB.
++
++ - L1D conditional flushing on VMENTER when EPT is enabled for
++ a guest.
++
++ The kernel does not by default enforce the disabling of SMT, which leaves
++ SMT systems vulnerable when running untrusted guests with EPT enabled.
++
++ The rationale for this choice is:
++
++ - Force disabling SMT can break existing setups, especially with
++ unattended updates.
++
++ - If regular users run untrusted guests on their machine, then L1TF is
++ just an add on to other malware which might be embedded in an untrusted
++ guest, e.g. spam-bots or attacks on the local network.
++
++ There is no technical way to prevent a user from running untrusted code
++ on their machines blindly.
++
++ - It's technically extremely unlikely and from today's knowledge even
++ impossible that L1TF can be exploited via the most popular attack
++ mechanisms like JavaScript because these mechanisms have no way to
++ control PTEs. If this would be possible and no other mitigation would
++ be possible, then the default might be different.
++
++ - The administrators of cloud and hosting setups have to carefully
++ analyze the risk for their scenarios and make the appropriate
++ mitigation choices, which might even vary across their deployed
++ machines and also result in other changes of their overall setup.
++ There is no way for the kernel to provide a sensible default for this
++ kind of scenarios.
+diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst
+new file mode 100644
+index 000000000000..e3a796c0d3a2
+--- /dev/null
++++ b/Documentation/admin-guide/hw-vuln/mds.rst
+@@ -0,0 +1,308 @@
++MDS - Microarchitectural Data Sampling
++======================================
++
++Microarchitectural Data Sampling is a hardware vulnerability which allows
++unprivileged speculative access to data which is available in various CPU
++internal buffers.
++
++Affected processors
++-------------------
++
++This vulnerability affects a wide range of Intel processors. The
++vulnerability is not present on:
++
++ - Processors from AMD, Centaur and other non Intel vendors
++
++ - Older processor models, where the CPU family is < 6
++
++ - Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
++
++ - Intel processors which have the ARCH_CAP_MDS_NO bit set in the
++ IA32_ARCH_CAPABILITIES MSR.
++
++Whether a processor is affected or not can be read out from the MDS
++vulnerability file in sysfs. See :ref:`mds_sys_info`.
++
++Not all processors are affected by all variants of MDS, but the mitigation
++is identical for all of them so the kernel treats them as a single
++vulnerability.
++
++Related CVEs
++------------
++
++The following CVE entries are related to the MDS vulnerability:
++
++ ============== ===== ===================================================
++ CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
++ CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
++ CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
++ CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
++ ============== ===== ===================================================
++
++Problem
++-------
++
++When performing store, load or L1 refill operations, processors write data
++into temporary microarchitectural structures (buffers). The data in the
++buffer can be forwarded to load operations as an optimization.
++
++Under certain conditions, usually a fault/assist caused by a load
++operation, data unrelated to the load memory address can be speculatively
++forwarded from the buffers. Because the load operation causes a fault or
++assist and its result will be discarded, the forwarded data will not cause
++incorrect program execution or state changes. But a malicious operation
++may be able to forward this speculative data to a disclosure gadget which
++allows in turn to infer the value via a cache side channel attack.
++
++Because the buffers are potentially shared between Hyper-Threads, cross
++Hyper-Thread attacks are possible.
++
++Deeper technical information is available in the MDS specific x86
++architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
++
++
++Attack scenarios
++----------------
++
++Attacks against the MDS vulnerabilities can be mounted from malicious non
++privileged user space applications running on hosts or guests. Malicious
++guest OSes can obviously mount attacks as well.
++
++Contrary to other speculation based vulnerabilities the MDS vulnerability
++does not allow the attacker to control the memory target address. As a
++consequence the attacks are purely sampling based, but as demonstrated with
++the TLBleed attack samples can be postprocessed successfully.
++
++Web-Browsers
++^^^^^^^^^^^^
++
++ It's unclear whether attacks through Web-Browsers are possible at
++ all. The exploitation through Java-Script is considered very unlikely,
++ but other widely used web technologies like Webassembly could possibly be
++ abused.
++
++
++.. _mds_sys_info:
++
++MDS system information
++-----------------------
++
++The Linux kernel provides a sysfs interface to enumerate the current MDS
++status of the system: whether the system is vulnerable, and which
++mitigations are active. The relevant sysfs file is:
++
++/sys/devices/system/cpu/vulnerabilities/mds
++
++The possible values in this file are:
++
++ .. list-table::
++
++ * - 'Not affected'
++ - The processor is not vulnerable
++ * - 'Vulnerable'
++ - The processor is vulnerable, but no mitigation is enabled
++ * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
++ - The processor is vulnerable but microcode is not updated.
++
++ The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
++ * - 'Mitigation: Clear CPU buffers'
++ - The processor is vulnerable and the CPU buffer clearing mitigation is
++ enabled.
++
++If the processor is vulnerable then the following information is appended
++to the above information:
++
++ ======================== ============================================
++ 'SMT vulnerable' SMT is enabled
++ 'SMT mitigated' SMT is enabled and mitigated
++ 'SMT disabled' SMT is disabled
++ 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
++ ======================== ============================================
++
++.. _vmwerv:
++
++Best effort mitigation mode
++^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ If the processor is vulnerable, but the availability of the microcode based
++ mitigation mechanism is not advertised via CPUID the kernel selects a best
++ effort mitigation mode. This mode invokes the mitigation instructions
++ without a guarantee that they clear the CPU buffers.
++
++ This is done to address virtualization scenarios where the host has the
++ microcode update applied, but the hypervisor is not yet updated to expose
++ the CPUID to the guest. If the host has updated microcode the protection
++ takes effect, otherwise a few CPU cycles are wasted pointlessly.
++
++ The state in the mds sysfs file reflects this situation accordingly.
++
++
++Mitigation mechanism
++-------------------------
++
++The kernel detects the affected CPUs and the presence of the microcode
++which is required.
++
++If a CPU is affected and the microcode is available, then the kernel
++enables the mitigation by default. The mitigation can be controlled at boot
++time via a kernel command line option. See
++:ref:`mds_mitigation_control_command_line`.
++
++.. _cpu_buffer_clear:
++
++CPU buffer clearing
++^^^^^^^^^^^^^^^^^^^
++
++ The mitigation for MDS clears the affected CPU buffers on return to user
++ space and when entering a guest.
++
++ If SMT is enabled it also clears the buffers on idle entry when the CPU
++ is only affected by MSBDS and not any other MDS variant, because the
++ other variants cannot be protected against cross Hyper-Thread attacks.
++
++ For CPUs which are only affected by MSBDS the user space, guest and idle
++ transition mitigations are sufficient and SMT is not affected.
++
++.. _virt_mechanism:
++
++Virtualization mitigation
++^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ The protection for host to guest transition depends on the L1TF
++ vulnerability of the CPU:
++
++ - CPU is affected by L1TF:
++
++ If the L1D flush mitigation is enabled and up to date microcode is
++ available, the L1D flush mitigation is automatically protecting the
++ guest transition.
++
++ If the L1D flush mitigation is disabled then the MDS mitigation is
++ invoked explicitly when the host MDS mitigation is enabled.
++
++ For details on L1TF and virtualization see:
++ :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
++
++ - CPU is not affected by L1TF:
++
++ CPU buffers are flushed before entering the guest when the host MDS
++ mitigation is enabled.
++
++ The resulting MDS protection matrix for the host to guest transition:
++
++ ============ ===== ============= ============ =================
++ L1TF MDS VMX-L1FLUSH Host MDS MDS-State
++
++ Don't care No Don't care N/A Not affected
++
++ Yes Yes Disabled Off Vulnerable
++
++ Yes Yes Disabled Full Mitigated
++
++ Yes Yes Enabled Don't care Mitigated
++
++ No Yes N/A Off Vulnerable
++
++ No Yes N/A Full Mitigated
++ ============ ===== ============= ============ =================
++
++ This only covers the host to guest transition, i.e. prevents leakage from
++ host to guest, but does not protect the guest internally. Guests need to
++ have their own protections.
++
++.. _xeon_phi:
++
++XEON PHI specific considerations
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ The XEON PHI processor family is affected by MSBDS which can be exploited
++ cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
++ to use MWAIT in user space (Ring 3) which opens a potential attack vector
++ for malicious user space. The exposure can be disabled on the kernel
++ command line with the 'ring3mwait=disable' command line option.
++
++ XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
++ before the CPU enters an idle state. As XEON PHI is not affected by L1TF
++ either, disabling SMT is not required for full protection.
++
++.. _mds_smt_control:
++
++SMT control
++^^^^^^^^^^^
++
++ All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
++ means on CPUs which are affected by MFBDS or MLPDS it is necessary to
++ disable SMT for full protection. These are most of the affected CPUs; the
++ exception is XEON PHI, see :ref:`xeon_phi`.
++
++ Disabling SMT can have a significant performance impact, but the impact
++ depends on the type of workloads.
++
++ See the relevant chapter in the L1TF mitigation documentation for details:
++ :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.
++
++
++.. _mds_mitigation_control_command_line:
++
++Mitigation control on the kernel command line
++---------------------------------------------
++
++The kernel command line allows to control the MDS mitigations at boot
++time with the option "mds=". The valid arguments for this option are:
++
++ ============ =============================================================
++ full If the CPU is vulnerable, enable all available mitigations
++ for the MDS vulnerability, CPU buffer clearing on exit to
++ userspace and when entering a VM. Idle transitions are
++ protected as well if SMT is enabled.
++
++ It does not automatically disable SMT.
++
++ full,nosmt The same as mds=full, with SMT disabled on vulnerable
++ CPUs. This is the complete mitigation.
++
++ off Disables MDS mitigations completely.
++
++ ============ =============================================================
++
++Not specifying this option is equivalent to "mds=full".
++
++
++Mitigation selection guide
++--------------------------
++
++1. Trusted userspace
++^^^^^^^^^^^^^^^^^^^^
++
++ If all userspace applications are from a trusted source and do not
++ execute untrusted code which is supplied externally, then the mitigation
++ can be disabled.
++
++
++2. Virtualization with trusted guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ The same considerations as above versus trusted user space apply.
++
++3. Virtualization with untrusted guests
++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
++
++ The protection depends on the state of the L1TF mitigations.
++ See :ref:`virt_mechanism`.
++
++ If the MDS mitigation is enabled and SMT is disabled, guest to host and
++ guest to guest attacks are prevented.
++
++.. _mds_default_mitigations:
++
++Default mitigations
++-------------------
++
++ The kernel default mitigations for vulnerable processors are:
++
++ - Enable CPU buffer clearing
++
++ The kernel does not by default enforce the disabling of SMT, which leaves
++ SMT systems vulnerable when running untrusted code. The same rationale as
++ for L1TF applies.
++ See :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <default_mitigations>`.
+diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
+index 0a491676685e..42247516962a 100644
+--- a/Documentation/admin-guide/index.rst
++++ b/Documentation/admin-guide/index.rst
+@@ -17,14 +17,12 @@ etc.
+ kernel-parameters
+ devices
+
+-This section describes CPU vulnerabilities and provides an overview of the
+-possible mitigations along with guidance for selecting mitigations if they
+-are configurable at compile, boot or run time.
++This section describes CPU vulnerabilities and their mitigations.
+
+ .. toctree::
+ :maxdepth: 1
+
+- l1tf
++ hw-vuln/index
+
+ Here is a set of documents aimed at users who are trying to track down
+ problems and bugs in particular.
+diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
+index 2b8ee90bb644..c7937f379d22 100644
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -2141,7 +2141,7 @@
+
+ Default is 'flush'.
+
+- For details see: Documentation/admin-guide/l1tf.rst
++ For details see: Documentation/admin-guide/hw-vuln/l1tf.rst
+
+ l2cr= [PPC]
+
+@@ -2387,6 +2387,32 @@
+ Format: <first>,<last>
+ Specifies range of consoles to be captured by the MDA.
+
++ mds= [X86,INTEL]
++ Control mitigation for the Micro-architectural Data
++ Sampling (MDS) vulnerability.
++
++ Certain CPUs are vulnerable to an exploit against CPU
++ internal buffers which can forward information to a
++ disclosure gadget under certain conditions.
++
++ In vulnerable processors, the speculatively
++ forwarded data can be used in a cache side channel
++ attack, to access data to which the attacker does
++ not have direct access.
++
++ This parameter controls the MDS mitigation. The
++ options are:
++
++ full - Enable MDS mitigation on vulnerable CPUs
++ full,nosmt - Enable MDS mitigation and disable
++ SMT on vulnerable CPUs
++ off - Unconditionally disable MDS mitigation
++
++ Not specifying this option is equivalent to
++ mds=full.
++
++ For details see: Documentation/admin-guide/hw-vuln/mds.rst
++
+ mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
+ Amount of memory to be used when the kernel is not able
+ to see the whole system memory or for test.
+@@ -2544,6 +2570,40 @@
+ in the "bleeding edge" mini2440 support kernel at
+ http://repo.or.cz/w/linux-2.6/mini2440.git
+
++ mitigations=
++ [X86,PPC,S390] Control optional mitigations for CPU
++ vulnerabilities. This is a set of curated,
++ arch-independent options, each of which is an
++ aggregation of existing arch-specific options.
++
++ off
++ Disable all optional CPU mitigations. This
++ improves system performance, but it may also
++ expose users to several CPU vulnerabilities.
++ Equivalent to: nopti [X86,PPC]
++ nospectre_v1 [PPC]
++ nobp=0 [S390]
++ nospectre_v2 [X86,PPC,S390]
++ spectre_v2_user=off [X86]
++ spec_store_bypass_disable=off [X86,PPC]
++ l1tf=off [X86]
++ mds=off [X86]
++
++ auto (default)
++ Mitigate all CPU vulnerabilities, but leave SMT
++ enabled, even if it's vulnerable. This is for
++ users who don't want to be surprised by SMT
++ getting disabled across kernel upgrades, or who
++ have other ways of avoiding SMT-based attacks.
++ Equivalent to: (default behavior)
++
++ auto,nosmt
++ Mitigate all CPU vulnerabilities, disabling SMT
++ if needed. This is for users who always want to
++ be fully mitigated, even if it means losing SMT.
++ Equivalent to: l1tf=flush,nosmt [X86]
++ mds=full,nosmt [X86]
++
+ mminit_loglevel=
+ [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
+ parameter allows control of the logging verbosity for
+diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
+deleted file mode 100644
+index 9af977384168..000000000000
+--- a/Documentation/admin-guide/l1tf.rst
++++ /dev/null
+@@ -1,614 +0,0 @@
+-L1TF - L1 Terminal Fault
+-========================
+-
+-L1 Terminal Fault is a hardware vulnerability which allows unprivileged
+-speculative access to data which is available in the Level 1 Data Cache
+-when the page table entry controlling the virtual address, which is used
+-for the access, has the Present bit cleared or other reserved bits set.
+-
+-Affected processors
+--------------------
+-
+-This vulnerability affects a wide range of Intel processors. The
+-vulnerability is not present on:
+-
+- - Processors from AMD, Centaur and other non Intel vendors
+-
+- - Older processor models, where the CPU family is < 6
+-
+- - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
+- Penwell, Pineview, Silvermont, Airmont, Merrifield)
+-
+- - The Intel XEON PHI family
+-
+- - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
+- IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
+- by the Meltdown vulnerability either. These CPUs should become
+- available by end of 2018.
+-
+-Whether a processor is affected or not can be read out from the L1TF
+-vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
+-
+-Related CVEs
+-------------
+-
+-The following CVE entries are related to the L1TF vulnerability:
+-
+- ============= ================= ==============================
+- CVE-2018-3615 L1 Terminal Fault SGX related aspects
+- CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
+- CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
+- ============= ================= ==============================
+-
+-Problem
+--------
+-
+-If an instruction accesses a virtual address for which the relevant page
+-table entry (PTE) has the Present bit cleared or other reserved bits set,
+-then speculative execution ignores the invalid PTE and loads the referenced
+-data if it is present in the Level 1 Data Cache, as if the page referenced
+-by the address bits in the PTE was still present and accessible.
+-
+-While this is a purely speculative mechanism and the instruction will raise
+-a page fault when it is retired eventually, the pure act of loading the
+-data and making it available to other speculative instructions opens up the
+-opportunity for side channel attacks to unprivileged malicious code,
+-similar to the Meltdown attack.
+-
+-While Meltdown breaks the user space to kernel space protection, L1TF
+-allows to attack any physical memory address in the system and the attack
+-works across all protection domains. It allows an attack of SGX and also
+-works from inside virtual machines because the speculation bypasses the
+-extended page table (EPT) protection mechanism.
+-
+-
+-Attack scenarios
+-----------------
+-
+-1. Malicious user space
+-^^^^^^^^^^^^^^^^^^^^^^^
+-
+- Operating Systems store arbitrary information in the address bits of a
+- PTE which is marked non present. This allows a malicious user space
+- application to attack the physical memory to which these PTEs resolve.
+- In some cases user-space can maliciously influence the information
+- encoded in the address bits of the PTE, thus making attacks more
+- deterministic and more practical.
+-
+- The Linux kernel contains a mitigation for this attack vector, PTE
+- inversion, which is permanently enabled and has no performance
+- impact. The kernel ensures that the address bits of PTEs, which are not
+- marked present, never point to cacheable physical memory space.
+-
+- A system with an up to date kernel is protected against attacks from
+- malicious user space applications.
+-
+-2. Malicious guest in a virtual machine
+-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-
+- The fact that L1TF breaks all domain protections allows malicious guest
+- OSes, which can control the PTEs directly, and malicious guest user
+- space applications, which run on an unprotected guest kernel lacking the
+- PTE inversion mitigation for L1TF, to attack physical host memory.
+-
+- A special aspect of L1TF in the context of virtualization is symmetric
+- multi threading (SMT). The Intel implementation of SMT is called
+- HyperThreading. The fact that Hyperthreads on the affected processors
+- share the L1 Data Cache (L1D) is important for this. As the flaw allows
+- only to attack data which is present in L1D, a malicious guest running
+- on one Hyperthread can attack the data which is brought into the L1D by
+- the context which runs on the sibling Hyperthread of the same physical
+- core. This context can be host OS, host user space or a different guest.
+-
+- If the processor does not support Extended Page Tables, the attack is
+- only possible, when the hypervisor does not sanitize the content of the
+- effective (shadow) page tables.
+-
+- While solutions exist to mitigate these attack vectors fully, these
+- mitigations are not enabled by default in the Linux kernel because they
+- can affect performance significantly. The kernel provides several
+- mechanisms which can be utilized to address the problem depending on the
+- deployment scenario. The mitigations, their protection scope and impact
+- are described in the next sections.
+-
+- The default mitigations and the rationale for choosing them are explained
+- at the end of this document. See :ref:`default_mitigations`.
+-
+-.. _l1tf_sys_info:
+-
+-L1TF system information
+------------------------
+-
+-The Linux kernel provides a sysfs interface to enumerate the current L1TF
+-status of the system: whether the system is vulnerable, and which
+-mitigations are active. The relevant sysfs file is:
+-
+-/sys/devices/system/cpu/vulnerabilities/l1tf
+-
+-The possible values in this file are:
+-
+- =========================== ===============================
+- 'Not affected' The processor is not vulnerable
+- 'Mitigation: PTE Inversion' The host protection is active
+- =========================== ===============================
+-
+-If KVM/VMX is enabled and the processor is vulnerable then the following
+-information is appended to the 'Mitigation: PTE Inversion' part:
+-
+- - SMT status:
+-
+- ===================== ================
+- 'VMX: SMT vulnerable' SMT is enabled
+- 'VMX: SMT disabled' SMT is disabled
+- ===================== ================
+-
+- - L1D Flush mode:
+-
+- ================================ ====================================
+- 'L1D vulnerable' L1D flushing is disabled
+-
+- 'L1D conditional cache flushes' L1D flush is conditionally enabled
+-
+- 'L1D cache flushes' L1D flush is unconditionally enabled
+- ================================ ====================================
+-
+-The resulting grade of protection is discussed in the following sections.
+-
+-
+-Host mitigation mechanism
+--------------------------
+-
+-The kernel is unconditionally protected against L1TF attacks from malicious
+-user space running on the host.
+-
+-
+-Guest mitigation mechanisms
+----------------------------
+-
+-.. _l1d_flush:
+-
+-1. L1D flush on VMENTER
+-^^^^^^^^^^^^^^^^^^^^^^^
+-
+- To make sure that a guest cannot attack data which is present in the L1D
+- the hypervisor flushes the L1D before entering the guest.
+-
+- Flushing the L1D evicts not only the data which should not be accessed
+- by a potentially malicious guest, it also flushes the guest
+- data. Flushing the L1D has a performance impact as the processor has to
+- bring the flushed guest data back into the L1D. Depending on the
+- frequency of VMEXIT/VMENTER and the type of computations in the guest
+- performance degradation in the range of 1% to 50% has been observed. For
+- scenarios where guest VMEXIT/VMENTER are rare the performance impact is
+- minimal. Virtio and mechanisms like posted interrupts are designed to
+- confine the VMEXITs to a bare minimum, but specific configurations and
+- application scenarios might still suffer from a high VMEXIT rate.
+-
+- The kernel provides two L1D flush modes:
+- - conditional ('cond')
+- - unconditional ('always')
+-
+- The conditional mode avoids L1D flushing after VMEXITs which execute
+- only audited code paths before the corresponding VMENTER. These code
+- paths have been verified that they cannot expose secrets or other
+- interesting data to an attacker, but they can leak information about the
+- address space layout of the hypervisor.
+-
+- Unconditional mode flushes L1D on all VMENTER invocations and provides
+- maximum protection. It has a higher overhead than the conditional
+- mode. The overhead cannot be quantified correctly as it depends on the
+- workload scenario and the resulting number of VMEXITs.
+-
+- The general recommendation is to enable L1D flush on VMENTER. The kernel
+- defaults to conditional mode on affected processors.
+-
+- **Note**, that L1D flush does not prevent the SMT problem because the
+- sibling thread will also bring back its data into the L1D which makes it
+- attackable again.
+-
+- L1D flush can be controlled by the administrator via the kernel command
+- line and sysfs control files. See :ref:`mitigation_control_command_line`
+- and :ref:`mitigation_control_kvm`.
+-
+-.. _guest_confinement:
+-
+-2. Guest VCPU confinement to dedicated physical cores
+-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-
+- To address the SMT problem, it is possible to make a guest or a group of
+- guests affine to one or more physical cores. The proper mechanism for
+- that is to utilize exclusive cpusets to ensure that no other guest or
+- host tasks can run on these cores.
+-
+- If only a single guest or related guests run on sibling SMT threads on
+- the same physical core then they can only attack their own memory and
+- restricted parts of the host memory.
+-
+- Host memory is attackable, when one of the sibling SMT threads runs in
+- host OS (hypervisor) context and the other in guest context. The amount
+- of valuable information from the host OS context depends on the context
+- which the host OS executes, i.e. interrupts, soft interrupts and kernel
+- threads. The amount of valuable data from these contexts cannot be
+- declared as non-interesting for an attacker without deep inspection of
+- the code.
+-
+- **Note**, that assigning guests to a fixed set of physical cores affects
+- the ability of the scheduler to do load balancing and might have
+- negative effects on CPU utilization depending on the hosting
+- scenario. Disabling SMT might be a viable alternative for particular
+- scenarios.
+-
+- For further information about confining guests to a single or to a group
+- of cores consult the cpusets documentation:
+-
+- https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+-
+-.. _interrupt_isolation:
+-
+-3. Interrupt affinity
+-^^^^^^^^^^^^^^^^^^^^^
+-
+- Interrupts can be made affine to logical CPUs. This is not universally
+- true because there are types of interrupts which are truly per CPU
+- interrupts, e.g. the local timer interrupt. Aside of that multi queue
+- devices affine their interrupts to single CPUs or groups of CPUs per
+- queue without allowing the administrator to control the affinities.
+-
+- Moving the interrupts, which can be affinity controlled, away from CPUs
+- which run untrusted guests, reduces the attack vector space.
+-
+- Whether the interrupts with are affine to CPUs, which run untrusted
+- guests, provide interesting data for an attacker depends on the system
+- configuration and the scenarios which run on the system. While for some
+- of the interrupts it can be assumed that they won't expose interesting
+- information beyond exposing hints about the host OS memory layout, there
+- is no way to make general assumptions.
+-
+- Interrupt affinity can be controlled by the administrator via the
+- /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
+- available at:
+-
+- https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
+-
+-.. _smt_control:
+-
+-4. SMT control
+-^^^^^^^^^^^^^^
+-
+- To prevent the SMT issues of L1TF it might be necessary to disable SMT
+- completely. Disabling SMT can have a significant performance impact, but
+- the impact depends on the hosting scenario and the type of workloads.
+- The impact of disabling SMT needs also to be weighted against the impact
+- of other mitigation solutions like confining guests to dedicated cores.
+-
+- The kernel provides a sysfs interface to retrieve the status of SMT and
+- to control it. It also provides a kernel command line interface to
+- control SMT.
+-
+- The kernel command line interface consists of the following options:
+-
+- =========== ==========================================================
+- nosmt Affects the bring up of the secondary CPUs during boot. The
+- kernel tries to bring all present CPUs online during the
+- boot process. "nosmt" makes sure that from each physical
+- core only one - the so called primary (hyper) thread is
+- activated. Due to a design flaw of Intel processors related
+- to Machine Check Exceptions the non primary siblings have
+- to be brought up at least partially and are then shut down
+- again. "nosmt" can be undone via the sysfs interface.
+-
+- nosmt=force Has the same effect as "nosmt" but it does not allow to
+- undo the SMT disable via the sysfs interface.
+- =========== ==========================================================
+-
+- The sysfs interface provides two files:
+-
+- - /sys/devices/system/cpu/smt/control
+- - /sys/devices/system/cpu/smt/active
+-
+- /sys/devices/system/cpu/smt/control:
+-
+- This file allows to read out the SMT control state and provides the
+- ability to disable or (re)enable SMT. The possible states are:
+-
+- ============== ===================================================
+- on SMT is supported by the CPU and enabled. All
+- logical CPUs can be onlined and offlined without
+- restrictions.
+-
+- off SMT is supported by the CPU and disabled. Only
+- the so called primary SMT threads can be onlined
+- and offlined without restrictions. An attempt to
+- online a non-primary sibling is rejected
+-
+- forceoff Same as 'off' but the state cannot be controlled.
+- Attempts to write to the control file are rejected.
+-
+- notsupported The processor does not support SMT. It's therefore
+- not affected by the SMT implications of L1TF.
+- Attempts to write to the control file are rejected.
+- ============== ===================================================
+-
+- The possible states which can be written into this file to control SMT
+- state are:
+-
+- - on
+- - off
+- - forceoff
+-
+- /sys/devices/system/cpu/smt/active:
+-
+- This file reports whether SMT is enabled and active, i.e. if on any
+- physical core two or more sibling threads are online.
+-
+- SMT control is also possible at boot time via the l1tf kernel command
+- line parameter in combination with L1D flush control. See
+- :ref:`mitigation_control_command_line`.
+-
+-5. Disabling EPT
+-^^^^^^^^^^^^^^^^
+-
+- Disabling EPT for virtual machines provides full mitigation for L1TF even
+- with SMT enabled, because the effective page tables for guests are
+- managed and sanitized by the hypervisor. Though disabling EPT has a
+- significant performance impact especially when the Meltdown mitigation
+- KPTI is enabled.
+-
+- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+-
+-There is ongoing research and development for new mitigation mechanisms to
+-address the performance impact of disabling SMT or EPT.
+-
+-.. _mitigation_control_command_line:
+-
+-Mitigation control on the kernel command line
+----------------------------------------------
+-
+-The kernel command line allows to control the L1TF mitigations at boot
+-time with the option "l1tf=". The valid arguments for this option are:
+-
+- ============ =============================================================
+- full Provides all available mitigations for the L1TF
+- vulnerability. Disables SMT and enables all mitigations in
+- the hypervisors, i.e. unconditional L1D flushing
+-
+- SMT control and L1D flush control via the sysfs interface
+- is still possible after boot. Hypervisors will issue a
+- warning when the first VM is started in a potentially
+- insecure configuration, i.e. SMT enabled or L1D flush
+- disabled.
+-
+- full,force Same as 'full', but disables SMT and L1D flush runtime
+- control. Implies the 'nosmt=force' command line option.
+- (i.e. sysfs control of SMT is disabled.)
+-
+- flush Leaves SMT enabled and enables the default hypervisor
+- mitigation, i.e. conditional L1D flushing
+-
+- SMT control and L1D flush control via the sysfs interface
+- is still possible after boot. Hypervisors will issue a
+- warning when the first VM is started in a potentially
+- insecure configuration, i.e. SMT enabled or L1D flush
+- disabled.
+-
+- flush,nosmt Disables SMT and enables the default hypervisor mitigation,
+- i.e. conditional L1D flushing.
+-
+- SMT control and L1D flush control via the sysfs interface
+- is still possible after boot. Hypervisors will issue a
+- warning when the first VM is started in a potentially
+- insecure configuration, i.e. SMT enabled or L1D flush
+- disabled.
+-
+- flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
+- started in a potentially insecure configuration.
+-
+- off Disables hypervisor mitigations and doesn't emit any
+- warnings.
+- It also drops the swap size and available RAM limit restrictions
+- on both hypervisor and bare metal.
+-
+- ============ =============================================================
+-
+-The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
+-
+-
+-.. _mitigation_control_kvm:
+-
+-Mitigation control for KVM - module parameter
+--------------------------------------------------------------
+-
+-The KVM hypervisor mitigation mechanism, flushing the L1D cache when
+-entering a guest, can be controlled with a module parameter.
+-
+-The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
+-following arguments:
+-
+- ============ ==============================================================
+- always L1D cache flush on every VMENTER.
+-
+- cond Flush L1D on VMENTER only when the code between VMEXIT and
+- VMENTER can leak host memory which is considered
+- interesting for an attacker. This still can leak host memory
+- which allows e.g. to determine the hosts address space layout.
+-
+- never Disables the mitigation
+- ============ ==============================================================
+-
+-The parameter can be provided on the kernel command line, as a module
+-parameter when loading the modules and at runtime modified via the sysfs
+-file:
+-
+-/sys/module/kvm_intel/parameters/vmentry_l1d_flush
+-
+-The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
+-line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
+-module parameter is ignored and writes to the sysfs file are rejected.
+-
+-
+-Mitigation selection guide
+---------------------------
+-
+-1. No virtualization in use
+-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-
+- The system is protected by the kernel unconditionally and no further
+- action is required.
+-
+-2. Virtualization with trusted guests
+-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-
+- If the guest comes from a trusted source and the guest OS kernel is
+- guaranteed to have the L1TF mitigations in place the system is fully
+- protected against L1TF and no further action is required.
+-
+- To avoid the overhead of the default L1D flushing on VMENTER the
+- administrator can disable the flushing via the kernel command line and
+- sysfs control files. See :ref:`mitigation_control_command_line` and
+- :ref:`mitigation_control_kvm`.
+-
+-
+-3. Virtualization with untrusted guests
+-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-
+-3.1. SMT not supported or disabled
+-""""""""""""""""""""""""""""""""""
+-
+- If SMT is not supported by the processor or disabled in the BIOS or by
+- the kernel, it's only required to enforce L1D flushing on VMENTER.
+-
+- Conditional L1D flushing is the default behaviour and can be tuned. See
+- :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+-
+-3.2. EPT not supported or disabled
+-""""""""""""""""""""""""""""""""""
+-
+- If EPT is not supported by the processor or disabled in the hypervisor,
+- the system is fully protected. SMT can stay enabled and L1D flushing on
+- VMENTER is not required.
+-
+- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+-
+-3.3. SMT and EPT supported and active
+-"""""""""""""""""""""""""""""""""""""
+-
+- If SMT and EPT are supported and active then various degrees of
+- mitigations can be employed:
+-
+- - L1D flushing on VMENTER:
+-
+- L1D flushing on VMENTER is the minimal protection requirement, but it
+- is only potent in combination with other mitigation methods.
+-
+- Conditional L1D flushing is the default behaviour and can be tuned. See
+- :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+-
+- - Guest confinement:
+-
+- Confinement of guests to a single or a group of physical cores which
+- are not running any other processes, can reduce the attack surface
+- significantly, but interrupts, soft interrupts and kernel threads can
+- still expose valuable data to a potential attacker. See
+- :ref:`guest_confinement`.
+-
+- - Interrupt isolation:
+-
+- Isolating the guest CPUs from interrupts can reduce the attack surface
+- further, but still allows a malicious guest to explore a limited amount
+- of host physical memory. This can at least be used to gain knowledge
+- about the host address space layout. The interrupts which have a fixed
+- affinity to the CPUs which run the untrusted guests can depending on
+- the scenario still trigger soft interrupts and schedule kernel threads
+- which might expose valuable information. See
+- :ref:`interrupt_isolation`.
+-
+-The above three mitigation methods combined can provide protection to a
+-certain degree, but the risk of the remaining attack surface has to be
+-carefully analyzed. For full protection the following methods are
+-available:
+-
+- - Disabling SMT:
+-
+- Disabling SMT and enforcing the L1D flushing provides the maximum
+- amount of protection. This mitigation is not depending on any of the
+- above mitigation methods.
+-
+- SMT control and L1D flushing can be tuned by the command line
+- parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
+- time with the matching sysfs control files. See :ref:`smt_control`,
+- :ref:`mitigation_control_command_line` and
+- :ref:`mitigation_control_kvm`.
+-
+- - Disabling EPT:
+-
+- Disabling EPT provides the maximum amount of protection as well. It is
+- not depending on any of the above mitigation methods. SMT can stay
+- enabled and L1D flushing is not required, but the performance impact is
+- significant.
+-
+- EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
+- parameter.
+-
+-3.4. Nested virtual machines
+-""""""""""""""""""""""""""""
+-
+-When nested virtualization is in use, three operating systems are involved:
+-the bare metal hypervisor, the nested hypervisor and the nested virtual
+-machine. VMENTER operations from the nested hypervisor into the nested
+-guest will always be processed by the bare metal hypervisor. If KVM is the
+-bare metal hypervisor it will:
+-
+- - Flush the L1D cache on every switch from the nested hypervisor to the
+- nested virtual machine, so that the nested hypervisor's secrets are not
+- exposed to the nested virtual machine;
+-
+- - Flush the L1D cache on every switch from the nested virtual machine to
+- the nested hypervisor; this is a complex operation, and flushing the L1D
+- cache avoids that the bare metal hypervisor's secrets are exposed to the
+- nested virtual machine;
+-
+- - Instruct the nested hypervisor to not perform any L1D cache flush. This
+- is an optimization to avoid double L1D flushing.
+-
+-
+-.. _default_mitigations:
+-
+-Default mitigations
+--------------------
+-
+- The kernel default mitigations for vulnerable processors are:
+-
+- - PTE inversion to protect against malicious user space. This is done
+- unconditionally and cannot be controlled. The swap storage is limited
+- to ~16TB.
+-
+- - L1D conditional flushing on VMENTER when EPT is enabled for
+- a guest.
+-
+- The kernel does not by default enforce the disabling of SMT, which leaves
+- SMT systems vulnerable when running untrusted guests with EPT enabled.
+-
+- The rationale for this choice is:
+-
+- - Force disabling SMT can break existing setups, especially with
+- unattended updates.
+-
+- - If regular users run untrusted guests on their machine, then L1TF is
+- just an add on to other malware which might be embedded in an untrusted
+- guest, e.g. spam-bots or attacks on the local network.
+-
+- There is no technical way to prevent a user from running untrusted code
+- on their machines blindly.
+-
+- - It's technically extremely unlikely and from today's knowledge even
+- impossible that L1TF can be exploited via the most popular attack
+- mechanisms like JavaScript because these mechanisms have no way to
+- control PTEs. If this would be possible and not other mitigation would
+- be possible, then the default might be different.
+-
+- - The administrators of cloud and hosting setups have to carefully
+- analyze the risk for their scenarios and make the appropriate
+- mitigation choices, which might even vary across their deployed
+- machines and also result in other changes of their overall setup.
+- There is no way for the kernel to provide a sensible default for this
+- kind of scenarios.
+diff --git a/Documentation/index.rst b/Documentation/index.rst
+index 80a421cb935e..3511400dc092 100644
+--- a/Documentation/index.rst
++++ b/Documentation/index.rst
+@@ -102,6 +102,7 @@ implementation.
+ :maxdepth: 2
+
+ sh/index
++ x86/index
+
+ Filesystem Documentation
+ ------------------------
+diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py
+new file mode 100644
+index 000000000000..33c5c3142e20
+--- /dev/null
++++ b/Documentation/x86/conf.py
+@@ -0,0 +1,10 @@
++# -*- coding: utf-8; mode: python -*-
++
++project = "X86 architecture specific documentation"
++
++tags.add("subproject")
++
++latex_documents = [
++ ('index', 'x86.tex', project,
++ 'The kernel development community', 'manual'),
++]
+diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
+new file mode 100644
+index 000000000000..ef389dcf1b1d
+--- /dev/null
++++ b/Documentation/x86/index.rst
+@@ -0,0 +1,8 @@
++==========================
++x86 architecture specifics
++==========================
++
++.. toctree::
++ :maxdepth: 1
++
++ mds
+diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst
+new file mode 100644
+index 000000000000..534e9baa4e1d
+--- /dev/null
++++ b/Documentation/x86/mds.rst
+@@ -0,0 +1,225 @@
++Microarchitectural Data Sampling (MDS) mitigation
++=================================================
++
++.. _mds:
++
++Overview
++--------
++
++Microarchitectural Data Sampling (MDS) is a family of side channel attacks
++on internal buffers in Intel CPUs. The variants are:
++
++ - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
++ - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
++ - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)
++ - Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091)
++
++MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
++dependent load (store-to-load forwarding) as an optimization. The forward
++can also happen to a faulting or assisting load operation for a different
++memory address, which can be exploited under certain conditions. Store
++buffers are partitioned between Hyper-Threads so cross thread forwarding is
++not possible. But if a thread enters or exits a sleep state the store
++buffer is repartitioned which can expose data from one thread to the other.
++
++MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
++L1 miss situations and to hold data which is returned or sent in response
++to a memory or I/O operation. Fill buffers can forward data to a load
++operation and also write data to the cache. When the fill buffer is
++deallocated it can retain the stale data of the preceding operations which
++can then be forwarded to a faulting or assisting load operation, which can
++be exploited under certain conditions. Fill buffers are shared between
++Hyper-Threads so cross thread leakage is possible.
++
++MLPDS leaks Load Port Data. Load ports are used to perform load operations
++from memory or I/O. The received data is then forwarded to the register
++file or a subsequent operation. In some implementations the Load Port can
++contain stale data from a previous operation which can be forwarded to
++faulting or assisting loads under certain conditions, which again can be
++exploited eventually. Load ports are shared between Hyper-Threads so cross
++thread leakage is possible.
++
++MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from
++memory that takes a fault or assist can leave data in a microarchitectural
++structure that may later be observed using one of the same methods used by
++MSBDS, MFBDS or MLPDS.
++
++Exposure assumptions
++--------------------
++
++It is assumed that attack code resides in user space or in a guest with one
++exception. The rationale behind this assumption is that the code construct
++needed for exploiting MDS requires:
++
++ - to control the load to trigger a fault or assist
++
++ - to have a disclosure gadget which exposes the speculatively accessed
++ data for consumption through a side channel.
++
++ - to control the pointer through which the disclosure gadget exposes the
++ data
++
++The existence of such a construct in the kernel cannot be excluded with
++100% certainty, but the complexity involved makes it extremely unlikely.
++
++There is one exception, which is untrusted BPF. The functionality of
++untrusted BPF is limited, but it needs to be thoroughly investigated
++whether it can be used to create such a construct.
++
++
++Mitigation strategy
++-------------------
++
++All variants have the same mitigation strategy at least for the single CPU
++thread case (SMT off): Force the CPU to clear the affected buffers.
++
++This is achieved by using the otherwise unused and obsolete VERW
++instruction in combination with a microcode update. The microcode clears
++the affected CPU buffers when the VERW instruction is executed.
++
++For virtualization there are two ways to achieve CPU buffer
++clearing. Either the modified VERW instruction or via the L1D Flush
++command. The latter is issued when L1TF mitigation is enabled so the extra
++VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to
++be issued.
++
++If the VERW instruction with the supplied segment selector argument is
++executed on a CPU without the microcode update there is no side effect
++other than a small number of pointlessly wasted CPU cycles.
++
++This does not protect against cross Hyper-Thread attacks except for MSBDS
++which is only exploitable cross Hyper-thread when one of the Hyper-Threads
++enters a C-state.
++
++The kernel provides a function to invoke the buffer clearing:
++
++ mds_clear_cpu_buffers()
++
++The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
++(idle) transitions.
++
++As a special quirk to address virtualization scenarios where the host has
++the microcode updated, but the hypervisor does not (yet) expose the
++MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
++hope that it might actually clear the buffers. The state is reflected
++accordingly.
++
++According to current knowledge additional mitigations inside the kernel
++itself are not required because the necessary gadgets to expose the leaked
++data cannot be controlled in a way which allows exploitation from malicious
++user space or VM guests.
++
++Kernel internal mitigation modes
++--------------------------------
++
++ ======= ============================================================
++ off Mitigation is disabled. Either the CPU is not affected or
++ mds=off is supplied on the kernel command line
++
++ full Mitigation is enabled. CPU is affected and MD_CLEAR is
++ advertised in CPUID.
++
++ vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not
++ advertised in CPUID. That is mainly for virtualization
++ scenarios where the host has the updated microcode but the
++ hypervisor does not expose MD_CLEAR in CPUID. It's a best
++ effort approach without guarantee.
++ ======= ============================================================
++
++If the CPU is affected and mds=off is not supplied on the kernel command
++line then the kernel selects the appropriate mitigation mode depending on
++the availability of the MD_CLEAR CPUID bit.
++
++Mitigation points
++-----------------
++
++1. Return to user space
++^^^^^^^^^^^^^^^^^^^^^^^
++
++ When transitioning from kernel to user space the CPU buffers are flushed
++ on affected CPUs when the mitigation is not disabled on the kernel
++ command line. The mitigation is enabled through the static key
++ mds_user_clear.
++
++ The mitigation is invoked in prepare_exit_to_usermode() which covers
++ most of the kernel to user space transitions. There are a few exceptions
++ which are not invoking prepare_exit_to_usermode() on return to user
++ space. These exceptions use the paranoid exit code.
++
++ - Non Maskable Interrupt (NMI):
++
++ Access to sensitive data like keys, credentials in the NMI context is
++ mostly theoretical: The CPU can do prefetching or execute a
++ misspeculated code path and thereby fetching data which might end up
++ leaking through a buffer.
++
++ But for mounting other attacks the kernel stack address of the task is
++ already valuable information. So in full mitigation mode, the NMI is
++ mitigated on the return from do_nmi() to provide almost complete
++ coverage.
++
++ - Double fault (#DF):
++
++ A double fault is usually fatal, but the ESPFIX workaround, which can
++ be triggered from user space through modify_ldt(2) is a recoverable
++ double fault. #DF uses the paranoid exit path, so explicit mitigation
++ in the double fault handler is required.
++
++ - Machine Check Exception (#MC):
++
++ Another corner case is a #MC which hits between the CPU buffer clear
++ invocation and the actual return to user. As this still is in kernel
++ space it takes the paranoid exit path which does not clear the CPU
++ buffers. So the #MC handler repopulates the buffers to some
++ extent. Machine checks are not reliably controllable and the window is
++ extremely small so mitigation would just tick a checkbox that this
++ theoretical corner case is covered. To keep the amount of special
++ cases small, ignore #MC.
++
++ - Debug Exception (#DB):
++
++ This takes the paranoid exit path only when the INT1 breakpoint is in
++ kernel space. #DB on a user space address takes the regular exit path,
++ so no extra mitigation required.
++
++
++2. C-State transition
++^^^^^^^^^^^^^^^^^^^^^
++
++ When a CPU goes idle and enters a C-State the CPU buffers need to be
++ cleared on affected CPUs when SMT is active. This addresses the
++ repartitioning of the store buffer when one of the Hyper-Threads enters
++ a C-State.
++
++ When SMT is inactive, i.e. either the CPU does not support it or all
++ sibling threads are offline, CPU buffer clearing is not required.
++
++ The idle clearing is enabled on CPUs which are only affected by MSBDS
++ and not by any other MDS variant. The other MDS variants cannot be
++ protected against cross Hyper-Thread attacks because the Fill Buffer and
++ the Load Ports are shared. So on CPUs affected by other variants, the
++ idle clearing would be a window dressing exercise and is therefore not
++ activated.
++
++ The invocation is controlled by the static key mds_idle_clear which is
++ switched depending on the chosen mitigation mode and the SMT state of
++ the system.
++
++ The buffer clear is only invoked before entering the C-State to prevent
++ stale data from the idling CPU from spilling to the Hyper-Thread
++ sibling after the store buffer got repartitioned and all entries are
++ available to the non idle sibling.
++
++ When coming out of idle the store buffer is partitioned again so each
++ sibling has half of it available. The CPU coming back from idle could
++ then be speculatively exposed to contents of the sibling. The buffers are
++ flushed either on exit to user space or on VMENTER so malicious code
++ in user space or the guest cannot speculatively access them.
++
++ The mitigation is hooked into all variants of halt()/mwait(), but does
++ not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
++ has been superseded by the intel_idle driver around 2010 and is
++ preferred on all affected CPUs which are expected to gain the MD_CLEAR
++ functionality in microcode. Aside of that the IO-Port mechanism is a
++ legacy interface which is only used on older systems which are either
++ not affected or do not receive microcode updates anymore.
+diff --git a/Makefile b/Makefile
+index bf604f77e5e5..58ec07990e76 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 5
+ PATCHLEVEL = 1
+-SUBLEVEL = 1
++SUBLEVEL = 2
+ EXTRAVERSION =
+ NAME = Shy Crocodile
+
+diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
+index b33bafb8fcea..70568ccbd9fd 100644
+--- a/arch/powerpc/kernel/security.c
++++ b/arch/powerpc/kernel/security.c
+@@ -57,7 +57,7 @@ void setup_barrier_nospec(void)
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR);
+
+- if (!no_nospec)
++ if (!no_nospec && !cpu_mitigations_off())
+ enable_barrier_nospec(enable);
+ }
+
+@@ -116,7 +116,7 @@ static int __init handle_nospectre_v2(char *p)
+ early_param("nospectre_v2", handle_nospectre_v2);
+ void setup_spectre_v2(void)
+ {
+- if (no_spectrev2)
++ if (no_spectrev2 || cpu_mitigations_off())
+ do_btb_flush_fixups();
+ else
+ btb_flush_enabled = true;
+@@ -300,7 +300,7 @@ void setup_stf_barrier(void)
+
+ stf_enabled_flush_types = type;
+
+- if (!no_stf_barrier)
++ if (!no_stf_barrier && !cpu_mitigations_off())
+ stf_barrier_enable(enable);
+ }
+
+diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
+index ba404dd9ce1d..4f49e1a3594c 100644
+--- a/arch/powerpc/kernel/setup_64.c
++++ b/arch/powerpc/kernel/setup_64.c
+@@ -932,7 +932,7 @@ void setup_rfi_flush(enum l1d_flush_type types, bool enable)
+
+ enabled_flush_types = types;
+
+- if (!no_rfi_flush)
++ if (!no_rfi_flush && !cpu_mitigations_off())
+ rfi_flush_enable(enable);
+ }
+
+diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
+index bdddaae96559..649135cbedd5 100644
+--- a/arch/s390/kernel/nospec-branch.c
++++ b/arch/s390/kernel/nospec-branch.c
+@@ -1,6 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
+ #include <linux/module.h>
+ #include <linux/device.h>
++#include <linux/cpu.h>
+ #include <asm/nospec-branch.h>
+
+ static int __init nobp_setup_early(char *str)
+@@ -58,7 +59,7 @@ early_param("nospectre_v2", nospectre_v2_setup_early);
+
+ void __init nospec_auto_detect(void)
+ {
+- if (test_facility(156)) {
++ if (test_facility(156) || cpu_mitigations_off()) {
+ /*
+ * The machine supports etokens.
+ * Disable expolines and disable nobp.
+diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
+index 7bc105f47d21..19f650d729f5 100644
+--- a/arch/x86/entry/common.c
++++ b/arch/x86/entry/common.c
+@@ -31,6 +31,7 @@
+ #include <asm/vdso.h>
+ #include <linux/uaccess.h>
+ #include <asm/cpufeature.h>
++#include <asm/nospec-branch.h>
+
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/syscalls.h>
+@@ -212,6 +213,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+ #endif
+
+ user_enter_irqoff();
++
++ mds_user_clear_cpu_buffers();
+ }
+
+ #define SYSCALL_EXIT_WORK_FLAGS \
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index 981ff9479648..75f27ee2c263 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -344,6 +344,7 @@
+ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
+ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
+ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
++#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
+ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
+ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
+ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
+@@ -382,5 +383,7 @@
+ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
+ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
++#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
++#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */
+
+ #endif /* _ASM_X86_CPUFEATURES_H */
+diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
+index 058e40fed167..8a0e56e1dcc9 100644
+--- a/arch/x86/include/asm/irqflags.h
++++ b/arch/x86/include/asm/irqflags.h
+@@ -6,6 +6,8 @@
+
+ #ifndef __ASSEMBLY__
+
++#include <asm/nospec-branch.h>
++
+ /* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+ #define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
+@@ -54,11 +56,13 @@ static inline void native_irq_enable(void)
+
+ static inline __cpuidle void native_safe_halt(void)
+ {
++ mds_idle_clear_cpu_buffers();
+ asm volatile("sti; hlt": : :"memory");
+ }
+
+ static inline __cpuidle void native_halt(void)
+ {
++ mds_idle_clear_cpu_buffers();
+ asm volatile("hlt": : :"memory");
+ }
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index ca5bc0eacb95..20f7da552e90 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -2,6 +2,8 @@
+ #ifndef _ASM_X86_MSR_INDEX_H
+ #define _ASM_X86_MSR_INDEX_H
+
++#include <linux/bits.h>
++
+ /*
+ * CPU model specific register (MSR) numbers.
+ *
+@@ -40,14 +42,14 @@
+ /* Intel MSRs. Some also available on other CPUs */
+
+ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
+-#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
++#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
+ #define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
+-#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
++#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
+ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
+-#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
++#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
+
+ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
+-#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
++#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
+
+ #define MSR_PPIN_CTL 0x0000004e
+ #define MSR_PPIN 0x0000004f
+@@ -69,20 +71,25 @@
+ #define MSR_MTRRcap 0x000000fe
+
+ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
+-#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
+-#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
+-#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
+-#define ARCH_CAP_SSB_NO (1 << 4) /*
+- * Not susceptible to Speculative Store Bypass
+- * attack, so no Speculative Store Bypass
+- * control required.
+- */
++#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
++#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
++#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
++#define ARCH_CAP_SSB_NO BIT(4) /*
++ * Not susceptible to Speculative Store Bypass
++ * attack, so no Speculative Store Bypass
++ * control required.
++ */
++#define ARCH_CAP_MDS_NO BIT(5) /*
++ * Not susceptible to
++ * Microarchitectural Data
++ * Sampling (MDS) vulnerabilities.
++ */
+
+ #define MSR_IA32_FLUSH_CMD 0x0000010b
+-#define L1D_FLUSH (1 << 0) /*
+- * Writeback and invalidate the
+- * L1 data cache.
+- */
++#define L1D_FLUSH BIT(0) /*
++ * Writeback and invalidate the
++ * L1 data cache.
++ */
+
+ #define MSR_IA32_BBL_CR_CTL 0x00000119
+ #define MSR_IA32_BBL_CR_CTL3 0x0000011e
+diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
+index 39a2fb29378a..eb0f80ce8524 100644
+--- a/arch/x86/include/asm/mwait.h
++++ b/arch/x86/include/asm/mwait.h
+@@ -6,6 +6,7 @@
+ #include <linux/sched/idle.h>
+
+ #include <asm/cpufeature.h>
++#include <asm/nospec-branch.h>
+
+ #define MWAIT_SUBSTATE_MASK 0xf
+ #define MWAIT_CSTATE_MASK 0xf
+@@ -40,6 +41,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
+
+ static inline void __mwait(unsigned long eax, unsigned long ecx)
+ {
++ mds_idle_clear_cpu_buffers();
++
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+@@ -74,6 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
+ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+ unsigned long ecx)
+ {
++ /* No MDS buffer clear as this is AMD/HYGON only */
++
+ /* "mwaitx %eax, %ebx, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xfb;"
+ :: "a" (eax), "b" (ebx), "c" (ecx));
+@@ -81,6 +86,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+
+ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+ {
++ mds_idle_clear_cpu_buffers();
++
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
+index dad12b767ba0..4e970390110f 100644
+--- a/arch/x86/include/asm/nospec-branch.h
++++ b/arch/x86/include/asm/nospec-branch.h
+@@ -318,6 +318,56 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
++DECLARE_STATIC_KEY_FALSE(mds_user_clear);
++DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
++
++#include <asm/segment.h>
++
++/**
++ * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
++ *
++ * This uses the otherwise unused and obsolete VERW instruction in
++ * combination with microcode which triggers a CPU buffer flush when the
++ * instruction is executed.
++ */
++static inline void mds_clear_cpu_buffers(void)
++{
++ static const u16 ds = __KERNEL_DS;
++
++ /*
++ * Has to be the memory-operand variant because only that
++ * guarantees the CPU buffer flush functionality according to
++ * documentation. The register-operand variant does not.
++ * Works with any segment selector, but a valid writable
++ * data segment is the fastest variant.
++ *
++ * "cc" clobber is required because VERW modifies ZF.
++ */
++ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
++}
++
++/**
++ * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
++ *
++ * Clear CPU buffers if the corresponding static key is enabled
++ */
++static inline void mds_user_clear_cpu_buffers(void)
++{
++ if (static_branch_likely(&mds_user_clear))
++ mds_clear_cpu_buffers();
++}
++
++/**
++ * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
++ *
++ * Clear CPU buffers if the corresponding static key is enabled
++ */
++static inline void mds_idle_clear_cpu_buffers(void)
++{
++ if (static_branch_likely(&mds_idle_clear))
++ mds_clear_cpu_buffers();
++}
++
+ #endif /* __ASSEMBLY__ */
+
+ /*
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index 2bb3a648fc12..31e9895db75e 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -991,4 +991,10 @@ enum l1tf_mitigations {
+
+ extern enum l1tf_mitigations l1tf_mitigation;
+
++enum mds_mitigations {
++ MDS_MITIGATION_OFF,
++ MDS_MITIGATION_FULL,
++ MDS_MITIGATION_VMWERV,
++};
++
+ #endif /* _ASM_X86_PROCESSOR_H */
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index b91b3bfa5cfb..03b4cc0ec3a7 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -37,6 +37,7 @@
+ static void __init spectre_v2_select_mitigation(void);
+ static void __init ssb_select_mitigation(void);
+ static void __init l1tf_select_mitigation(void);
++static void __init mds_select_mitigation(void);
+
+ /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+ u64 x86_spec_ctrl_base;
+@@ -63,6 +64,13 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+ /* Control unconditional IBPB in switch_mm() */
+ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
++/* Control MDS CPU buffer clear before returning to user space */
++DEFINE_STATIC_KEY_FALSE(mds_user_clear);
++EXPORT_SYMBOL_GPL(mds_user_clear);
++/* Control MDS CPU buffer clear before idling (halt, mwait) */
++DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
++EXPORT_SYMBOL_GPL(mds_idle_clear);
++
+ void __init check_bugs(void)
+ {
+ identify_boot_cpu();
+@@ -101,6 +109,10 @@ void __init check_bugs(void)
+
+ l1tf_select_mitigation();
+
++ mds_select_mitigation();
++
++ arch_smt_update();
++
+ #ifdef CONFIG_X86_32
+ /*
+ * Check whether we are able to run this kernel safely on SMP.
+@@ -206,6 +218,61 @@ static void x86_amd_ssb_disable(void)
+ wrmsrl(MSR_AMD64_LS_CFG, msrval);
+ }
+
++#undef pr_fmt
++#define pr_fmt(fmt) "MDS: " fmt
++
++/* Default mitigation for MDS-affected CPUs */
++static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
++static bool mds_nosmt __ro_after_init = false;
++
++static const char * const mds_strings[] = {
++ [MDS_MITIGATION_OFF] = "Vulnerable",
++ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
++ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
++};
++
++static void __init mds_select_mitigation(void)
++{
++ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
++ mds_mitigation = MDS_MITIGATION_OFF;
++ return;
++ }
++
++ if (mds_mitigation == MDS_MITIGATION_FULL) {
++ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
++ mds_mitigation = MDS_MITIGATION_VMWERV;
++
++ static_branch_enable(&mds_user_clear);
++
++ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
++ (mds_nosmt || cpu_mitigations_auto_nosmt()))
++ cpu_smt_disable(false);
++ }
++
++ pr_info("%s\n", mds_strings[mds_mitigation]);
++}
++
++static int __init mds_cmdline(char *str)
++{
++ if (!boot_cpu_has_bug(X86_BUG_MDS))
++ return 0;
++
++ if (!str)
++ return -EINVAL;
++
++ if (!strcmp(str, "off"))
++ mds_mitigation = MDS_MITIGATION_OFF;
++ else if (!strcmp(str, "full"))
++ mds_mitigation = MDS_MITIGATION_FULL;
++ else if (!strcmp(str, "full,nosmt")) {
++ mds_mitigation = MDS_MITIGATION_FULL;
++ mds_nosmt = true;
++ }
++
++ return 0;
++}
++early_param("mds", mds_cmdline);
++
+ #undef pr_fmt
+ #define pr_fmt(fmt) "Spectre V2 : " fmt
+
+@@ -440,7 +507,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+ char arg[20];
+ int ret, i;
+
+- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
++ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
++ cpu_mitigations_off())
+ return SPECTRE_V2_CMD_NONE;
+
+ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
+@@ -574,9 +642,6 @@ specv2_set_mode:
+
+ /* Set up IBPB and STIBP depending on the general spectre V2 command */
+ spectre_v2_user_select_mitigation(cmd);
+-
+- /* Enable STIBP if appropriate */
+- arch_smt_update();
+ }
+
+ static void update_stibp_msr(void * __unused)
+@@ -610,6 +675,31 @@ static void update_indir_branch_cond(void)
+ static_branch_disable(&switch_to_cond_stibp);
+ }
+
++#undef pr_fmt
++#define pr_fmt(fmt) fmt
++
++/* Update the static key (mds_idle_clear) controlling the MDS CPU buffer clear in idle */
++static void update_mds_branch_idle(void)
++{
++ /*
++ * Enable the idle clearing if SMT is active on CPUs which are
++ * affected only by MSBDS and not any other MDS variant.
++ *
++ * The other variants cannot be mitigated when SMT is enabled, so
++ * clearing the buffers on idle just to prevent the Store Buffer
++ * repartitioning leak would be a window dressing exercise.
++ */
++ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
++ return; /* the key is left as it is on every other CPU */
++
++ if (sched_smt_active()) /* the key follows the current SMT state */
++ static_branch_enable(&mds_idle_clear);
++ else
++ static_branch_disable(&mds_idle_clear);
++}
++
++#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
++
+ void arch_smt_update(void)
+ {
+ /* Enhanced IBRS implies STIBP. No update required. */
+@@ -631,6 +721,17 @@ void arch_smt_update(void)
+ break;
+ }
+
++ switch (mds_mitigation) {
++ case MDS_MITIGATION_FULL:
++ case MDS_MITIGATION_VMWERV:
++ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
++ pr_warn_once(MDS_MSG_SMT);
++ update_mds_branch_idle();
++ break;
++ case MDS_MITIGATION_OFF:
++ break;
++ }
++
+ mutex_unlock(&spec_ctrl_mutex);
+ }
+
+@@ -672,7 +773,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+ char arg[20];
+ int ret, i;
+
+- if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
++ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
++ cpu_mitigations_off()) {
+ return SPEC_STORE_BYPASS_CMD_NONE;
+ } else {
+ ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+@@ -1008,6 +1110,11 @@ static void __init l1tf_select_mitigation(void)
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
++ if (cpu_mitigations_off())
++ l1tf_mitigation = L1TF_MITIGATION_OFF;
++ else if (cpu_mitigations_auto_nosmt())
++ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
++
+ override_cache_bits(&boot_cpu_data);
+
+ switch (l1tf_mitigation) {
+@@ -1036,7 +1143,7 @@ static void __init l1tf_select_mitigation(void)
+ pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
+ half_pa);
+ pr_info("However, doing so will make a part of your RAM unusable.\n");
+- pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
++ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
+ return;
+ }
+
+@@ -1069,6 +1176,7 @@ static int __init l1tf_cmdline(char *str)
+ early_param("l1tf", l1tf_cmdline);
+
+ #undef pr_fmt
++#define pr_fmt(fmt) fmt
+
+ #ifdef CONFIG_SYSFS
+
+@@ -1107,6 +1215,23 @@ static ssize_t l1tf_show_state(char *buf)
+ }
+ #endif
+
++static ssize_t mds_show_state(char *buf) /* formats the MDS status line into buf; returns sprintf()'s result */
++{
++ if (!hypervisor_is_type(X86_HYPER_NATIVE)) { /* not native: host SMT state is reported as unknown */
++ return sprintf(buf, "%s; SMT Host state unknown\n",
++ mds_strings[mds_mitigation]);
++ }
++ /* MSBDS-only CPUs: "vulnerable" with the mitigation off, else "mitigated"/"disabled" by SMT state. */
++ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
++ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
++ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
++ sched_smt_active() ? "mitigated" : "disabled"));
++ }
++ /* All other CPUs: "vulnerable" while SMT is active, "disabled" otherwise. */
++ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
++ sched_smt_active() ? "vulnerable" : "disabled");
++}
++
+ static char *stibp_state(void)
+ {
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+@@ -1173,6 +1298,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
++
++ case X86_BUG_MDS:
++ return mds_show_state(buf);
++
+ default:
+ break;
+ }
+@@ -1204,4 +1333,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b
+ {
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+ }
++/* Show routine for the "mds" attribute; delegates to cpu_show_common() with X86_BUG_MDS. */
++ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
++}
+ #endif
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index cb28e98a0659..132a63dc5a76 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -948,61 +948,77 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+ #endif
+ }
+
+-static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY },
+- { X86_VENDOR_CENTAUR, 5 },
+- { X86_VENDOR_INTEL, 5 },
+- { X86_VENDOR_NSC, 5 },
+- { X86_VENDOR_ANY, 4 },
++#define NO_SPECULATION BIT(0) /* cpu_set_bug_bits() returns before forcing any bug flag */
++#define NO_MELTDOWN BIT(1) /* cpu_set_bug_bits() returns before the Meltdown and L1TF flags */
++#define NO_SSB BIT(2) /* X86_BUG_SPEC_STORE_BYPASS is not forced */
++#define NO_L1TF BIT(3) /* X86_BUG_L1TF is not forced */
++#define NO_MDS BIT(4) /* X86_BUG_MDS is not forced */
++#define MSBDS_ONLY BIT(5) /* when X86_BUG_MDS is forced, also force X86_BUG_MSBDS_ONLY */
++/* Build one whitelist table entry; _whitelist is the OR of the flag bits above. */
++#define VULNWL(_vendor, _family, _model, _whitelist) \
++ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
++/* Intel entries are always family 6; the model is given without its INTEL_FAM6_ prefix. */
++#define VULNWL_INTEL(model, whitelist) \
++ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
++/* AMD entries cover a whole family (X86_MODEL_ANY). */
++#define VULNWL_AMD(family, whitelist) \
++ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
++/* Hygon entries cover a whole family (X86_MODEL_ANY). */
++#define VULNWL_HYGON(family, whitelist) \
++ VULNWL(HYGON, family, X86_MODEL_ANY, whitelist)
++/* Table searched by cpu_matches(); each entry ORs the NO_* / MSBDS_ONLY bits for that CPU. */
++static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
++ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
++ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
++ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
++ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
++
++ /* Intel Family 6 */
++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
++
++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY),
++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY),
++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY),
++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY),
++
++ VULNWL_INTEL(CORE_YONAH, NO_SSB),
++
++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY),
++
++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF),
++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF),
++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF),
++
++ /* AMD Family 0xf - 0x12 */
++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
++
++ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS),
++ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS),
+ {}
+ };
+
+-static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
+- { X86_VENDOR_AMD },
+- { X86_VENDOR_HYGON },
+- {}
+-};
+-
+-/* Only list CPUs which speculate but are non susceptible to SSB */
+-static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+- { X86_VENDOR_AMD, 0x12, },
+- { X86_VENDOR_AMD, 0x11, },
+- { X86_VENDOR_AMD, 0x10, },
+- { X86_VENDOR_AMD, 0xf, },
+- {}
+-};
++static bool __init cpu_matches(unsigned long which) /* which: mask of NO_* / MSBDS_ONLY bits */
++{
++ const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); /* a NULL result is handled below */
+-
+-static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
+- /* in addition to cpu_no_speculation */
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+- {}
+-};
++ return m && !!(m->driver_data & which); /* false without a match or without a shared bit */
++}
+
+ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+ {
+ u64 ia32_cap = 0;
+
+- if (x86_match_cpu(cpu_no_speculation))
++ if (cpu_matches(NO_SPECULATION))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+@@ -1011,15 +1027,20 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+ if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+- if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
+- !(ia32_cap & ARCH_CAP_SSB_NO) &&
++ if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+ if (ia32_cap & ARCH_CAP_IBRS_ALL)
+ setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+
+- if (x86_match_cpu(cpu_no_meltdown))
++ if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
++ setup_force_cpu_bug(X86_BUG_MDS);
++ if (cpu_matches(MSBDS_ONLY))
++ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
++ }
++
++ if (cpu_matches(NO_MELTDOWN))
+ return;
+
+ /* Rogue Data Cache Load? No! */
+@@ -1028,7 +1049,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+- if (x86_match_cpu(cpu_no_l1tf))
++ if (cpu_matches(NO_L1TF))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
+diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
+index 18bc9b51ac9b..086cf1d1d71d 100644
+--- a/arch/x86/kernel/nmi.c
++++ b/arch/x86/kernel/nmi.c
+@@ -34,6 +34,7 @@
+ #include <asm/x86_init.h>
+ #include <asm/reboot.h>
+ #include <asm/cache.h>
++#include <asm/nospec-branch.h>
+
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/nmi.h>
+@@ -533,6 +534,9 @@ nmi_restart:
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
++
++ if (user_mode(regs))
++ mds_user_clear_cpu_buffers();
+ }
+ NOKPROBE_SYMBOL(do_nmi);
+
+diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
+index d26f9e9c3d83..07c7bbe79e8b 100644
+--- a/arch/x86/kernel/traps.c
++++ b/arch/x86/kernel/traps.c
+@@ -58,6 +58,7 @@
+ #include <asm/alternative.h>
+ #include <asm/fpu/xstate.h>
+ #include <asm/trace/mpx.h>
++#include <asm/nospec-branch.h>
+ #include <asm/mpx.h>
+ #include <asm/vm86.h>
+ #include <asm/umip.h>
+@@ -367,6 +368,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+ regs->ip = (unsigned long)general_protection;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
+
++ /*
++ * This situation can be triggered by userspace via
++ * modify_ldt(2) and the return does not take the regular
++ * user space exit, so a CPU buffer clear is required when
++ * MDS mitigation is enabled.
++ */
++ mds_user_clear_cpu_buffers();
+ return;
+ }
+ #endif
+diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
+index fd3951638ae4..bbbe611f0c49 100644
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -410,7 +410,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ /* cpuid 7.0.edx*/
+ const u32 kvm_cpuid_7_0_edx_x86_features =
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+- F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP);
++ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
++ F(MD_CLEAR);
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
+index 0c955bb286ff..194c6ec11f4c 100644
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -6431,8 +6431,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ */
+ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+
++ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
++ else if (static_branch_unlikely(&mds_user_clear))
++ mds_clear_cpu_buffers();
+
+ if (vcpu->arch.cr2 != read_cr2())
+ write_cr2(vcpu->arch.cr2);
+@@ -6668,8 +6671,8 @@ free_partial_vcpu:
+ return ERR_PTR(err);
+ }
+
+-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
++#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
++#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+
+ static int vmx_vm_init(struct kvm *kvm)
+ {
+diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
+index 139b28a01ce4..d0255d64edce 100644
+--- a/arch/x86/mm/pti.c
++++ b/arch/x86/mm/pti.c
+@@ -35,6 +35,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/mm.h>
+ #include <linux/uaccess.h>
++#include <linux/cpu.h>
+
+ #include <asm/cpufeature.h>
+ #include <asm/hypervisor.h>
+@@ -115,7 +116,8 @@ void __init pti_check_boottime_disable(void)
+ }
+ }
+
+- if (cmdline_find_option_bool(boot_command_line, "nopti")) {
++ if (cmdline_find_option_bool(boot_command_line, "nopti") ||
++ cpu_mitigations_off()) {
+ pti_mode = PTI_FORCE_OFF;
+ pti_print_if_insecure("disabled on command line.");
+ return;
+diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
+index 668139cfa664..cc37511de866 100644
+--- a/drivers/base/cpu.c
++++ b/drivers/base/cpu.c
+@@ -548,11 +548,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev,
+ return sprintf(buf, "Not affected\n");
+ }
+
++ssize_t __weak cpu_show_mds(struct device *dev, /* weak definition; arch/x86 supplies its own cpu_show_mds() */
++ struct device_attribute *attr, char *buf)
++{
++ return sprintf(buf, "Not affected\n"); /* fixed text; dev and attr are unused */
++}
++
+ static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
+ static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
+ static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+ static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
+ static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
++static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
+
+ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+ &dev_attr_meltdown.attr,
+@@ -560,6 +567,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+ &dev_attr_spectre_v2.attr,
+ &dev_attr_spec_store_bypass.attr,
+ &dev_attr_l1tf.attr,
++ &dev_attr_mds.attr,
+ NULL
+ };
+
+diff --git a/include/linux/cpu.h b/include/linux/cpu.h
+index 5041357d0297..57ae83c4d5f4 100644
+--- a/include/linux/cpu.h
++++ b/include/linux/cpu.h
+@@ -57,6 +57,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
+ struct device_attribute *attr, char *buf);
+ extern ssize_t cpu_show_l1tf(struct device *dev,
+ struct device_attribute *attr, char *buf);
++extern ssize_t cpu_show_mds(struct device *dev,
++ struct device_attribute *attr, char *buf);
+
+ extern __printf(4, 5)
+ struct device *cpu_device_create(struct device *parent, void *drvdata,
+@@ -187,4 +189,28 @@ static inline void cpu_smt_disable(bool force) { }
+ static inline void cpu_smt_check_topology(void) { }
+ #endif
+
++/*
++ * These are used for a global "mitigations=" cmdline option for toggling
++ * optional CPU mitigations.
++ */
++enum cpu_mitigations {
++ CPU_MITIGATIONS_OFF, /* selected by mitigations=off */
++ CPU_MITIGATIONS_AUTO, /* selected by mitigations=auto */
++ CPU_MITIGATIONS_AUTO_NOSMT, /* selected by mitigations=auto,nosmt */
++};
++
++extern enum cpu_mitigations cpu_mitigations; /* current mode; defined in kernel/cpu.c */
++
++/* True when the mode is CPU_MITIGATIONS_OFF, i.e. mitigations=off */
++static inline bool cpu_mitigations_off(void)
++{
++ return cpu_mitigations == CPU_MITIGATIONS_OFF;
++}
++
++/* True when the mode is CPU_MITIGATIONS_AUTO_NOSMT, i.e. mitigations=auto,nosmt */
++static inline bool cpu_mitigations_auto_nosmt(void)
++{
++ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
++}
++
+ #endif /* _LINUX_CPU_H_ */
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index 6754f3ecfd94..43e741e88691 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -2304,3 +2304,18 @@ void __init boot_cpu_hotplug_init(void)
+ #endif
+ this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ }
++
++enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; /* mode used when the option is absent */
++/* Early "mitigations" boot option parser: accepts "off", "auto" and "auto,nosmt". */
++static int __init mitigations_parse_cmdline(char *arg)
++{
++ if (!strcmp(arg, "off")) /* NOTE(review): arg is not NULL-checked here, unlike mds_cmdline() - confirm it cannot be NULL */
++ cpu_mitigations = CPU_MITIGATIONS_OFF;
++ else if (!strcmp(arg, "auto"))
++ cpu_mitigations = CPU_MITIGATIONS_AUTO;
++ else if (!strcmp(arg, "auto,nosmt"))
++ cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
++ /* Any other value is ignored without a message; the mode stays unchanged. */
++ return 0;
++}
++early_param("mitigations", mitigations_parse_cmdline);
+diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
+index 1598b4fa0b11..045f5f7d68ab 100644
+--- a/tools/power/x86/turbostat/Makefile
++++ b/tools/power/x86/turbostat/Makefile
+@@ -9,7 +9,7 @@ ifeq ("$(origin O)", "command line")
+ endif
+
+ turbostat : turbostat.c
+-override CFLAGS += -Wall
++override CFLAGS += -Wall -I../../../include
+ override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"'
+ override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"'
+
+diff --git a/tools/power/x86/x86_energy_perf_policy/Makefile b/tools/power/x86/x86_energy_perf_policy/Makefile
+index ae7a0e09b722..1fdeef864e7c 100644
+--- a/tools/power/x86/x86_energy_perf_policy/Makefile
++++ b/tools/power/x86/x86_energy_perf_policy/Makefile
+@@ -9,7 +9,7 @@ ifeq ("$(origin O)", "command line")
+ endif
+
+ x86_energy_perf_policy : x86_energy_perf_policy.c
+-override CFLAGS += -Wall
++override CFLAGS += -Wall -I../../../include
+ override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"'
+
+ %: %.c