Author:    Florian Schmaus <flow@gentoo.org>  2022-10-19 11:03:58 +0200
Committer: Florian Schmaus <flow@gentoo.org>  2022-10-19 11:03:58 +0200
Commit:    a0ef09913a37dcad16d28e9f5fa1e4f6a7cc5da7
Tree:      ee0832868e8dfaf144f16916f7b6dbec24839815
Parent:    Correctly obtain the array length
Download:  xen-upstream-patches-a0ef09913a37dcad16d28e9f5fa1e4f6a7cc5da7.tar.gz
           xen-upstream-patches-a0ef09913a37dcad16d28e9f5fa1e4f6a7cc5da7.tar.bz2
           xen-upstream-patches-a0ef09913a37dcad16d28e9f5fa1e4f6a7cc5da7.zip
Xen 4.16.3-pre-patchset-0
Signed-off-by: Florian Schmaus <flow@gentoo.org>
-rw-r--r--  0001-update-Xen-version-to-4.16.3-pre.patch (renamed from 0001-update-Xen-version-to-4.16.2-pre.patch) | 14
-rw-r--r--  0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch | 50
-rw-r--r--  0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch | 62
-rw-r--r--  0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch | 167
-rw-r--r--  0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch | 63
-rw-r--r--  0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch | 49
-rw-r--r--  0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch | 138
-rw-r--r--  0005-x86-HAP-adjust-monitor-table-related-error-handling.patch | 77
-rw-r--r--  0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch | 116
-rw-r--r--  0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch | 76
-rw-r--r--  0006-xen-build-Fix-dependency-for-the-MAP-rule.patch | 29
-rw-r--r--  0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch | 74
-rw-r--r--  0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch | 279
-rw-r--r--  0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch | 36
-rw-r--r--  0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch | 100
-rw-r--r--  0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch | 32
-rw-r--r--  0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch | 115
-rw-r--r--  0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch | 32
-rw-r--r--  0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch | 181
-rw-r--r--  0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch | 112
-rw-r--r--  0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch | 197
-rw-r--r--  0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch | 36
-rw-r--r--  0012-libxl-docs-Use-arch-specific-default-paging-memory.patch | 149
-rw-r--r--  0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch | 229
-rw-r--r--  0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch | 189
-rw-r--r--  0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch | 121
-rw-r--r--  0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch | 108
-rw-r--r--  0015-kconfig-detect-LD-implementation.patch | 46
-rw-r--r--  0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch | 289
-rw-r--r--  0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch | 66
-rw-r--r--  0016-linker-lld-do-not-generate-quoted-section-names.patch | 54
-rw-r--r--  0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch | 112
-rw-r--r--  0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch | 142
-rw-r--r--  0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch | 35
-rw-r--r--  0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch | 44
-rw-r--r--  0019-build-silence-GNU-ld-warning-about-executable-stacks.patch | 35
-rw-r--r--  0019-xen-sched-introduce-cpupool_update_node_affinity.patch | 257
-rw-r--r--  0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch | 50
-rw-r--r--  0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch | 263
-rw-r--r--  0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch | 56
-rw-r--r--  0021-xen-sched-fix-cpu-hotplug.patch | 307
-rw-r--r--  0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch | 58
-rw-r--r--  0022-x86-pv-Clean-up-_get_page_type.patch | 180
-rw-r--r--  0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch | 41
-rw-r--r--  0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch | 201
-rw-r--r--  0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch | 53
-rw-r--r--  0024-xen-gnttab-fix-gnttab_acquire_resource.patch | 69
-rw-r--r--  0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch | 223
-rw-r--r--  0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch | 59
-rw-r--r--  0026-x86-Split-cache_flush-out-of-cache_writeback.patch | 294
-rw-r--r--  0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch | 97
-rw-r--r--  0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch | 95
-rw-r--r--  0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch | 160
-rw-r--r--  0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch | 37
-rw-r--r--  0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch | 258
-rw-r--r--  0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch | 98
-rw-r--r--  0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch | 187
-rw-r--r--  0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch | 52
-rw-r--r--  0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch | 36
-rw-r--r--  0035-tools-xenstored-Harden-corrupt.patch | 44
-rw-r--r--  0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch | 93
-rw-r--r--  0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch | 234
-rw-r--r--  0038-libxc-fix-compilation-error-with-gcc13.patch | 33
-rw-r--r--  0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch | 32
-rw-r--r--  0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch | 87
-rw-r--r--  0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch | 137
-rw-r--r--  0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch | 28
-rw-r--r--  0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch | 38
-rw-r--r--  0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch | 167
-rw-r--r--  0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch | 110
-rw-r--r--  0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch | 97
-rw-r--r--  0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch | 106
-rw-r--r--  0048-x86-spec-ctrl-Support-IBPB-on-entry.patch | 300
-rw-r--r--  0049-x86-cpuid-Enumeration-for-BTC_NO.patch | 106
-rw-r--r--  0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch | 106
-rw-r--r--  0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch | 305
-rw-r--r--  info.txt | 6
77 files changed, 3510 insertions(+), 5304 deletions(-)
diff --git a/0001-update-Xen-version-to-4.16.2-pre.patch b/0001-update-Xen-version-to-4.16.3-pre.patch
index 2e62c21..6ae690c 100644
--- a/0001-update-Xen-version-to-4.16.2-pre.patch
+++ b/0001-update-Xen-version-to-4.16.3-pre.patch
@@ -1,25 +1,25 @@
-From 5be9edb482ab20cf3e7acb05b511465294d1e19b Mon Sep 17 00:00:00 2001
+From 4aa32912ebeda8cb94d1c3941e7f1f0a2d4f921b Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 13:55:17 +0200
-Subject: [PATCH 01/51] update Xen version to 4.16.2-pre
+Date: Tue, 11 Oct 2022 14:49:41 +0200
+Subject: [PATCH 01/26] update Xen version to 4.16.3-pre
---
xen/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xen/Makefile b/xen/Makefile
-index 8abc71cf73aa..90a29782dbf4 100644
+index 76d0a3ff253f..8a403ee896cd 100644
--- a/xen/Makefile
+++ b/xen/Makefile
@@ -2,7 +2,7 @@
# All other places this is stored (eg. compile.h) should be autogenerated.
export XEN_VERSION = 4
export XEN_SUBVERSION = 16
--export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION)
-+export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION)
+-export XEN_EXTRAVERSION ?= .2$(XEN_VENDORVERSION)
++export XEN_EXTRAVERSION ?= .3-pre$(XEN_VENDORVERSION)
export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
-include xen-version
--
-2.35.1
+2.37.3
diff --git a/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch b/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch
deleted file mode 100644
index 0ba090e..0000000
--- a/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch
+++ /dev/null
@@ -1,50 +0,0 @@
-From b58fb6e81bd55b6bd946abc3070770f7994c9ef9 Mon Sep 17 00:00:00 2001
-From: Jason Andryuk <jandryuk@gmail.com>
-Date: Tue, 7 Jun 2022 13:55:39 +0200
-Subject: [PATCH 02/51] x86/irq: skip unmap_domain_pirq XSM during destruction
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-xsm_unmap_domain_irq was seen denying unmap_domain_pirq when called from
-complete_domain_destroy as an RCU callback. The source context was an
-unexpected, random domain. Since this is a xen-internal operation,
-going through the XSM hook is inappropriate.
-
-Check d->is_dying and skip the XSM hook when set since this is a cleanup
-operation for a domain being destroyed.
-
-Suggested-by: Roger Pau Monné <roger.pau@citrix.com>
-Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-master commit: 2e6f95a942d1927a53f077c301db0b799c54c05a
-master date: 2022-04-08 14:51:52 +0200
----
- xen/arch/x86/irq.c | 10 ++++++++--
- 1 file changed, 8 insertions(+), 2 deletions(-)
-
-diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
-index 67cbf6b979dc..47b86af5dce9 100644
---- a/xen/arch/x86/irq.c
-+++ b/xen/arch/x86/irq.c
-@@ -2342,8 +2342,14 @@ int unmap_domain_pirq(struct domain *d, int pirq)
- nr = msi_desc->msi.nvec;
- }
-
-- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq,
-- msi_desc ? msi_desc->dev : NULL);
-+ /*
-+ * When called by complete_domain_destroy via RCU, current is a random
-+ * domain. Skip the XSM check since this is a Xen-initiated action.
-+ */
-+ if ( !d->is_dying )
-+ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq,
-+ msi_desc ? msi_desc->dev : NULL);
-+
- if ( ret )
- goto done;
-
---
-2.35.1
-
diff --git a/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch b/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch
new file mode 100644
index 0000000..fecc260
--- /dev/null
+++ b/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch
@@ -0,0 +1,62 @@
+From 8d9531a3421dad2b0012e09e6f41d5274e162064 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 11 Oct 2022 14:52:13 +0200
+Subject: [PATCH 02/26] xen/arm: p2m: Prevent adding mapping when domain is
+ dying
+
+During the domain destroy process, the domain will still be accessible
+until it is fully destroyed. So does the P2M because we don't bail
+out early if is_dying is non-zero. If a domain has permission to
+modify the other domain's P2M (i.e. dom0, or a stubdomain), then
+foreign mapping can be added past relinquish_p2m_mapping().
+
+Therefore, we need to prevent mapping to be added when the domain
+is dying. This commit prevents such adding of mapping by adding the
+d->is_dying check to p2m_set_entry(). Also this commit enhances the
+check in relinquish_p2m_mapping() to make sure that no mappings can
+be added in the P2M after the P2M lock is released.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Tested-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: 3ebe773293e3b945460a3d6f54f3b91915397bab
+master date: 2022-10-11 14:20:18 +0200
+---
+ xen/arch/arm/p2m.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index 3349b464a39e..1affdafadbeb 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m,
+ {
+ int rc = 0;
+
++ /*
++ * Any reference taken by the P2M mappings (e.g. foreign mapping) will
++ * be dropped in relinquish_p2m_mapping(). As the P2M will still
++ * be accessible after, we need to prevent mapping to be added when the
++ * domain is dying.
++ */
++ if ( unlikely(p2m->domain->is_dying) )
++ return -ENOMEM;
++
+ while ( nr )
+ {
+ unsigned long mask;
+@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d)
+ unsigned int order;
+ gfn_t start, end;
+
++ BUG_ON(!d->is_dying);
++ /* No mappings can be added in the P2M after the P2M lock is released. */
+ p2m_write_lock(p2m);
+
+ start = p2m->lowest_mapped_gfn;
+--
+2.37.3
+
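The guard above is worth generalizing: any path that can create new mappings or take new references on behalf of a domain first tests is_dying under the same lock the teardown path holds, so the check cannot race with relinquish_p2m_mapping(). A minimal standalone sketch of the pattern (simplified names and types, not the actual Xen code):

    #include <errno.h>
    #include <stdbool.h>

    struct domain { bool is_dying; };

    /* Refuse new mappings once the domain is dying. The real code does
     * this with the p2m write lock held -- the same lock that
     * relinquish_p2m_mapping() takes -- so no mapping can slip in after
     * teardown has started. */
    static int set_entry_sketch(struct domain *d)
    {
        if (d->is_dying)
            return -ENOMEM;  /* same error code as the hunk above */
        /* ... install the mapping here ... */
        return 0;
    }
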
diff --git a/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch b/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch
new file mode 100644
index 0000000..3190db8
--- /dev/null
+++ b/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch
@@ -0,0 +1,167 @@
+From 937fdbad5180440888f1fcee46299103327efa90 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 11 Oct 2022 14:52:27 +0200
+Subject: [PATCH 03/26] xen/arm: p2m: Handle preemption when freeing
+ intermediate page tables
+
+At the moment the P2M page tables will be freed when the domain structure
+is freed without any preemption. As the P2M is quite large, iterating
+through this may take more time than is reasonable without intermediate
+preemption (to run softirqs and perhaps the scheduler).
+
+Split p2m_teardown() into two parts: one preemptible and called when
+relinquishing the resources, the other one non-preemptible and called
+when freeing the domain structure.
+
+As we are now freeing the P2M pages early, we also need to prevent
+further allocation if someone calls p2m_set_entry() past p2m_teardown()
+(I wasn't able to prove this will never happen). This is done via
+the domain->is_dying check added to p2m_set_entry() by the previous patch.
+
+Similarly, we want to make sure that no one can access the freed
+pages. Therefore the root is cleared before freeing pages.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Tested-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: 3202084566bba0ef0c45caf8c24302f83d92f9c8
+master date: 2022-10-11 14:20:56 +0200
+---
+ xen/arch/arm/domain.c | 10 +++++++--
+ xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++---
+ xen/include/asm-arm/p2m.h | 13 +++++++++--
+ 3 files changed, 63 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
+index 96e1b235501d..2694c39127c5 100644
+--- a/xen/arch/arm/domain.c
++++ b/xen/arch/arm/domain.c
+@@ -789,10 +789,10 @@ fail:
+ void arch_domain_destroy(struct domain *d)
+ {
+ /* IOMMU page table is shared with P2M, always call
+- * iommu_domain_destroy() before p2m_teardown().
++ * iommu_domain_destroy() before p2m_final_teardown().
+ */
+ iommu_domain_destroy(d);
+- p2m_teardown(d);
++ p2m_final_teardown(d);
+ domain_vgic_free(d);
+ domain_vuart_free(d);
+ free_xenheap_page(d->shared_info);
+@@ -996,6 +996,7 @@ enum {
+ PROG_xen,
+ PROG_page,
+ PROG_mapping,
++ PROG_p2m,
+ PROG_done,
+ };
+
+@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d)
+ if ( ret )
+ return ret;
+
++ PROGRESS(p2m):
++ ret = p2m_teardown(d);
++ if ( ret )
++ return ret;
++
+ PROGRESS(done):
+ break;
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index 1affdafadbeb..27418ee5ee98 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d)
+ spin_unlock(&vmid_alloc_lock);
+ }
+
+-void p2m_teardown(struct domain *d)
++int p2m_teardown(struct domain *d)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
++ unsigned long count = 0;
+ struct page_info *pg;
++ unsigned int i;
++ int rc = 0;
++
++ p2m_write_lock(p2m);
++
++ /*
++ * We are about to free the intermediate page-tables, so clear the
++ * root to prevent any walk to use them.
++ */
++ for ( i = 0; i < P2M_ROOT_PAGES; i++ )
++ clear_and_clean_page(p2m->root + i);
++
++ /*
++ * The domain will not be scheduled anymore, so in theory we should
++ * not need to flush the TLBs. Do it for safety purposes.
++ *
++ * Note that all the devices have already been de-assigned. So we don't
++ * need to flush the IOMMU TLB here.
++ */
++ p2m_force_tlb_flush_sync(p2m);
++
++ while ( (pg = page_list_remove_head(&p2m->pages)) )
++ {
++ free_domheap_page(pg);
++ count++;
++ /* Arbitrarily preempt every 512 iterations */
++ if ( !(count % 512) && hypercall_preempt_check() )
++ {
++ rc = -ERESTART;
++ break;
++ }
++ }
++
++ p2m_write_unlock(p2m);
++
++ return rc;
++}
++
++void p2m_final_teardown(struct domain *d)
++{
++ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ /* p2m not actually initialized */
+ if ( !p2m->domain )
+ return;
+
+- while ( (pg = page_list_remove_head(&p2m->pages)) )
+- free_domheap_page(pg);
++ ASSERT(page_list_empty(&p2m->pages));
+
+ if ( p2m->root )
+ free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
+diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
+index 8f11d9c97b5d..b3ba83283e11 100644
+--- a/xen/include/asm-arm/p2m.h
++++ b/xen/include/asm-arm/p2m.h
+@@ -192,8 +192,17 @@ void setup_virt_paging(void);
+ /* Init the datastructures for later use by the p2m code */
+ int p2m_init(struct domain *d);
+
+-/* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct domain *d);
++/*
++ * The P2M resources are freed in two parts:
++ * - p2m_teardown() will be called when relinquishing the resources. It
++ *   will free large resources (e.g. intermediate page-tables) that
++ *   require preemption.
++ * - p2m_final_teardown() will be called when the domain struct is being
++ *   freed. This *cannot* be preempted and therefore only small
++ *   resources should be freed here.
++ */
++int p2m_teardown(struct domain *d);
++void p2m_final_teardown(struct domain *d);
+
+ /*
+ * Remove mapping refcount on each mapping page in the p2m
+--
+2.37.3
+
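The -ERESTART flow above deserves a concrete illustration: the teardown frees pages in bounded batches, and when preemption is due it returns -ERESTART so that domain_relinquish_resources() can bail out and be re-entered later, resuming at the PROG_p2m step. A self-contained sketch of that loop, with stand-ins for Xen's page list and hypercall_preempt_check() (everything below is simplified, not the real implementation):

    static unsigned long pages_left = 100000;  /* pretend pool size */

    static int free_one_page(void)   /* returns 0 once the pool is empty */
    {
        if (!pages_left)
            return 0;
        pages_left--;                /* stands in for free_domheap_page() */
        return 1;
    }

    static int preempt_pending(void) { return 1; }  /* pessimistic stub */

    #define ERESTART_SKETCH 85       /* illustrative; Xen defines its own */

    static int teardown_sketch(void)
    {
        unsigned long count = 0;

        while (free_one_page())
        {
            /* Arbitrarily check for preemption every 512 iterations,
             * exactly as the hunk above does. */
            if (!(++count % 512) && preempt_pending())
                return -ERESTART_SKETCH;  /* caller re-enters and resumes */
        }
        return 0;
    }

Because the pool only shrinks, re-entering teardown_sketch() after an -ERESTART always makes forward progress until it finally returns 0.
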
diff --git a/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch b/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch
deleted file mode 100644
index fa1443c..0000000
--- a/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch
+++ /dev/null
@@ -1,63 +0,0 @@
-From 6c6bbfdff9374ef41f84c4ebed7b8a7a40767ef6 Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 7 Jun 2022 13:56:54 +0200
-Subject: [PATCH 03/51] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
-
-A hypervisor built without CONFIG_GDBSX will crash in case the
-XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will
-end up in iommu_do_domctl() with d == NULL:
-
- (XEN) CPU: 6
- (XEN) RIP: e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
- (XEN) RFLAGS: 0000000000010202 CONTEXT: hypervisor (d0v0)
- (XEN) rax: 00000000000003e8 rbx: ffff830856277ef8 rcx: ffff830856277fff
- ...
- (XEN) Xen call trace:
- (XEN) [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
- (XEN) [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
- (XEN) [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
- (XEN) [<ffff82d040238ff0>] S do_domctl+0/0x1930
- (XEN) [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
- (XEN) [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
- (XEN) [<ffff82d040366288>] S lstar_enter+0x128/0x130
- (XEN)
- (XEN) Pagetable walk from 0000000000000144:
- (XEN) L4[0x000] = 0000000000000000 ffffffffffffffff
- (XEN)
- (XEN) ****************************************
- (XEN) Panic on CPU 6:
- (XEN) FATAL PAGE FAULT
- (XEN) [error_code=0000]
- (XEN) Faulting linear address: 0000000000000144
- (XEN) ****************************************
-
-It used to be permitted to pass DOMID_IDLE to dbg_rw_mem(), which is why the
-special case skipping the domid checks exists. Now that it is only permitted
-to pass proper domids, remove the special case, making 'd' always valid.
-
-Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
-Fixes: e726a82ca0dc ("xen: make gdbsx support configurable")
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: f00daf1fb3213a9b0335d9dcd90fe9cb5c02b7a9
-master date: 2022-04-19 17:07:08 +0100
----
- xen/common/domctl.c | 1 -
- 1 file changed, 1 deletion(-)
-
-diff --git a/xen/common/domctl.c b/xen/common/domctl.c
-index 271862ae587f..419e4070f59d 100644
---- a/xen/common/domctl.c
-+++ b/xen/common/domctl.c
-@@ -304,7 +304,6 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
- if ( op->domain == DOMID_INVALID )
- {
- case XEN_DOMCTL_createdomain:
-- case XEN_DOMCTL_gdbsx_guestmemio:
- d = NULL;
- break;
- }
---
-2.35.1
-
diff --git a/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch b/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch
deleted file mode 100644
index a4d229a..0000000
--- a/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch
+++ /dev/null
@@ -1,49 +0,0 @@
-From b378ee56c7e0bb5eeb35dcc55b3d29e5f50eb566 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 13:58:16 +0200
-Subject: [PATCH 04/51] VT-d: refuse to use IOMMU with reserved CAP.ND value
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The field taking the value 7 (resulting in 18-bit DIDs when using the
-calculation in cap_ndoms(), when the DID fields are only 16 bits wide)
-is reserved. Instead of misbehaving in case we would encounter such an
-IOMMU, refuse to use it.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Kevin Tian <kevin.tian@intel.com>
-master commit: a1545fbf45c689aff39ce76a6eaa609d32ef72a7
-master date: 2022-04-20 10:54:26 +0200
----
- xen/drivers/passthrough/vtd/iommu.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
-index 93dd8aa643aa..8975c1de61bc 100644
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -1279,8 +1279,11 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
-
- quirk_iommu_caps(iommu);
-
-+ nr_dom = cap_ndoms(iommu->cap);
-+
- if ( cap_fault_reg_offset(iommu->cap) +
- cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
-+ ((nr_dom - 1) >> 16) /* I.e. cap.nd > 6 */ ||
- ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
- {
- printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n");
-@@ -1305,7 +1308,6 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
- vtd_ops.sync_cache = sync_cache;
-
- /* allocate domain id bitmap */
-- nr_dom = cap_ndoms(iommu->cap);
- iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
- if ( !iommu->domid_bitmap )
- return -ENOMEM;
---
-2.35.1
-
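The removed VT-d check is compact enough to obscure what it computes: CAP.ND encodes the number of usable domain IDs as 2^(4 + 2*nd), so nd == 6 yields exactly 2^16 DIDs (the most a 16-bit DID field can address) while the reserved nd == 7 would claim 2^18. That makes ((nr_dom - 1) >> 16) a cheap "needs more than 16 bits" test. A small self-checking example (the formula mirrors the cap_ndoms() calculation referenced above; treat the macro itself as illustrative):

    #include <assert.h>

    /* Domain-ID count for a given CAP.ND field value. */
    #define NDOMS(nd) (1u << (4 + 2 * (nd)))

    int main(void)
    {
        assert(NDOMS(6) == 65536);            /* fits 16-bit DIDs */
        assert(((NDOMS(6) - 1) >> 16) == 0);  /* accepted */
        assert(NDOMS(7) == 262144);           /* would need 18-bit DIDs */
        assert(((NDOMS(7) - 1) >> 16) != 0);  /* refused as reserved */
        return 0;
    }
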
diff --git a/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch b/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch
new file mode 100644
index 0000000..b3edbd9
--- /dev/null
+++ b/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch
@@ -0,0 +1,138 @@
+From 8fc19c143b8aa563077f3d5c46fcc0a54dc04f35 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 14:52:39 +0200
+Subject: [PATCH 04/26] x86/p2m: add option to skip root pagetable removal in
+ p2m_teardown()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add a new parameter to p2m_teardown() in order to select whether the
+root page table should also be freed. Note that all users are
+adjusted to pass the parameter to remove the root page tables, so
+behavior is not modified.
+
+No functional change intended.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Suggested-by: Julien Grall <julien@xen.org>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+master commit: 1df52a270225527ae27bfa2fc40347bf93b78357
+master date: 2022-10-11 14:21:23 +0200
+---
+ xen/arch/x86/mm/hap/hap.c | 6 +++---
+ xen/arch/x86/mm/p2m.c | 20 ++++++++++++++++----
+ xen/arch/x86/mm/shadow/common.c | 4 ++--
+ xen/include/asm-x86/p2m.h | 2 +-
+ 4 files changed, 22 insertions(+), 10 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index 47a7487fa7a3..a8f5a19da917 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d)
+ }
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+- p2m_teardown(d->arch.altp2m_p2m[i]);
++ p2m_teardown(d->arch.altp2m_p2m[i], true);
+ }
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+- p2m_teardown(d->arch.nested_p2m[i]);
++ p2m_teardown(d->arch.nested_p2m[i], true);
+ }
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+ hap_teardown(d, NULL);
+
+- p2m_teardown(p2m_get_hostp2m(d));
++ p2m_teardown(p2m_get_hostp2m(d), true);
+ /* Free any memory that the p2m teardown released */
+ paging_lock(d);
+ hap_set_allocation(d, 0, NULL);
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index def1695cf00b..aba4f17cbe12 100644
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p2m)
+ * hvm fixme: when adding support for pvh non-hardware domains, this path must
+ * cleanup any foreign p2m types (release refcnts on them).
+ */
+-void p2m_teardown(struct p2m_domain *p2m)
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
+ /* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+ {
+- struct page_info *pg;
++ struct page_info *pg, *root_pg = NULL;
+ struct domain *d;
+
+ if (p2m == NULL)
+@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m)
+
+ p2m_lock(p2m);
+ ASSERT(atomic_read(&d->shr_pages) == 0);
+- p2m->phys_table = pagetable_null();
++
++ if ( remove_root )
++ p2m->phys_table = pagetable_null();
++ else if ( !pagetable_is_null(p2m->phys_table) )
++ {
++ root_pg = pagetable_get_page(p2m->phys_table);
++ clear_domain_page(pagetable_get_mfn(p2m->phys_table));
++ }
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+- d->arch.paging.free_page(d, pg);
++ if ( pg != root_pg )
++ d->arch.paging.free_page(d, pg);
++
++ if ( root_pg )
++ page_list_add(root_pg, &p2m->pages);
++
+ p2m_unlock(p2m);
+ }
+
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 8c1b041f7135..8c5baba9544d 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32 mode)
+ paging_unlock(d);
+ out_unlocked:
+ if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
+- p2m_teardown(p2m);
++ p2m_teardown(p2m, true);
+ if ( rv != 0 && pg != NULL )
+ {
+ pg->count_info &= ~PGC_count_mask;
+@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain *d)
+ shadow_teardown(d, NULL);
+
+ /* It is now safe to pull down the p2m map. */
+- p2m_teardown(p2m_get_hostp2m(d));
++ p2m_teardown(p2m_get_hostp2m(d), true);
+ /* Free any shadow memory that the p2m teardown released */
+ paging_lock(d);
+ shadow_set_allocation(d, 0, NULL);
+diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
+index f2af7a746ced..c3c16748e7d5 100644
+--- a/xen/include/asm-x86/p2m.h
++++ b/xen/include/asm-x86/p2m.h
+@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
+ int p2m_alloc_table(struct p2m_domain *p2m);
+
+ /* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct p2m_domain *p2m);
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
+ void p2m_final_teardown(struct domain *d);
+
+ /* Add a page to a domain's p2m table */
+--
+2.37.3
+
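The remove_root distinction matters because a dying domain's vCPUs may still hold references to the root page table (e.g. via CR3 or the VMCS) even after the rest of the pool is drained. A sketch of the resulting teardown shape (stand-in types; the real code operates on pagetable_t/mfn_t):

    #include <stdbool.h>
    #include <string.h>

    #define PAGE_SIZE_SKETCH 4096

    /* Either drop the root with everything else, or keep it allocated
     * but blanked so any stale reference walks an empty, harmless table. */
    static void teardown_sketch(unsigned char **root, bool remove_root)
    {
        if (remove_root)
            *root = NULL;  /* freed along with the rest of the pool */
        else if (*root)
            memset(*root, 0, PAGE_SIZE_SKETCH);  /* kept, but emptied */
        /* ... return every non-root pool page to the allocator ... */
    }
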
diff --git a/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch b/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch
new file mode 100644
index 0000000..33ab1ad
--- /dev/null
+++ b/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch
@@ -0,0 +1,77 @@
+From 3422c19d85a3d23a9d798eafb739ffb8865522d2 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 14:52:59 +0200
+Subject: [PATCH 05/26] x86/HAP: adjust monitor table related error handling
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+hap_make_monitor_table() will return INVALID_MFN if it encounters an
+error condition, but hap_update_paging_modes() wasn’t handling this
+value, resulting in an inappropriate value being stored in
+monitor_table. This would subsequently misguide at least
+hap_vcpu_teardown(). Avoid this by bailing early.
+
+Further, when a domain has/was already crashed or (perhaps less
+important as there's no such path known to lead here) is already dying,
+avoid calling domain_crash() on it again - that's at best confusing.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 5b44a61180f4f2e4f490a28400c884dd357ff45d
+master date: 2022-10-11 14:21:56 +0200
+---
+ xen/arch/x86/mm/hap/hap.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index a8f5a19da917..d75dc2b9ed3d 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -39,6 +39,7 @@
+ #include <asm/domain.h>
+ #include <xen/numa.h>
+ #include <asm/hvm/nestedhvm.h>
++#include <public/sched.h>
+
+ #include "private.h"
+
+@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(struct vcpu *v)
+ return m4mfn;
+
+ oom:
+- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n");
+- domain_crash(d);
++ if ( !d->is_dying &&
++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
++ {
++ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n",
++ d);
++ domain_crash(d);
++ }
+ return INVALID_MFN;
+ }
+
+@@ -766,6 +772,9 @@ static void hap_update_paging_modes(struct vcpu *v)
+ if ( pagetable_is_null(v->arch.hvm.monitor_table) )
+ {
+ mfn_t mmfn = hap_make_monitor_table(v);
++
++ if ( mfn_eq(mmfn, INVALID_MFN) )
++ goto unlock;
+ v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
+ make_cr3(v, mmfn);
+ hvm_update_host_cr3(v);
+@@ -774,6 +783,7 @@ static void hap_update_paging_modes(struct vcpu *v)
+ /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
+ hap_update_cr3(v, 0, false);
+
++ unlock:
+ paging_unlock(d);
+ put_gfn(d, cr3_gfn);
+ }
+--
+2.37.3
+
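The underlying bug pattern here is storing an allocator's failure sentinel into long-lived state and only noticing much later, in this case during hap_vcpu_teardown(). The fix is to test the sentinel before the store. A distilled sketch (INVALID_MFN plays the sentinel role in the real code; all names below are illustrative):

    #define INVALID_HANDLE (~0ul)

    static unsigned long monitor_table;  /* long-lived per-vCPU state */

    static unsigned long make_table(void)
    {
        return INVALID_HANDLE;  /* stub: pretend the allocation failed */
    }

    static int update_sketch(void)
    {
        unsigned long h = make_table();

        if (h == INVALID_HANDLE)
            return -1;       /* bail early; monitor_table stays sane */
        monitor_table = h;   /* only ever holds valid values */
        return 0;
    }
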
diff --git a/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch b/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch
deleted file mode 100644
index 45a1825..0000000
--- a/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch
+++ /dev/null
@@ -1,116 +0,0 @@
-From 7c003ab4a398ff4ddd54d15d4158cffb463134cc Mon Sep 17 00:00:00 2001
-From: David Vrabel <dvrabel@amazon.co.uk>
-Date: Tue, 7 Jun 2022 13:59:31 +0200
-Subject: [PATCH 05/51] x86/mm: avoid inadvertently degrading a TLB flush to
- local only
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-If the direct map is incorrectly modified with interrupts disabled,
-the required TLB flushes are degraded to flushing the local CPU only.
-
-This could lead to very hard to diagnose problems as different CPUs will
-end up with different views of memory. Although, no such issues have yet
-been identified.
-
-Change the check in the flush_area() macro to look at system_state
-instead. This defers the switch from local to all later in the boot
-(see xen/arch/x86/setup.c:__start_xen()). This is fine because
-additional PCPUs are not brought up until after the system state is
-SYS_STATE_smp_boot.
-
-Signed-off-by: David Vrabel <dvrabel@amazon.co.uk>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-x86/flushtlb: remove flush_area check on system state
-
-Booting with Shadow Stacks leads to the following assert on a debug
-hypervisor:
-
-Assertion 'local_irq_is_enabled()' failed at arch/x86/smp.c:265
-----[ Xen-4.17.0-10.24-d x86_64 debug=y Not tainted ]----
-CPU: 0
-RIP: e008:[<ffff82d040345300>] flush_area_mask+0x40/0x13e
-[...]
-Xen call trace:
- [<ffff82d040345300>] R flush_area_mask+0x40/0x13e
- [<ffff82d040338a40>] F modify_xen_mappings+0xc5/0x958
- [<ffff82d0404474f9>] F arch/x86/alternative.c#_alternative_instructions+0xb7/0xb9
- [<ffff82d0404476cc>] F alternative_branches+0xf/0x12
- [<ffff82d04044e37d>] F __start_xen+0x1ef4/0x2776
- [<ffff82d040203344>] F __high_start+0x94/0xa0
-
-This is due to SYS_STATE_smp_boot being set before calling
-alternative_branches(), and the flush in modify_xen_mappings() then
-using flush_area_all() with interrupts disabled. Note that
-alternative_branches() is called before APs are started, so the flush
-must be a local one (and indeed the cpumask passed to
-flush_area_mask() just contains one CPU).
-
-Take the opportunity to simplify a bit the logic and make flush_area()
-an alias of flush_area_all() in mm.c, taking into account that
-cpu_online_map just contains the BSP before APs are started. This
-requires widening the assert in flush_area_mask() to allow being
-called with interrupts disabled as long as it's strictly a local only
-flush.
-
-The overall result is that a conditional can be removed from
-flush_area().
-
-While there also introduce an ASSERT to check that a vCPU state flush
-is not issued for the local CPU only.
-
-Fixes: 78e072bc37 ('x86/mm: avoid inadvertently degrading a TLB flush to local only')
-Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 78e072bc375043e81691a59454e09f0b38241ddd
-master date: 2022-04-20 10:55:01 +0200
-master commit: 9f735ee4903f1b9f1966bb4ba5b5616b03ae08b5
-master date: 2022-05-25 11:09:46 +0200
----
- xen/arch/x86/mm.c | 10 ++--------
- xen/arch/x86/smp.c | 5 ++++-
- 2 files changed, 6 insertions(+), 9 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 4d799032dc82..e222d9aa98ee 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -5051,14 +5051,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
- #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
- #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
-
--/*
-- * map_pages_to_xen() can be called with interrupts disabled during
-- * early bootstrap. In this case it is safe to use flush_area_local()
-- * and avoid locking because only the local CPU is online.
-- */
--#define flush_area(v,f) (!local_irq_is_enabled() ? \
-- flush_area_local((const void *)v, f) : \
-- flush_area_all((const void *)v, f))
-+/* flush_area_all() can be used prior to any other CPU being online. */
-+#define flush_area(v, f) flush_area_all((const void *)(v), f)
-
- #define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
-
-diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
-index eef0f9c6cbf4..3556ec116608 100644
---- a/xen/arch/x86/smp.c
-+++ b/xen/arch/x86/smp.c
-@@ -262,7 +262,10 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
- {
- unsigned int cpu = smp_processor_id();
-
-- ASSERT(local_irq_is_enabled());
-+ /* Local flushes can be performed with interrupts disabled. */
-+ ASSERT(local_irq_is_enabled() || cpumask_subset(mask, cpumask_of(cpu)));
-+ /* Exclude use of FLUSH_VCPU_STATE for the local CPU. */
-+ ASSERT(!cpumask_test_cpu(cpu, mask) || !(flags & FLUSH_VCPU_STATE));
-
- if ( (flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK)) &&
- cpumask_test_cpu(cpu, mask) )
---
-2.35.1
-
diff --git a/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch b/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch
new file mode 100644
index 0000000..bbae48b
--- /dev/null
+++ b/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch
@@ -0,0 +1,76 @@
+From 40e9daf6b56ae49bda3ba4e254ccf0e998e52a8c Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 14:53:12 +0200
+Subject: [PATCH 06/26] x86/shadow: tolerate failure of
+ sh_set_toplevel_shadow()
+
+Subsequently sh_set_toplevel_shadow() will be adjusted to install a
+blank entry in case prealloc fails. There are, in fact, pre-existing
+error paths which would put in place a blank entry. The 4- and 2-level
+code in sh_update_cr3(), however, assumes the top level entry to be
+valid.
+
+Hence bail from the function in the unlikely event that it's not. Note
+that 3-level logic works differently: In particular a guest is free to
+supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid)
+entries. The guest will crash, but we already cope with that.
+
+Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(),
+and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change
+in security context, but add a respective assertion.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: eac000978c1feb5a9ee3236ab0c0da9a477e5336
+master date: 2022-10-11 14:22:24 +0200
+---
+ xen/arch/x86/mm/shadow/common.c | 1 +
+ xen/arch/x86/mm/shadow/multi.c | 10 ++++++++++
+ 2 files changed, 11 insertions(+)
+
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 8c5baba9544d..00e520cbd05b 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu *v,
+ /* Now figure out the new contents: is this a valid guest MFN? */
+ if ( !mfn_valid(gmfn) )
+ {
++ ASSERT(mfn_eq(gmfn, INVALID_MFN));
+ new_entry = pagetable_null();
+ goto install_new_entry;
+ }
+diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
+index 7b8f4dd13b03..2ff78fe3362c 100644
+--- a/xen/arch/x86/mm/shadow/multi.c
++++ b/xen/arch/x86/mm/shadow/multi.c
+@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+ if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+ sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
++ {
++ ASSERT(d->is_dying || d->is_shutting_down);
++ return;
++ }
+ if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]);
+@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+ if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+ sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
++ {
++ ASSERT(d->is_dying || d->is_shutting_down);
++ return;
++ }
+ #else
+ #error This should never happen
+ #endif
+--
+2.37.3
+
diff --git a/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch b/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch
deleted file mode 100644
index 7eb13cd..0000000
--- a/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-From 4bb8c34ba4241c2bf7845cd8b80c17530dbfb085 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 7 Jun 2022 14:00:09 +0200
-Subject: [PATCH 06/51] xen/build: Fix dependency for the MAP rule
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Acked-by: Jan Beulich <jbeulich@suse.com>
-master commit: e1e72198213b80b7a82bdc90f96ed05ae4f53e20
-master date: 2022-04-20 19:10:59 +0100
----
- xen/Makefile | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xen/Makefile b/xen/Makefile
-index 90a29782dbf4..ce4eca3ee4d7 100644
---- a/xen/Makefile
-+++ b/xen/Makefile
-@@ -507,7 +507,7 @@ cscope:
- cscope -k -b -q
-
- .PHONY: _MAP
--_MAP:
-+_MAP: $(TARGET)
- $(NM) -n $(TARGET)-syms | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' > System.map
-
- %.o %.i %.s: %.c FORCE
---
-2.35.1
-
diff --git a/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch b/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch
deleted file mode 100644
index ed98922..0000000
--- a/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch
+++ /dev/null
@@ -1,74 +0,0 @@
-From 13a29f3756bc4cab96c59f46c3875b483553fb8f Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 7 Jun 2022 14:00:31 +0200
-Subject: [PATCH 07/51] tools/libs/evtchn: don't set errno to negative values
-
-Setting errno to a negative value makes no sense.
-
-Fixes: 6b6500b3cbaa ("tools/libs/evtchn: Add support for restricting a handle")
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: 60245b71c1cd001686fa7b7a26869cbcb80d074c
-master date: 2022-04-22 20:39:34 +0100
----
- tools/libs/evtchn/freebsd.c | 2 +-
- tools/libs/evtchn/minios.c | 2 +-
- tools/libs/evtchn/netbsd.c | 2 +-
- tools/libs/evtchn/solaris.c | 2 +-
- 4 files changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/tools/libs/evtchn/freebsd.c b/tools/libs/evtchn/freebsd.c
-index 7427ab240860..fa17a0f8dbb5 100644
---- a/tools/libs/evtchn/freebsd.c
-+++ b/tools/libs/evtchn/freebsd.c
-@@ -58,7 +58,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
-
- int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
- {
-- errno = -EOPNOTSUPP;
-+ errno = EOPNOTSUPP;
-
- return -1;
- }
-diff --git a/tools/libs/evtchn/minios.c b/tools/libs/evtchn/minios.c
-index e5dfdc5ef52e..c0bd5429eea2 100644
---- a/tools/libs/evtchn/minios.c
-+++ b/tools/libs/evtchn/minios.c
-@@ -97,7 +97,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
-
- int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
- {
-- errno = -EOPNOTSUPP;
-+ errno = EOPNOTSUPP;
-
- return -1;
- }
-diff --git a/tools/libs/evtchn/netbsd.c b/tools/libs/evtchn/netbsd.c
-index 1cebc21ffce0..56409513bc23 100644
---- a/tools/libs/evtchn/netbsd.c
-+++ b/tools/libs/evtchn/netbsd.c
-@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
-
- int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
- {
-- errno = -EOPNOTSUPP;
-+ errno = EOPNOTSUPP;
-
- return -1;
- }
-diff --git a/tools/libs/evtchn/solaris.c b/tools/libs/evtchn/solaris.c
-index df9579df1778..beaa7721425f 100644
---- a/tools/libs/evtchn/solaris.c
-+++ b/tools/libs/evtchn/solaris.c
-@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
-
- int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
- {
-- errno = -EOPNOTSUPP;
-+ errno = EOPNOTSUPP;
- return -1;
- }
-
---
-2.35.1
-
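The errno fixes in this part of the series all correct the same convention slip: errno carries positive E-codes; negative values belong to raw return conventions (e.g. Linux syscall returns), and a negative errno confuses strerror() and perror(). A short illustration with a hypothetical wrapper (not the libxenevtchn API):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical fallible operation following the libc convention:
     * return -1 and set errno to a *positive* error code. */
    static int op_unsupported(void)
    {
        errno = EOPNOTSUPP;  /* -EOPNOTSUPP here would make strerror()
                                report an unknown error at best */
        return -1;
    }

    int main(void)
    {
        if (op_unsupported() < 0)
            fprintf(stderr, "failed: %s\n", strerror(errno));
        return 0;
    }
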
diff --git a/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch b/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch
new file mode 100644
index 0000000..5e2f8ab
--- /dev/null
+++ b/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch
@@ -0,0 +1,279 @@
+From 28d3f677ec97c98154311f64871ac48762cf980a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 14:53:27 +0200
+Subject: [PATCH 07/26] x86/shadow: tolerate failure in shadow_prealloc()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
+the pre-allocation and instead return true/false. Modify
+shadow_prealloc() to crash the domain on allocation failure (if the
+domain is not already dying), as shadow cannot operate normally after
+that. Modify callers to also gracefully handle {_,}shadow_prealloc()
+failing to fulfill the request.
+
+Note this in turn requires adjusting the callers of
+sh_make_monitor_table() also to handle it returning INVALID_MFN.
+sh_update_paging_modes() is also modified to add additional error
+paths in case of allocation failure, some of those will return with
+null monitor page tables (and the domain likely crashed). This is no
+different than current error paths, but the newly introduced ones are
+more likely to trigger.
+
+The now added failure points in sh_update_paging_modes() also require
+that on some error return paths the previous structures are cleared,
+and thus the monitor table is null.
+
+While there adjust the 'type' parameter type of shadow_prealloc() to
+unsigned int rather than u32.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+master commit: b7f93c6afb12b6061e2d19de2f39ea09b569ac68
+master date: 2022-10-11 14:22:53 +0200
+---
+ xen/arch/x86/mm/shadow/common.c | 69 ++++++++++++++++++++++++--------
+ xen/arch/x86/mm/shadow/hvm.c | 4 +-
+ xen/arch/x86/mm/shadow/multi.c | 11 +++--
+ xen/arch/x86/mm/shadow/private.h | 3 +-
+ 4 files changed, 66 insertions(+), 21 deletions(-)
+
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 00e520cbd05b..2067c7d16bb4 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -36,6 +36,7 @@
+ #include <asm/flushtlb.h>
+ #include <asm/shadow.h>
+ #include <xen/numa.h>
++#include <public/sched.h>
+ #include "private.h"
+
+ DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
+@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
+
+ /* Make sure there are at least count order-sized pages
+ * available in the shadow page pool. */
+-static void _shadow_prealloc(struct domain *d, unsigned int pages)
++static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
+ {
+ struct vcpu *v;
+ struct page_info *sp, *t;
+ mfn_t smfn;
+ int i;
+
+- if ( d->arch.paging.shadow.free_pages >= pages ) return;
++ if ( d->arch.paging.shadow.free_pages >= pages )
++ return true;
+
+ /* Shouldn't have enabled shadows if we've no vcpus. */
+ ASSERT(d->vcpu && d->vcpu[0]);
+@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages)
+ sh_unpin(d, smfn);
+
+ /* See if that freed up enough space */
+- if ( d->arch.paging.shadow.free_pages >= pages ) return;
++ if ( d->arch.paging.shadow.free_pages >= pages )
++ return true;
+ }
+
+ /* Stage two: all shadow pages are in use in hierarchies that are
+@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages)
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ {
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+- return;
++ return true;
+ }
+ }
+ }
+@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages)
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
+- BUG();
++
++ ASSERT(d->is_dying);
++
++ guest_flush_tlb_mask(d, d->dirty_cpumask);
++
++ return false;
+ }
+
+ /* Make sure there are at least count pages of the order according to
+@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages)
+ * This must be called before any calls to shadow_alloc(). Since this
+ * will free existing shadows to make room, it must be called early enough
+ * to avoid freeing shadows that the caller is currently working on. */
+-void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
++bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
+ {
+- return _shadow_prealloc(d, shadow_size(type) * count);
++ bool ret = _shadow_prealloc(d, shadow_size(type) * count);
++
++ if ( !ret && !d->is_dying &&
++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
++ /*
++ * Failing to allocate memory required for shadow usage can only result in
++ * a domain crash, do it here rather than relying on every caller to do it.
++ */
++ domain_crash(d);
++
++ return ret;
+ }
+
+ /* Deliberately free all the memory we can: this will tear down all of
+@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t smfn)
+ static struct page_info *
+ shadow_alloc_p2m_page(struct domain *d)
+ {
+- struct page_info *pg;
++ struct page_info *pg = NULL;
+
+ /* This is called both from the p2m code (which never holds the
+ * paging lock) and the log-dirty code (which always does). */
+@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d)
+ d->arch.paging.shadow.p2m_pages,
+ shadow_min_acceptable_pages(d));
+ }
+- paging_unlock(d);
+- return NULL;
++ goto out;
+ }
+
+- shadow_prealloc(d, SH_type_p2m_table, 1);
++ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) )
++ goto out;
++
+ pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
+ d->arch.paging.shadow.p2m_pages++;
+ d->arch.paging.shadow.total_pages--;
+ ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
+
++ out:
+ paging_unlock(d);
+
+ return pg;
+@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted)
+ else if ( d->arch.paging.shadow.total_pages > pages )
+ {
+ /* Need to return memory to domheap */
+- _shadow_prealloc(d, 1);
++ if ( !_shadow_prealloc(d, 1) )
++ return -ENOMEM;
++
+ sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
+ ASSERT(sp);
+ /*
+@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struct vcpu *v)
+ if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
+ {
+ int i;
++
++ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) )
++ return;
++
+ for(i = 0; i < SHADOW_OOS_PAGES; i++)
+- {
+- shadow_prealloc(d, SH_type_oos_snapshot, 1);
+ v->arch.paging.shadow.oos_snapshot[i] =
+ shadow_alloc(d, SH_type_oos_snapshot, 0);
+- }
+ }
+ #endif /* OOS */
+
+@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struct vcpu *v)
+ mfn_t mmfn = sh_make_monitor_table(
+ v, v->arch.paging.mode->shadow.shadow_levels);
+
++ if ( mfn_eq(mmfn, INVALID_MFN) )
++ return;
++
+ v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
+ make_cr3(v, mmfn);
+ hvm_update_host_cr3(v);
+@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struct vcpu *v)
+ v->arch.hvm.monitor_table = pagetable_null();
+ new_mfn = sh_make_monitor_table(
+ v, v->arch.paging.mode->shadow.shadow_levels);
++ if ( mfn_eq(new_mfn, INVALID_MFN) )
++ {
++ sh_destroy_monitor_table(v, old_mfn,
++ old_mode->shadow.shadow_levels);
++ return;
++ }
+ v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn);
+ SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
+ mfn_x(new_mfn));
+@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu *v,
+ if ( !mfn_valid(smfn) )
+ {
+ /* Make sure there's enough free shadow memory. */
+- shadow_prealloc(d, root_type, 1);
++ if ( !shadow_prealloc(d, root_type, 1) )
++ {
++ new_entry = pagetable_null();
++ goto install_new_entry;
++ }
++
+ /* Shadow the page. */
+ smfn = make_shadow(v, gmfn, root_type);
+ }
+diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c
+index d5f42102a0bd..a0878d9ad71a 100644
+--- a/xen/arch/x86/mm/shadow/hvm.c
++++ b/xen/arch/x86/mm/shadow/hvm.c
+@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct vcpu *v, unsigned int shadow_levels)
+ ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table));
+
+ /* Guarantee we can get the memory we need */
+- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
++ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) )
++ return INVALID_MFN;
++
+ m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
+ mfn_to_page(m4mfn)->shadow_flags = 4;
+
+diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
+index 2ff78fe3362c..c07af0bd99da 100644
+--- a/xen/arch/x86/mm/shadow/multi.c
++++ b/xen/arch/x86/mm/shadow/multi.c
+@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v,
+ * Preallocate shadow pages *before* removing writable accesses
+ * otherwise an OOS L1 might be demoted and promoted again with
+ * writable mappings. */
+- shadow_prealloc(d,
+- SH_type_l1_shadow,
+- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
++ if ( !shadow_prealloc(d, SH_type_l1_shadow,
++ GUEST_PAGING_LEVELS < 4
++ ? 1 : GUEST_PAGING_LEVELS - 1) )
++ {
++ paging_unlock(d);
++ put_gfn(d, gfn_x(gfn));
++ return 0;
++ }
+
+ rc = gw_remove_write_accesses(v, va, &gw);
+
+diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
+index 35efb1b984fb..738214f75e8d 100644
+--- a/xen/arch/x86/mm/shadow/private.h
++++ b/xen/arch/x86/mm/shadow/private.h
+@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mfn_t gmfn, u32 type);
+ void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
+
+ /* Shadow page allocation functions */
+-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
++bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type,
++ unsigned int count);
+ mfn_t shadow_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer);
+--
+2.37.3
+
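Converting a BUG() into a __must_check boolean, with the fatal reaction centralized in a single wrapper, is the structural point of this patch: leaf code merely reports failure, one place decides whether failure crashes the domain, and the compiler forces every caller to inspect the result. A compiler-checked sketch of that shape (GCC/Clang attribute; all names illustrative):

    #include <stdbool.h>

    #define MUST_CHECK __attribute__((warn_unused_result))

    static bool pool_exhausted = true;  /* pretend allocator state */
    static bool domain_dying;

    static MUST_CHECK bool prealloc_try(unsigned int pages)
    {
        (void)pages;
        return !pool_exhausted;  /* true == request can be fulfilled */
    }

    /* Wrapper in the role of shadow_prealloc(): escalate once, here,
     * instead of duplicating the crash logic in every caller. */
    static MUST_CHECK bool prealloc(unsigned int pages)
    {
        bool ok = prealloc_try(pages);

        if (!ok && !domain_dying)
        {
            /* The real code calls domain_crash(d) at this point, so
             * callers can simply unwind on failure. */
        }
        return ok;
    }

    int main(void)
    {
        return prealloc(4) ? 0 : 1;  /* result must be consumed */
    }
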
diff --git a/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch b/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch
deleted file mode 100644
index 166f0ff..0000000
--- a/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-From ba62afdbc31a8cfe897191efd25ed4449d9acd94 Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 7 Jun 2022 14:01:03 +0200
-Subject: [PATCH 08/51] tools/libs/ctrl: don't set errno to a negative value
-
-The claimed reason for setting errno to -1 is wrong. On x86
-xc_domain_pod_target() will set errno to a sane value in the error
-case.
-
-Fixes: ff1745d5882b ("tools: libxl: do not set the PoD target on ARM")
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: a0fb7e0e73483ed042d5ca34861a891a51ad337b
-master date: 2022-04-22 20:39:34 +0100
----
- tools/libs/ctrl/xc_domain.c | 4 +---
- 1 file changed, 1 insertion(+), 3 deletions(-)
-
-diff --git a/tools/libs/ctrl/xc_domain.c b/tools/libs/ctrl/xc_domain.c
-index b155d6afd2ef..9d675c8f21e1 100644
---- a/tools/libs/ctrl/xc_domain.c
-+++ b/tools/libs/ctrl/xc_domain.c
-@@ -1297,9 +1297,7 @@ int xc_domain_get_pod_target(xc_interface *xch,
- uint64_t *pod_cache_pages,
- uint64_t *pod_entries)
- {
-- /* On x86 (above) xc_domain_pod_target will incorrectly return -1
-- * with errno==-1 on error. Do the same for least surprise. */
-- errno = -1;
-+ errno = EOPNOTSUPP;
- return -1;
- }
- #endif
---
-2.35.1
-
diff --git a/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch b/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch
new file mode 100644
index 0000000..70b5cc9
--- /dev/null
+++ b/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch
@@ -0,0 +1,100 @@
+From 745e0b300dc3f5000e6d48c273b405d4bcc29ba7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 14:53:41 +0200
+Subject: [PATCH 08/26] x86/p2m: refuse new allocations for dying domains
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This will in particular prevent any attempts to add entries to the p2m,
+once - in a subsequent change - non-root entries have been removed.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+master commit: ff600a8cf8e36f8ecbffecf96a035952e022ab87
+master date: 2022-10-11 14:23:22 +0200
+---
+ xen/arch/x86/mm/hap/hap.c | 5 ++++-
+ xen/arch/x86/mm/shadow/common.c | 18 ++++++++++++++----
+ 2 files changed, 18 insertions(+), 5 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index d75dc2b9ed3d..787991233e53 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struct domain *d)
+
+ ASSERT(paging_locked_by_me(d));
+
++ if ( unlikely(d->is_dying) )
++ return NULL;
++
+ pg = page_list_remove_head(&d->arch.paging.hap.freelist);
+ if ( unlikely(!pg) )
+ return NULL;
+@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d)
+ d->arch.paging.hap.p2m_pages++;
+ ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
+ }
+- else if ( !d->arch.paging.p2m_alloc_failed )
++ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying )
+ {
+ d->arch.paging.p2m_alloc_failed = 1;
+ dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n",
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 2067c7d16bb4..9807f6ec6c00 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -939,6 +939,10 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ return true;
+
++ if ( unlikely(d->is_dying) )
++ /* No reclaim when the domain is dying, teardown will take care of it. */
++ return false;
++
+ /* Shouldn't have enabled shadows if we've no vcpus. */
+ ASSERT(d->vcpu && d->vcpu[0]);
+
+@@ -991,7 +995,7 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
+
+- ASSERT(d->is_dying);
++ ASSERT_UNREACHABLE();
+
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+
+@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
+ * to avoid freeing shadows that the caller is currently working on. */
+ bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
+ {
+- bool ret = _shadow_prealloc(d, shadow_size(type) * count);
++ bool ret;
+
+- if ( !ret && !d->is_dying &&
+- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
++ if ( unlikely(d->is_dying) )
++ return false;
++
++ ret = _shadow_prealloc(d, shadow_size(type) * count);
++ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ /*
+ * Failing to allocate memory required for shadow usage can only result in
+ * a domain crash, do it here rather that relying on every caller to do it.
+@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d)
+ {
+ struct page_info *pg = NULL;
+
++ if ( unlikely(d->is_dying) )
++ return NULL;
++
+ /* This is called both from the p2m code (which never holds the
+ * paging lock) and the log-dirty code (which always does). */
+ paging_lock_recursive(d);
+--
+2.37.3
+
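The essence of the patch above is a guard clause at the top of each pool
allocator. A minimal standalone C sketch of that pattern, using toy types
rather than Xen's real structures:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-ins for Xen's struct domain and its paging pool. */
    struct pool { unsigned long free_pages; };
    struct domain { bool is_dying; struct pool pool; };

    /* Same shape as the patched hap_alloc()/shadow_alloc_p2m_page():
     * refuse before touching the pool if the domain is dying. */
    static void *pool_alloc(struct domain *d)
    {
        if (d->is_dying)
            return NULL;        /* no new pages for dying domains */
        if (!d->pool.free_pages)
            return NULL;
        d->pool.free_pages--;
        return calloc(1, 4096); /* stands in for a pool page */
    }

    int main(void)
    {
        struct domain d = { .is_dying = false, .pool = { 2 } };
        void *pg = pool_alloc(&d);

        printf("live domain:  %s\n", pg ? "allocated" : "refused");
        d.is_dying = true;
        printf("dying domain: %s\n", pool_alloc(&d) ? "allocated" : "refused");
        free(pg);
        return 0;
    }
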
diff --git a/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch b/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch
deleted file mode 100644
index 5d035f6..0000000
--- a/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From a2cf30eec08db5df974a9e8bb7366fee8fc7fcd9 Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 7 Jun 2022 14:01:27 +0200
-Subject: [PATCH 09/51] tools/libs/guest: don't set errno to a negative value
-
-Setting errno to a negative error value makes no sense.
-
-Fixes: cb99a64029c9 ("libxc: arm: allow passing a device tree blob to the guest")
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: 438e96ab479495a932391a22e219ee62fa8c4f47
-master date: 2022-04-22 20:39:34 +0100
----
- tools/libs/guest/xg_dom_core.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/libs/guest/xg_dom_core.c b/tools/libs/guest/xg_dom_core.c
-index 2e4c1330ea6b..65975a75da37 100644
---- a/tools/libs/guest/xg_dom_core.c
-+++ b/tools/libs/guest/xg_dom_core.c
-@@ -856,7 +856,7 @@ int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename)
- return -1;
- return 0;
- #else
-- errno = -EINVAL;
-+ errno = EINVAL;
- return -1;
- #endif
- }
---
-2.35.1
-
diff --git a/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch b/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch
new file mode 100644
index 0000000..07e63ac
--- /dev/null
+++ b/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch
@@ -0,0 +1,115 @@
+From 943635d8f8486209e4e48966507ad57963e96284 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 14:54:00 +0200
+Subject: [PATCH 09/26] x86/p2m: truly free paging pool memory for dying
+ domains
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Modify {hap,shadow}_free to free the page immediately if the domain is
+dying, so that pages don't accumulate in the pool when
+{shadow,hap}_final_teardown() get called. This is to limit the amount of
+work which needs to be done there (in a non-preemptable manner).
+
+Note the call to shadow_free() in shadow_free_p2m_page() is moved after
+increasing total_pages, so that the decrease done in shadow_free() in
+case the domain is dying doesn't underflow the counter, even if just for
+a short interval.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+master commit: f50a2c0e1d057c00d6061f40ae24d068226052ad
+master date: 2022-10-11 14:23:51 +0200
+---
+ xen/arch/x86/mm/hap/hap.c | 12 ++++++++++++
+ xen/arch/x86/mm/shadow/common.c | 28 +++++++++++++++++++++++++---
+ 2 files changed, 37 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index 787991233e53..aef2297450e1 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, mfn_t mfn)
+
+ ASSERT(paging_locked_by_me(d));
+
++ /*
++ * For dying domains, actually free the memory here. This way less work is
++ * left to hap_final_teardown(), which cannot easily have preemption checks
++ * added.
++ */
++ if ( unlikely(d->is_dying) )
++ {
++ free_domheap_page(pg);
++ d->arch.paging.hap.total_pages--;
++ return;
++ }
++
+ d->arch.paging.hap.free_pages++;
+ page_list_add_tail(pg, &d->arch.paging.hap.freelist);
+ }
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 9807f6ec6c00..9eb33eafc7f7 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d,
+ void shadow_free(struct domain *d, mfn_t smfn)
+ {
+ struct page_info *next = NULL, *sp = mfn_to_page(smfn);
++ bool dying = ACCESS_ONCE(d->is_dying);
+ struct page_list_head *pin_list;
+ unsigned int pages;
+ u32 shadow_type;
+@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t smfn)
+ * just before the allocator hands the page out again. */
+ page_set_tlbflush_timestamp(sp);
+ perfc_decr(shadow_alloc_count);
+- page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
++
++ /*
++ * For dying domains, actually free the memory here. This way less
++ * work is left to shadow_final_teardown(), which cannot easily have
++ * preemption checks added.
++ */
++ if ( unlikely(dying) )
++ {
++ /*
++ * The backpointer field (sh.back) used by shadow code aliases the
++ * domain owner field, unconditionally clear it here to avoid
++ * free_domheap_page() attempting to parse it.
++ */
++ page_set_owner(sp, NULL);
++ free_domheap_page(sp);
++ }
++ else
++ page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
++
+ sp = next;
+ }
+
+- d->arch.paging.shadow.free_pages += pages;
++ if ( unlikely(dying) )
++ d->arch.paging.shadow.total_pages -= pages;
++ else
++ d->arch.paging.shadow.free_pages += pages;
+ }
+
+ /* Divert a page from the pool to be used by the p2m mapping.
+@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
+ * paging lock) and the log-dirty code (which always does). */
+ paging_lock_recursive(d);
+
+- shadow_free(d, page_to_mfn(pg));
+ d->arch.paging.shadow.p2m_pages--;
+ d->arch.paging.shadow.total_pages++;
++ shadow_free(d, page_to_mfn(pg));
+
+ paging_unlock(d);
+ }
+--
+2.37.3
+
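This patch is the dual of the allocation guard: on the free path, a dying
domain's page goes straight back to the system allocator and shrinks the
pool total, rather than parking on the freelist until final teardown. A
simplified C sketch (toy list types, no locking, not Xen's real code):

    #include <stdbool.h>
    #include <stdlib.h>

    struct page { struct page *next; };
    struct pool {
        struct page *freelist;
        unsigned long free_pages, total_pages;
    };
    struct domain { bool is_dying; struct pool pool; };

    /* Mirrors the patched hap_free()/shadow_free(): dying domains free
     * eagerly and decrement total_pages; live domains recycle the page. */
    static void pool_free(struct domain *d, struct page *pg)
    {
        if (d->is_dying) {
            free(pg);
            d->pool.total_pages--;
            return;
        }
        pg->next = d->pool.freelist;
        d->pool.freelist = pg;
        d->pool.free_pages++;
    }

The same reasoning explains the reordering in shadow_free_p2m_page() above:
total_pages is bumped before shadow_free() runs, so the dying-domain
decrement never transiently underflows the counter.
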
diff --git a/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch b/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch
deleted file mode 100644
index ac900ae..0000000
--- a/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 15391de8e2bb6153eadd483154c53044ab53d98d Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 7 Jun 2022 14:01:44 +0200
-Subject: [PATCH 10/51] tools/libs/light: don't set errno to a negative value
-
-Setting errno to a negative value makes no sense.
-
-Fixes: e78e8b9bb649 ("libxl: Add interface for querying hypervisor about PCI topology")
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: 2419a159fb943c24a6f2439604b9fdb1478fcd08
-master date: 2022-04-22 20:39:34 +0100
----
- tools/libs/light/libxl_linux.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/libs/light/libxl_linux.c b/tools/libs/light/libxl_linux.c
-index 8d62dfd255cb..27f2bce71837 100644
---- a/tools/libs/light/libxl_linux.c
-+++ b/tools/libs/light/libxl_linux.c
-@@ -288,7 +288,7 @@ int libxl__pci_topology_init(libxl__gc *gc,
- if (i == num_devs) {
- LOG(ERROR, "Too many devices");
- err = ERROR_FAIL;
-- errno = -ENOSPC;
-+ errno = ENOSPC;
- goto out;
- }
-
---
-2.35.1
-
diff --git a/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch b/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch
new file mode 100644
index 0000000..59c6940
--- /dev/null
+++ b/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch
@@ -0,0 +1,181 @@
+From f5959ed715e19cf2844656477dbf74c2f576c9d4 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 14:54:21 +0200
+Subject: [PATCH 10/26] x86/p2m: free the paging memory pool preemptively
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The paging memory pool is currently freed in two different places:
+from {shadow,hap}_teardown() via domain_relinquish_resources() and
+from {shadow,hap}_final_teardown() via complete_domain_destroy().
+While the former does handle preemption, the latter doesn't.
+
+Attempt to move as much p2m related freeing as possible to happen
+before the call to {shadow,hap}_teardown(), so that most memory can be
+freed in a preemptive way. In order to avoid causing issues to
+existing callers leave the root p2m page tables set and free them in
+{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free
+the page immediately if the domain is dying, so that pages don't
+accumulate in the pool when {shadow,hap}_final_teardown() get called.
+
+Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's
+the place where altp2m_active gets disabled now.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Reported-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+master commit: e7aa55c0aab36d994bf627c92bd5386ae167e16e
+master date: 2022-10-11 14:24:21 +0200
+---
+ xen/arch/x86/domain.c | 7 ------
+ xen/arch/x86/mm/hap/hap.c | 42 ++++++++++++++++++++-------------
+ xen/arch/x86/mm/shadow/common.c | 12 ++++++++++
+ 3 files changed, 38 insertions(+), 23 deletions(-)
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 0d39981550ca..a4356893bdbc 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -38,7 +38,6 @@
+ #include <xen/livepatch.h>
+ #include <public/sysctl.h>
+ #include <public/hvm/hvm_vcpu.h>
+-#include <asm/altp2m.h>
+ #include <asm/regs.h>
+ #include <asm/mc146818rtc.h>
+ #include <asm/system.h>
+@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct domain *d)
+ vpmu_destroy(v);
+ }
+
+- if ( altp2m_active(d) )
+- {
+- for_each_vcpu ( d, v )
+- altp2m_vcpu_disable_ve(v);
+- }
+-
+ if ( is_pv_domain(d) )
+ {
+ for_each_vcpu ( d, v )
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index aef2297450e1..a44fcfd95e1e 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -28,6 +28,7 @@
+ #include <xen/domain_page.h>
+ #include <xen/guest_access.h>
+ #include <xen/keyhandler.h>
++#include <asm/altp2m.h>
+ #include <asm/event.h>
+ #include <asm/page.h>
+ #include <asm/current.h>
+@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d)
+ unsigned int i;
+
+ if ( hvm_altp2m_supported() )
+- {
+- d->arch.altp2m_active = 0;
+-
+- if ( d->arch.altp2m_eptp )
+- {
+- free_xenheap_page(d->arch.altp2m_eptp);
+- d->arch.altp2m_eptp = NULL;
+- }
+-
+- if ( d->arch.altp2m_visible_eptp )
+- {
+- free_xenheap_page(d->arch.altp2m_visible_eptp);
+- d->arch.altp2m_visible_eptp = NULL;
+- }
+-
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ p2m_teardown(d->arch.altp2m_p2m[i], true);
+- }
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d)
+ paging_lock(d);
+ hap_set_allocation(d, 0, NULL);
+ ASSERT(d->arch.paging.hap.p2m_pages == 0);
++ ASSERT(d->arch.paging.hap.free_pages == 0);
++ ASSERT(d->arch.paging.hap.total_pages == 0);
+ paging_unlock(d);
+ }
+
+@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v)
+ void hap_teardown(struct domain *d, bool *preempted)
+ {
+ struct vcpu *v;
++ unsigned int i;
+
+ ASSERT(d->is_dying);
+ ASSERT(d != current->domain);
+@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool *preempted)
+ for_each_vcpu ( d, v )
+ hap_vcpu_teardown(v);
+
++ /* Leave the root pt in case we get further attempts to modify the p2m. */
++ if ( hvm_altp2m_supported() )
++ {
++ if ( altp2m_active(d) )
++ for_each_vcpu ( d, v )
++ altp2m_vcpu_disable_ve(v);
++
++ d->arch.altp2m_active = 0;
++
++ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp);
++ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
++
++ for ( i = 0; i < MAX_ALTP2M; i++ )
++ p2m_teardown(d->arch.altp2m_p2m[i], false);
++ }
++
++ /* Destroy nestedp2m's after altp2m. */
++ for ( i = 0; i < MAX_NESTEDP2M; i++ )
++ p2m_teardown(d->arch.nested_p2m[i], false);
++
++ p2m_teardown(p2m_get_hostp2m(d), false);
++
+ paging_lock(d); /* Keep various asserts happy */
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index 9eb33eafc7f7..ac9a1ae07808 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, bool *preempted)
+ for_each_vcpu ( d, v )
+ shadow_vcpu_teardown(v);
+
++ p2m_teardown(p2m_get_hostp2m(d), false);
++
+ paging_lock(d);
+
++ /*
++ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find
++ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages
++ * because the domain is dying.
++ */
++ shadow_blow_tables(d);
++
+ #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
+ /* Free the virtual-TLB array attached to each vcpu */
+ for_each_vcpu(d, v)
+@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain *d)
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
++ ASSERT(!d->arch.paging.shadow.total_pages);
++ ASSERT(!d->arch.paging.shadow.free_pages);
++ ASSERT(!d->arch.paging.shadow.p2m_pages);
+ paging_unlock(d);
+ }
+
+--
+2.37.3
+
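The structural idea of the patch above is to drain the P2M page list during
the preemptible teardown while deliberately keeping the root page tables
installed, leaving only those for the non-preemptible final teardown. A
standalone C sketch of the "free everything but the root" walk, with a
hypothetical list type in place of Xen's page_list machinery:

    #include <stdlib.h>

    struct page { struct page *next; };

    /* Free every page on the singly-linked list except 'root', which
     * stays linked in so later p2m updates still find a valid root. */
    static void drain_keep_root(struct page **list, const struct page *root)
    {
        struct page **pp = list;

        while (*pp) {
            struct page *pg = *pp;

            if (pg == root) {   /* skip over the root page table */
                pp = &pg->next;
                continue;
            }
            *pp = pg->next;     /* unlink and free everything else */
            free(pg);
        }
    }
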
diff --git a/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch b/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch
deleted file mode 100644
index 3c60de4..0000000
--- a/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch
+++ /dev/null
@@ -1,112 +0,0 @@
-From a6c32abd144ec6443c6a433b5a2ac00e2615aa86 Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 7 Jun 2022 14:02:08 +0200
-Subject: [PATCH 11/51] xen/iommu: cleanup iommu related domctl handling
-
-Today iommu_do_domctl() is being called from arch_do_domctl() in the
-"default:" case of a switch statement. This has led already to crashes
-due to unvalidated parameters.
-
-Fix that by moving the call of iommu_do_domctl() to the main switch
-statement of do_domctl().
-
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> # Arm
-master commit: 9cd7e31b3f584e97a138a770cfb031a91a867936
-master date: 2022-04-26 10:23:58 +0200
----
- xen/arch/arm/domctl.c | 11 +----------
- xen/arch/x86/domctl.c | 2 +-
- xen/common/domctl.c | 7 +++++++
- xen/include/xen/iommu.h | 12 +++++++++---
- 4 files changed, 18 insertions(+), 14 deletions(-)
-
-diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
-index 6245af6d0bab..1baf25c3d98b 100644
---- a/xen/arch/arm/domctl.c
-+++ b/xen/arch/arm/domctl.c
-@@ -176,16 +176,7 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
- return rc;
- }
- default:
-- {
-- int rc;
--
-- rc = subarch_do_domctl(domctl, d, u_domctl);
--
-- if ( rc == -ENOSYS )
-- rc = iommu_do_domctl(domctl, d, u_domctl);
--
-- return rc;
-- }
-+ return subarch_do_domctl(domctl, d, u_domctl);
- }
- }
-
-diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
-index 7d102e0647ec..0fa51f2ebd10 100644
---- a/xen/arch/x86/domctl.c
-+++ b/xen/arch/x86/domctl.c
-@@ -1380,7 +1380,7 @@ long arch_do_domctl(
- break;
-
- default:
-- ret = iommu_do_domctl(domctl, d, u_domctl);
-+ ret = -ENOSYS;
- break;
- }
-
-diff --git a/xen/common/domctl.c b/xen/common/domctl.c
-index 419e4070f59d..65d2a4588b71 100644
---- a/xen/common/domctl.c
-+++ b/xen/common/domctl.c
-@@ -870,6 +870,13 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
- copyback = 1;
- break;
-
-+ case XEN_DOMCTL_assign_device:
-+ case XEN_DOMCTL_test_assign_device:
-+ case XEN_DOMCTL_deassign_device:
-+ case XEN_DOMCTL_get_device_group:
-+ ret = iommu_do_domctl(op, d, u_domctl);
-+ break;
-+
- default:
- ret = arch_do_domctl(op, d, u_domctl);
- break;
-diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
-index 92b2d23f0ba2..861579562e8a 100644
---- a/xen/include/xen/iommu.h
-+++ b/xen/include/xen/iommu.h
-@@ -342,8 +342,17 @@ struct domain_iommu {
- /* Does the IOMMU pagetable need to be kept synchronized with the P2M */
- #ifdef CONFIG_HAS_PASSTHROUGH
- #define need_iommu_pt_sync(d) (dom_iommu(d)->need_sync)
-+
-+int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d,
-+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl);
- #else
- #define need_iommu_pt_sync(d) ({ (void)(d); false; })
-+
-+static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d,
-+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
-+{
-+ return -ENOSYS;
-+}
- #endif
-
- int __must_check iommu_suspend(void);
-@@ -357,9 +366,6 @@ int iommu_do_pci_domctl(struct xen_domctl *, struct domain *d,
- XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
- #endif
-
--int iommu_do_domctl(struct xen_domctl *, struct domain *d,
-- XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
--
- void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev);
-
- /*
---
-2.35.1
-
diff --git a/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch b/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch
new file mode 100644
index 0000000..5520627
--- /dev/null
+++ b/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch
@@ -0,0 +1,197 @@
+From a603386b422f5cb4c5e2639a7e20a1d99dba2175 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 11 Oct 2022 14:54:44 +0200
+Subject: [PATCH 11/26] xen/x86: p2m: Add preemption in p2m_teardown()
+
+The list p2m->pages contains all the pages used by the P2M. On large
+instances this can be quite large and the time spent calling
+d->arch.paging.free_page() will take more than 1ms for a 80GB guest
+on a Xen running in nested environment on a c5.metal.
+
+By extrapolation, it would take > 100ms for an 8TB guest (the maximum
+we currently security support). So add some preemption in p2m_teardown()
+and propagate it to the callers. Note there are 3 places where
+the preemption is not enabled:
+ - hap_final_teardown()/shadow_final_teardown(): We are
+ preventing updates to the P2M once the domain is dying (so
+ no more pages can be allocated) and most of the P2M pages
+ will be freed in a preemptive manner when relinquishing the
+ resources. So it is fine to disable preemption.
+ - shadow_enable(): This is fine because it will undo the allocation
+ that may have been made by p2m_alloc_table() (so only the root
+ page table).
+
+The preemption is arbitrarily checked every 1024 iterations.
+
+We now need to include <xen/event.h> in p2m-basic in order to
+import the definition for local_events_need_delivery() used by
+general_preempt_check(). Ideally, the inclusion should happen in
+xen/sched.h but it opened a can of worms.
+
+Note that with the current approach, Xen doesn't keep track of whether
+the alt/nested P2Ms have been cleared. So there is some redundant work.
+However, this is not expected to incur too much overhead (the P2M lock
+shouldn't be contended during teardown). So this optimization is
+left outside of the security event.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+master commit: 8a2111250b424edc49c65c4d41b276766d30635c
+master date: 2022-10-11 14:24:48 +0200
+---
+ xen/arch/x86/mm/hap/hap.c | 22 ++++++++++++++++------
+ xen/arch/x86/mm/p2m.c | 18 +++++++++++++++---
+ xen/arch/x86/mm/shadow/common.c | 12 +++++++++---
+ xen/include/asm-x86/p2m.h | 2 +-
+ 4 files changed, 41 insertions(+), 13 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index a44fcfd95e1e..1f9a157a0c34 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d)
+
+ if ( hvm_altp2m_supported() )
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+- p2m_teardown(d->arch.altp2m_p2m[i], true);
++ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL);
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+- p2m_teardown(d->arch.nested_p2m[i], true);
++ p2m_teardown(d->arch.nested_p2m[i], true, NULL);
+ }
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+ hap_teardown(d, NULL);
+
+- p2m_teardown(p2m_get_hostp2m(d), true);
++ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
+ /* Free any memory that the p2m teardown released */
+ paging_lock(d);
+ hap_set_allocation(d, 0, NULL);
+@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool *preempted)
+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+- p2m_teardown(d->arch.altp2m_p2m[i], false);
++ {
++ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted);
++ if ( preempted && *preempted )
++ return;
++ }
+ }
+
+ /* Destroy nestedp2m's after altp2m. */
+ for ( i = 0; i < MAX_NESTEDP2M; i++ )
+- p2m_teardown(d->arch.nested_p2m[i], false);
++ {
++ p2m_teardown(d->arch.nested_p2m[i], false, preempted);
++ if ( preempted && *preempted )
++ return;
++ }
+
+- p2m_teardown(p2m_get_hostp2m(d), false);
++ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
++ if ( preempted && *preempted )
++ return;
+
+ paging_lock(d); /* Keep various asserts happy */
+
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index aba4f17cbe12..8781df9dda8d 100644
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p2m)
+ * hvm fixme: when adding support for pvh non-hardware domains, this path must
+ * cleanup any foreign p2m types (release refcnts on them).
+ */
+-void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted)
+ /* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+ {
+ struct page_info *pg, *root_pg = NULL;
+ struct domain *d;
++ unsigned int i = 0;
+
+ if (p2m == NULL)
+ return;
+@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
+ }
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+- if ( pg != root_pg )
+- d->arch.paging.free_page(d, pg);
++ {
++ if ( pg == root_pg )
++ continue;
++
++ d->arch.paging.free_page(d, pg);
++
++ /* Arbitrarily check preemption every 1024 iterations */
++ if ( preempted && !(++i % 1024) && general_preempt_check() )
++ {
++ *preempted = true;
++ break;
++ }
++ }
+
+ if ( root_pg )
+ page_list_add(root_pg, &p2m->pages);
+diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
+index ac9a1ae07808..3b0d781991b5 100644
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32 mode)
+ out_locked:
+ paging_unlock(d);
+ out_unlocked:
++ /*
++ * This is fine to ignore the preemption here because only the root
++ * will be allocated by p2m_alloc_table().
++ */
+ if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
+- p2m_teardown(p2m, true);
++ p2m_teardown(p2m, true, NULL);
+ if ( rv != 0 && pg != NULL )
+ {
+ pg->count_info &= ~PGC_count_mask;
+@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, bool *preempted)
+ for_each_vcpu ( d, v )
+ shadow_vcpu_teardown(v);
+
+- p2m_teardown(p2m_get_hostp2m(d), false);
++ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
++ if ( preempted && *preempted )
++ return;
+
+ paging_lock(d);
+
+@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain *d)
+ shadow_teardown(d, NULL);
+
+ /* It is now safe to pull down the p2m map. */
+- p2m_teardown(p2m_get_hostp2m(d), true);
++ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
+ /* Free any shadow memory that the p2m teardown released */
+ paging_lock(d);
+ shadow_set_allocation(d, 0, NULL);
+diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
+index c3c16748e7d5..2db9ab0122f2 100644
+--- a/xen/include/asm-x86/p2m.h
++++ b/xen/include/asm-x86/p2m.h
+@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
+ int p2m_alloc_table(struct p2m_domain *p2m);
+
+ /* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
+ void p2m_final_teardown(struct domain *d);
+
+ /* Add a page to a domain's p2m table */
+--
+2.37.3
+
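The preemption pattern the patch adds to p2m_teardown() is generic: do a
bounded amount of work, poll for pending events at a coarse granularity,
and report back so the caller can restart. A minimal C sketch, with a
hypothetical preempt_check() standing in for Xen's general_preempt_check():

    #include <stdbool.h>
    #include <stddef.h>

    /* Hypothetical stand-in for Xen's general_preempt_check(). */
    static bool preempt_check(void)
    {
        return false;
    }

    /* Process up to 'n' items, polling for preemption every 1024 items
     * (the same arbitrary granularity as the patch); returns how many
     * items were handled so the caller knows where to resume. */
    static size_t process_some(void (*handle)(size_t), size_t n,
                               bool *preempted)
    {
        size_t done = 0;

        while (done < n) {
            handle(done);
            done++;
            if (preempted && !(done % 1024) && preempt_check()) {
                *preempted = true;
                break;
            }
        }
        return done;
    }
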
diff --git a/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch b/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch
deleted file mode 100644
index 37b9005..0000000
--- a/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-From 4cf9a7c7bdb9d544fbac81105bbc1059ba3dd932 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 14:02:30 +0200
-Subject: [PATCH 12/51] IOMMU: make domctl handler tolerate NULL domain
-
-Besides the reporter's issue of hitting a NULL deref when !CONFIG_GDBSX,
-XEN_DOMCTL_test_assign_device can legitimately end up having NULL passed
-here, when the domctl was passed DOMID_INVALID.
-
-Fixes: 71e617a6b8f6 ("use is_iommu_enabled() where appropriate...")
-Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Paul Durrant <paul@xen.org>
-Reviewed-by: Juergen Gross <jgross@suse.com>
-master commit: fa4d84e6dd3c3bfd23a525b75a5483d4ce15adbb
-master date: 2022-04-26 10:25:54 +0200
----
- xen/drivers/passthrough/iommu.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
-index caaba62c8865..287f63fc736f 100644
---- a/xen/drivers/passthrough/iommu.c
-+++ b/xen/drivers/passthrough/iommu.c
-@@ -535,7 +535,7 @@ int iommu_do_domctl(
- {
- int ret = -ENODEV;
-
-- if ( !is_iommu_enabled(d) )
-+ if ( !(d ? is_iommu_enabled(d) : iommu_enabled) )
- return -EOPNOTSUPP;
-
- #ifdef CONFIG_HAS_PCI
---
-2.35.1
-
diff --git a/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch b/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch
new file mode 100644
index 0000000..9390500
--- /dev/null
+++ b/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch
@@ -0,0 +1,149 @@
+From 755a9b52844de3e1e47aa1fc9991a4240ccfbf35 Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 14:55:08 +0200
+Subject: [PATCH 12/26] libxl, docs: Use arch-specific default paging memory
+
+The default paging memory (described in the `shadow_memory` entry in xl
+config) in libxl is used to determine the memory pool size for xl
+guests. Currently this size is only used for x86, and contains a part
+of RAM to shadow the resident processes. Since there are no shadow
+mode guests on Arm, the part of RAM to shadow the resident processes
+is not necessary. Therefore, this commit splits the function
+`libxl_get_required_shadow_memory()` into arch-specific helpers and
+renames the helper to `libxl__arch_get_required_paging_memory()`.
+
+On x86, this helper returns the original value from
+`libxl_get_required_shadow_memory()` so no functional change is intended.
+
+On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
+for the P2M map and an additional 512KB.
+
+Also update the xl.cfg documentation to add Arm documentation
+according to code changes and correct the comment style following Xen
+coding style.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Suggested-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: 156a239ea288972425f967ac807b3cb5b5e14874
+master date: 2022-10-11 14:28:37 +0200
+---
+ docs/man/xl.cfg.5.pod.in | 5 +++++
+ tools/libs/light/libxl_arch.h | 4 ++++
+ tools/libs/light/libxl_arm.c | 14 ++++++++++++++
+ tools/libs/light/libxl_utils.c | 9 ++-------
+ tools/libs/light/libxl_x86.c | 13 +++++++++++++
+ 5 files changed, 38 insertions(+), 7 deletions(-)
+
+diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
+index b98d1613987e..eda1e77ebd06 100644
+--- a/docs/man/xl.cfg.5.pod.in
++++ b/docs/man/xl.cfg.5.pod.in
+@@ -1768,6 +1768,11 @@ are not using hardware assisted paging (i.e. you are using shadow
+ mode) and your guest workload consists of a very large number of
+ similar processes then increasing this value may improve performance.
+
++On Arm, this field is used to determine the size of the guest P2M pages
++pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for
++the P2M map and additional 512KB for extended regions. Users should
++adjust this value if bigger P2M pool size is needed.
++
+ =back
+
+ =head3 Processor and Platform Features
+diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h
+index 1522ecb97f72..5a060c2c3033 100644
+--- a/tools/libs/light/libxl_arch.h
++++ b/tools/libs/light/libxl_arch.h
+@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
+ libxl_domain_config *dst,
+ const libxl_domain_config *src);
+
++_hidden
++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
++ unsigned int smp_cpus);
++
+ #if defined(__i386__) || defined(__x86_64__)
+
+ #define LAPIC_BASE_ADDRESS 0xfee00000
+diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
+index eef1de093914..73a95e83af24 100644
+--- a/tools/libs/light/libxl_arm.c
++++ b/tools/libs/light/libxl_arm.c
+@@ -154,6 +154,20 @@ out:
+ return rc;
+ }
+
++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
++ unsigned int smp_cpus)
++{
++ /*
++ * 256 pages (1MB) per vcpu,
++ * plus 1 page per MiB of RAM for the P2M map,
++ * plus 1 page per MiB of extended region. This default value is 128 MiB
++ * which should be enough for domains that are not running backend.
++ * This is higher than the minimum that Xen would allocate if no value
++ * were given (but the Xen minimum is for safety, not performance).
++ */
++ return 4 * (256 * smp_cpus + maxmem_kb / 1024 + 128);
++}
++
+ static struct arch_info {
+ const char *guest_type;
+ const char *timer_compat;
+diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c
+index 4699c4a0a36f..e276c0ee9cc3 100644
+--- a/tools/libs/light/libxl_utils.c
++++ b/tools/libs/light/libxl_utils.c
+@@ -18,6 +18,7 @@
+ #include <ctype.h>
+
+ #include "libxl_internal.h"
++#include "libxl_arch.h"
+ #include "_paths.h"
+
+ #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE
+@@ -39,13 +40,7 @@ char *libxl_basename(const char *name)
+
+ unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus)
+ {
+- /* 256 pages (1MB) per vcpu,
+- plus 1 page per MiB of RAM for the P2M map,
+- plus 1 page per MiB of RAM to shadow the resident processes.
+- This is higher than the minimum that Xen would allocate if no value
+- were given (but the Xen minimum is for safety, not performance).
+- */
+- return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
++ return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus);
+ }
+
+ char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
+diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c
+index 1feadebb1852..51362893cf98 100644
+--- a/tools/libs/light/libxl_x86.c
++++ b/tools/libs/light/libxl_x86.c
+@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
+ libxl_defbool_val(src->b_info.arch_x86.msr_relaxed));
+ }
+
++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
++ unsigned int smp_cpus)
++{
++ /*
++ * 256 pages (1MB) per vcpu,
++ * plus 1 page per MiB of RAM for the P2M map,
++ * plus 1 page per MiB of RAM to shadow the resident processes.
++ * This is higher than the minimum that Xen would allocate if no value
++ * were given (but the Xen minimum is for safety, not performance).
++ */
++ return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
++}
++
+ /*
+ * Local variables:
+ * mode: C
+--
+2.37.3
+
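To make the new Arm default concrete, here is the same arithmetic as
libxl__arch_get_required_paging_memory() in a standalone form: 256 4KiB
pages per vCPU, one page per MiB of RAM, and 128 pages (512KiB) for
extended regions, with the result in KiB:

    #include <stdio.h>

    /* Same formula as the Arm helper added above; result is in KiB. */
    static unsigned long arm_paging_kb(unsigned long maxmem_kb,
                                       unsigned int smp_cpus)
    {
        return 4 * (256UL * smp_cpus + maxmem_kb / 1024 + 128);
    }

    int main(void)
    {
        /* A 4-vCPU guest with 4 GiB of RAM: 4*(1024+4096+128) = 20992 KiB,
         * i.e. roughly 20.5 MiB of P2M pool. */
        printf("%lu KiB\n", arm_paging_kb(4UL << 20, 4));
        return 0;
    }
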
diff --git a/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch b/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch
deleted file mode 100644
index 8416c96..0000000
--- a/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch
+++ /dev/null
@@ -1,229 +0,0 @@
-From 838f6c211f7f05f107e1acdfb0977ab61ec0bf2e Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 14:03:20 +0200
-Subject: [PATCH 13/51] IOMMU/x86: disallow device assignment to PoD guests
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-While it is okay for IOMMU page tables to be set up for guests starting
-in PoD mode, actual device assignment may only occur once all PoD
-entries have been removed from the P2M. So far this was enforced only
-for boot-time assignment, and only in the tool stack.
-
-Also use the new function to replace p2m_pod_entry_count(): Its unlocked
-access to p2m->pod.entry_count wasn't really okay (irrespective of the
-result being stale by the time the caller gets to see it). Nor was the
-use of that function in line with the immediately preceding comment: A
-PoD guest isn't just one with a non-zero entry count, but also one with
-a non-empty cache (e.g. prior to actually launching the guest).
-
-To allow the tool stack to see a consistent snapshot of PoD state, move
-the tail of XENMEM_{get,set}_pod_target handling into a function, adding
-proper locking there.
-
-In libxl take the liberty to use the new local variable r also for a
-pre-existing call into libxc.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-master commit: ad4312d764e8b40a1e45b64aac6d840a60c59f13
-master date: 2022-05-02 08:48:02 +0200
----
- xen/arch/x86/mm.c | 6 +---
- xen/arch/x86/mm/p2m-pod.c | 43 ++++++++++++++++++++++++++++-
- xen/common/vm_event.c | 2 +-
- xen/drivers/passthrough/x86/iommu.c | 3 +-
- xen/include/asm-x86/p2m.h | 21 +++++++-------
- 5 files changed, 57 insertions(+), 18 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index e222d9aa98ee..4ee2de11051d 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -4777,7 +4777,6 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
- {
- xen_pod_target_t target;
- struct domain *d;
-- struct p2m_domain *p2m;
-
- if ( copy_from_guest(&target, arg, 1) )
- return -EFAULT;
-@@ -4812,10 +4811,7 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
- }
- else if ( rc >= 0 )
- {
-- p2m = p2m_get_hostp2m(d);
-- target.tot_pages = domain_tot_pages(d);
-- target.pod_cache_pages = p2m->pod.count;
-- target.pod_entries = p2m->pod.entry_count;
-+ p2m_pod_get_mem_target(d, &target);
-
- if ( __copy_to_guest(arg, &target, 1) )
- {
-diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
-index d8d1a0ce7ed7..a3c9d8a97423 100644
---- a/xen/arch/x86/mm/p2m-pod.c
-+++ b/xen/arch/x86/mm/p2m-pod.c
-@@ -20,6 +20,7 @@
- */
-
- #include <xen/event.h>
-+#include <xen/iocap.h>
- #include <xen/ioreq.h>
- #include <xen/mm.h>
- #include <xen/sched.h>
-@@ -362,7 +363,10 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target)
-
- ASSERT( pod_target >= p2m->pod.count );
-
-- ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
-+ if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
-+ ret = -ENOTEMPTY;
-+ else
-+ ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
-
- out:
- pod_unlock(p2m);
-@@ -370,6 +374,23 @@ out:
- return ret;
- }
-
-+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
-+{
-+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
-+
-+ ASSERT(is_hvm_domain(d));
-+
-+ pod_lock(p2m);
-+ lock_page_alloc(p2m);
-+
-+ target->tot_pages = domain_tot_pages(d);
-+ target->pod_cache_pages = p2m->pod.count;
-+ target->pod_entries = p2m->pod.entry_count;
-+
-+ unlock_page_alloc(p2m);
-+ pod_unlock(p2m);
-+}
-+
- int p2m_pod_empty_cache(struct domain *d)
- {
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-@@ -1387,6 +1408,9 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
- if ( !paging_mode_translate(d) )
- return -EINVAL;
-
-+ if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
-+ return -ENOTEMPTY;
-+
- do {
- rc = mark_populate_on_demand(d, gfn, chunk_order);
-
-@@ -1408,3 +1432,20 @@ void p2m_pod_init(struct p2m_domain *p2m)
- for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
- p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
- }
-+
-+bool p2m_pod_active(const struct domain *d)
-+{
-+ struct p2m_domain *p2m;
-+ bool res;
-+
-+ if ( !is_hvm_domain(d) )
-+ return false;
-+
-+ p2m = p2m_get_hostp2m(d);
-+
-+ pod_lock(p2m);
-+ res = p2m->pod.entry_count | p2m->pod.count;
-+ pod_unlock(p2m);
-+
-+ return res;
-+}
-diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
-index 70ab3ba406ff..21d2f0edf727 100644
---- a/xen/common/vm_event.c
-+++ b/xen/common/vm_event.c
-@@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
-
- rc = -EXDEV;
- /* Disallow paging in a PoD guest */
-- if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
-+ if ( p2m_pod_active(d) )
- break;
-
- /* domain_pause() not required here, see XSA-99 */
-diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
-index a36a6bd4b249..dc9936e16930 100644
---- a/xen/drivers/passthrough/x86/iommu.c
-+++ b/xen/drivers/passthrough/x86/iommu.c
-@@ -502,11 +502,12 @@ bool arch_iommu_use_permitted(const struct domain *d)
- {
- /*
- * Prevent device assign if mem paging, mem sharing or log-dirty
-- * have been enabled for this domain.
-+ * have been enabled for this domain, or if PoD is still in active use.
- */
- return d == dom_io ||
- (likely(!mem_sharing_enabled(d)) &&
- likely(!mem_paging_enabled(d)) &&
-+ likely(!p2m_pod_active(d)) &&
- likely(!p2m_get_hostp2m(d)->global_logdirty));
- }
-
-diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
-index 357a8087481e..f2af7a746ced 100644
---- a/xen/include/asm-x86/p2m.h
-+++ b/xen/include/asm-x86/p2m.h
-@@ -661,6 +661,12 @@ int p2m_pod_empty_cache(struct domain *d);
- * domain matches target */
- int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
-
-+/* Obtain a consistent snapshot of PoD related domain state. */
-+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target);
-+
-+/* Check whether PoD is (still) active in a domain. */
-+bool p2m_pod_active(const struct domain *d);
-+
- /* Scan pod cache when offline/broken page triggered */
- int
- p2m_pod_offline_or_broken_hit(struct page_info *p);
-@@ -669,11 +675,6 @@ p2m_pod_offline_or_broken_hit(struct page_info *p);
- void
- p2m_pod_offline_or_broken_replace(struct page_info *p);
-
--static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
--{
-- return p2m->pod.entry_count;
--}
--
- void p2m_pod_init(struct p2m_domain *p2m);
-
- #else
-@@ -689,6 +690,11 @@ static inline int p2m_pod_empty_cache(struct domain *d)
- return 0;
- }
-
-+static inline bool p2m_pod_active(const struct domain *d)
-+{
-+ return false;
-+}
-+
- static inline int p2m_pod_offline_or_broken_hit(struct page_info *p)
- {
- return 0;
-@@ -699,11 +705,6 @@ static inline void p2m_pod_offline_or_broken_replace(struct page_info *p)
- ASSERT_UNREACHABLE();
- }
-
--static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
--{
-- return 0;
--}
--
- static inline void p2m_pod_init(struct p2m_domain *p2m) {}
-
- #endif
---
-2.35.1
-
diff --git a/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
new file mode 100644
index 0000000..dee9d9c
--- /dev/null
+++ b/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
@@ -0,0 +1,189 @@
+From 914fc8e8b4cc003e90d51bee0aef54687358530a Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 14:55:21 +0200
+Subject: [PATCH 13/26] xen/arm: Construct the P2M pages pool for guests
+
+This commit constructs the p2m pages pool for guests from the
+data structure and helper perspective.
+
+This is implemented by:
+
+- Adding a `struct paging_domain` which contains a freelist, a
+counter variable and a spinlock to `struct arch_domain` to
+indicate the free p2m pages and the total number of p2m pages in
+the p2m pages pool.
+
+- Adding a helper `p2m_get_allocation` to get the p2m pool size.
+
+- Adding a helper `p2m_set_allocation` to set the p2m pages pool
+size. This helper should be called before allocating memory for
+a guest.
+
+- Adding a helper `p2m_teardown_allocation` to free the p2m pages
+pool. This helper should be called during xl domain destruction.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: 55914f7fc91a468649b8a3ec3f53ae1c4aca6670
+master date: 2022-10-11 14:28:39 +0200
+---
+ xen/arch/arm/p2m.c | 88 ++++++++++++++++++++++++++++++++++++
+ xen/include/asm-arm/domain.h | 10 ++++
+ xen/include/asm-arm/p2m.h | 4 ++
+ 3 files changed, 102 insertions(+)
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index 27418ee5ee98..d8957dd8727c 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
+ return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
+ }
+
++/* Return the size of the pool, rounded up to the nearest MB */
++unsigned int p2m_get_allocation(struct domain *d)
++{
++ unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
++
++ return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
++}
++
++/*
++ * Set the pool of pages to the required number of pages.
++ * Returns 0 for success, non-zero for failure.
++ * Call with d->arch.paging.lock held.
++ */
++int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
++{
++ struct page_info *pg;
++
++ ASSERT(spin_is_locked(&d->arch.paging.lock));
++
++ for ( ; ; )
++ {
++ if ( d->arch.paging.p2m_total_pages < pages )
++ {
++ /* Need to allocate more memory from domheap */
++ pg = alloc_domheap_page(NULL, 0);
++ if ( pg == NULL )
++ {
++ printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
++ return -ENOMEM;
++ }
++ ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
++ d->arch.paging.p2m_total_pages + 1;
++ page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
++ }
++ else if ( d->arch.paging.p2m_total_pages > pages )
++ {
++ /* Need to return memory to domheap */
++ pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
++ if( pg )
++ {
++ ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
++ d->arch.paging.p2m_total_pages - 1;
++ free_domheap_page(pg);
++ }
++ else
++ {
++ printk(XENLOG_ERR
++ "Failed to free P2M pages, P2M freelist is empty.\n");
++ return -ENOMEM;
++ }
++ }
++ else
++ break;
++
++ /* Check to see if we need to yield and try again */
++ if ( preempted && general_preempt_check() )
++ {
++ *preempted = true;
++ return -ERESTART;
++ }
++ }
++
++ return 0;
++}
++
++int p2m_teardown_allocation(struct domain *d)
++{
++ int ret = 0;
++ bool preempted = false;
++
++ spin_lock(&d->arch.paging.lock);
++ if ( d->arch.paging.p2m_total_pages != 0 )
++ {
++ ret = p2m_set_allocation(d, 0, &preempted);
++ if ( preempted )
++ {
++ spin_unlock(&d->arch.paging.lock);
++ return -ERESTART;
++ }
++ ASSERT(d->arch.paging.p2m_total_pages == 0);
++ }
++ spin_unlock(&d->arch.paging.lock);
++
++ return ret;
++}
++
+ /* Unlock the flush and do a P2M TLB flush if necessary */
+ void p2m_write_unlock(struct p2m_domain *p2m)
+ {
+@@ -1599,7 +1685,9 @@ int p2m_init(struct domain *d)
+ unsigned int cpu;
+
+ rwlock_init(&p2m->lock);
++ spin_lock_init(&d->arch.paging.lock);
+ INIT_PAGE_LIST_HEAD(&p2m->pages);
++ INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
+
+ p2m->vmid = INVALID_VMID;
+
+diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
+index 7f8ddd3f5c3b..2f31795ab96d 100644
+--- a/xen/include/asm-arm/domain.h
++++ b/xen/include/asm-arm/domain.h
+@@ -40,6 +40,14 @@ struct vtimer {
+ uint64_t cval;
+ };
+
++struct paging_domain {
++ spinlock_t lock;
++ /* Free P2M pages from the pre-allocated P2M pool */
++ struct page_list_head p2m_freelist;
++ /* Number of pages from the pre-allocated P2M pool */
++ unsigned long p2m_total_pages;
++};
++
+ struct arch_domain
+ {
+ #ifdef CONFIG_ARM_64
+@@ -51,6 +59,8 @@ struct arch_domain
+
+ struct hvm_domain hvm;
+
++ struct paging_domain paging;
++
+ struct vmmio vmmio;
+
+ /* Continuable domain_relinquish_resources(). */
+diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
+index b3ba83283e11..c9598740bd02 100644
+--- a/xen/include/asm-arm/p2m.h
++++ b/xen/include/asm-arm/p2m.h
+@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n);
+ /* Print debugging/statistial info about a domain's p2m */
+ void p2m_dump_info(struct domain *d);
+
++unsigned int p2m_get_allocation(struct domain *d);
++int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
++int p2m_teardown_allocation(struct domain *d);
++
+ static inline void p2m_write_lock(struct p2m_domain *p2m)
+ {
+ write_lock(&p2m->lock);
+--
+2.37.3
+
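The heart of the new p2m_set_allocation() is a grow-or-shrink loop that
walks total_pages toward the requested target one page at a time. A
simplified standalone C version (toy types; the real code additionally
checks for preemption each iteration and must run with
d->arch.paging.lock held):

    #include <stdlib.h>

    struct page { struct page *next; };
    struct pool { struct page *freelist; unsigned long total_pages; };

    /* Grow or shrink the pre-allocated freelist until it holds 'pages'
     * pages; returns 0 on success, -1 if allocation fails (-ENOMEM in
     * the real code).  Here total_pages simply counts freelist entries. */
    static int pool_resize(struct pool *p, unsigned long pages)
    {
        while (p->total_pages != pages) {
            if (p->total_pages < pages) {
                struct page *pg = calloc(1, sizeof(*pg));

                if (!pg)
                    return -1;
                pg->next = p->freelist;
                p->freelist = pg;
                p->total_pages++;
            } else {
                struct page *pg = p->freelist;

                p->freelist = pg->next;
                free(pg);
                p->total_pages--;
            }
        }
        return 0;
    }
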
diff --git a/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch b/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch
deleted file mode 100644
index 69049f1..0000000
--- a/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch
+++ /dev/null
@@ -1,121 +0,0 @@
-From 9ebe2ba83644ec6cd33a93c68dab5f551adcbea0 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Tue, 7 Jun 2022 14:04:16 +0200
-Subject: [PATCH 14/51] x86/msr: handle reads to MSR_P5_MC_{ADDR,TYPE}
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Windows Server 2019 Essentials will unconditionally attempt to read
-P5_MC_ADDR MSR at boot and throw a BSOD if injected a #GP.
-
-Fix this by mapping MSR_P5_MC_{ADDR,TYPE} to
-MSR_IA32_MCi_{ADDR,STATUS}, as reported also done by hardware in Intel
-SDM "Mapping of the Pentium Processor Machine-Check Errors to the
-Machine-Check Architecture" section.
-
-Reported-by: Steffen Einsle <einsle@phptrix.de>
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: ce59e472b581e4923f6892172dde62b88c8aa8b7
-master date: 2022-05-02 08:49:12 +0200
----
- xen/arch/x86/cpu/mcheck/mce.h | 6 ++++++
- xen/arch/x86/cpu/mcheck/mce_intel.c | 19 +++++++++++++++++++
- xen/arch/x86/cpu/mcheck/vmce.c | 2 ++
- xen/arch/x86/msr.c | 2 ++
- xen/include/asm-x86/msr-index.h | 3 +++
- 5 files changed, 32 insertions(+)
-
-diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
-index 195362691904..192315ecfa3d 100644
---- a/xen/arch/x86/cpu/mcheck/mce.h
-+++ b/xen/arch/x86/cpu/mcheck/mce.h
-@@ -169,6 +169,12 @@ static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
- if (msr >= MSR_IA32_MC0_CTL2 &&
- msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) )
- return 1;
-+ fallthrough;
-+
-+ case X86_VENDOR_CENTAUR:
-+ case X86_VENDOR_SHANGHAI:
-+ if (msr == MSR_P5_MC_ADDR || msr == MSR_P5_MC_TYPE)
-+ return 1;
- break;
-
- case X86_VENDOR_AMD:
-diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
-index bb9f3a3ff795..d364e9bf5ad1 100644
---- a/xen/arch/x86/cpu/mcheck/mce_intel.c
-+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
-@@ -1001,8 +1001,27 @@ int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
-
- int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
- {
-+ const struct cpuid_policy *cp = v->domain->arch.cpuid;
- unsigned int bank = msr - MSR_IA32_MC0_CTL2;
-
-+ switch ( msr )
-+ {
-+ case MSR_P5_MC_ADDR:
-+ /*
-+ * Bank 0 is used for the 'bank 0 quirk' on older processors.
-+ * See vcpu_fill_mc_msrs() for reference.
-+ */
-+ *val = v->arch.vmce.bank[1].mci_addr;
-+ return 1;
-+
-+ case MSR_P5_MC_TYPE:
-+ *val = v->arch.vmce.bank[1].mci_status;
-+ return 1;
-+ }
-+
-+ if ( !(cp->x86_vendor & X86_VENDOR_INTEL) )
-+ return 0;
-+
- if ( bank < GUEST_MC_BANK_NUM )
- {
- *val = v->arch.vmce.bank[bank].mci_ctl2;
-diff --git a/xen/arch/x86/cpu/mcheck/vmce.c b/xen/arch/x86/cpu/mcheck/vmce.c
-index eb6434a3ba20..0899df58bcbf 100644
---- a/xen/arch/x86/cpu/mcheck/vmce.c
-+++ b/xen/arch/x86/cpu/mcheck/vmce.c
-@@ -150,6 +150,8 @@ static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
- default:
- switch ( boot_cpu_data.x86_vendor )
- {
-+ case X86_VENDOR_CENTAUR:
-+ case X86_VENDOR_SHANGHAI:
- case X86_VENDOR_INTEL:
- ret = vmce_intel_rdmsr(v, msr, val);
- break;
-diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
-index aaedb2c31287..da305c7aa4c9 100644
---- a/xen/arch/x86/msr.c
-+++ b/xen/arch/x86/msr.c
-@@ -282,6 +282,8 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
- *val = msrs->misc_features_enables.raw;
- break;
-
-+ case MSR_P5_MC_ADDR:
-+ case MSR_P5_MC_TYPE:
- case MSR_IA32_MCG_CAP ... MSR_IA32_MCG_CTL: /* 0x179 -> 0x17b */
- case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
- case MSR_IA32_MCx_CTL(0) ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
-diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
-index 3e038db618ff..31964b88af7a 100644
---- a/xen/include/asm-x86/msr-index.h
-+++ b/xen/include/asm-x86/msr-index.h
-@@ -15,6 +15,9 @@
- * abbreviated name. Exceptions will be considered on a case-by-case basis.
- */
-
-+#define MSR_P5_MC_ADDR 0
-+#define MSR_P5_MC_TYPE 0x00000001
-+
- #define MSR_APIC_BASE 0x0000001b
- #define APIC_BASE_BSP (_AC(1, ULL) << 8)
- #define APIC_BASE_EXTD (_AC(1, ULL) << 10)
---
-2.35.1
-
diff --git a/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
new file mode 100644
index 0000000..fe24269
--- /dev/null
+++ b/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
@@ -0,0 +1,108 @@
+From 3a16da801e14b8ff996b6f7408391ce488abd925 Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 14:55:40 +0200
+Subject: [PATCH 14/26] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm
+
+This commit implements the `XEN_DOMCTL_shadow_op` support in Xen
+for Arm. The p2m pages pool size for xl guests is supposed to be
+determined by `XEN_DOMCTL_shadow_op`. Hence, this commit:
+
+- Introduces a function `p2m_domctl` and implements the subops
+`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
+`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.
+
+- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.
+
+This enables setting the shadow memory pool size when creating a
+guest from xl, and getting the shadow memory pool size from Xen.
+
+Note that the `XEN_DOMCTL_shadow_op` added in this commit is only
+a dummy op, and the functionality of setting/getting p2m memory pool
+size for xl guests will be added in following commits.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: cf2a68d2ffbc3ce95e01449d46180bddb10d24a0
+master date: 2022-10-11 14:28:42 +0200
+---
+ tools/libs/light/libxl_arm.c | 12 ++++++++++++
+ xen/arch/arm/domctl.c | 32 ++++++++++++++++++++++++++++++++
+ 2 files changed, 44 insertions(+)
+
+diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
+index 73a95e83af24..22a0c561bbc6 100644
+--- a/tools/libs/light/libxl_arm.c
++++ b/tools/libs/light/libxl_arm.c
+@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc,
+ libxl__domain_build_state *state,
+ uint32_t domid)
+ {
++ libxl_ctx *ctx = libxl__gc_owner(gc);
++ unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
++
++ int r = xc_shadow_control(ctx->xch, domid,
++ XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
++ &shadow_mb, 0);
++ if (r) {
++ LOGED(ERROR, domid,
++ "Failed to set %u MiB shadow allocation", shadow_mb);
++ return ERROR_FAIL;
++ }
++
+ return 0;
+ }
+
+diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
+index 1baf25c3d98b..9bf72e693019 100644
+--- a/xen/arch/arm/domctl.c
++++ b/xen/arch/arm/domctl.c
+@@ -47,11 +47,43 @@ static int handle_vuart_init(struct domain *d,
+ return rc;
+ }
+
++static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
++{
++ if ( unlikely(d == current->domain) )
++ {
++ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
++ return -EINVAL;
++ }
++
++ if ( unlikely(d->is_dying) )
++ {
++ printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
++ d->domain_id);
++ return -EINVAL;
++ }
++
++ switch ( sc->op )
++ {
++ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
++ return 0;
++ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
++ return 0;
++ default:
++ {
++ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
++ return -EINVAL;
++ }
++ }
++}
++
+ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ {
+ switch ( domctl->cmd )
+ {
++ case XEN_DOMCTL_shadow_op:
++ return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
+ case XEN_DOMCTL_cacheflush:
+ {
+ gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
+--
+2.37.3
+
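The new p2m_domctl() is mostly about validating the target before
dispatching on the subop. A compact C sketch of the same guard-then-switch
shape, with hypothetical subop names in place of the XEN_DOMCTL_SHADOW_OP_*
constants:

    #include <stdbool.h>
    #include <errno.h>

    struct domain { bool is_dying; };

    enum subop { OP_SET_ALLOCATION, OP_GET_ALLOCATION };

    /* Refuse operations on the calling domain and on dying domains,
     * then dispatch; both subops are still no-ops, as in the commit. */
    static int p2m_domctl_sketch(const struct domain *curr,
                                 const struct domain *d, enum subop op)
    {
        if (d == curr || d->is_dying)
            return -EINVAL;

        switch (op) {
        case OP_SET_ALLOCATION:
        case OP_GET_ALLOCATION:
            return 0;
        default:
            return -EINVAL;
        }
    }
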
diff --git a/0015-kconfig-detect-LD-implementation.patch b/0015-kconfig-detect-LD-implementation.patch
deleted file mode 100644
index 4507bc7..0000000
--- a/0015-kconfig-detect-LD-implementation.patch
+++ /dev/null
@@ -1,46 +0,0 @@
-From 3754bd128d1a6b3d5864d1a3ee5d27b67d35387a Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Tue, 7 Jun 2022 14:05:06 +0200
-Subject: [PATCH 15/51] kconfig: detect LD implementation
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Detect GNU and LLVM ld implementations. This is required for further
-patches that will introduce diverging behaviour depending on the
-linker implementation in use.
-
-Note that LLVM ld returns "compatible with GNU linkers" as part of the
-version string, so be on the safe side and use '^' to only match at
-the start of the line in case LLVM ever decides to change the text to
-use "compatible with GNU ld" instead.
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Michal Orzel <michal.orzel@arm.com>
-Acked-by: Julien Grall <jgrall@amazon.com>
-master commit: c70c4b624f85f7d4e28c70a804a0a3f20d73092b
-master date: 2022-05-02 08:50:39 +0200
----
- xen/Kconfig | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/xen/Kconfig b/xen/Kconfig
-index bcbd2758e5d3..0c89afd50fcf 100644
---- a/xen/Kconfig
-+++ b/xen/Kconfig
-@@ -23,6 +23,12 @@ config CLANG_VERSION
- int
- default $(shell,$(BASEDIR)/scripts/clang-version.sh $(CC))
-
-+config LD_IS_GNU
-+ def_bool $(success,$(LD) --version | head -n 1 | grep -q "^GNU ld")
-+
-+config LD_IS_LLVM
-+ def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD")
-+
- # -fvisibility=hidden reduces -fpic cost, if it's available
- config CC_HAS_VISIBILITY_ATTRIBUTE
- def_bool $(cc-option,-fvisibility=hidden)
---
-2.35.1
-
diff --git a/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
new file mode 100644
index 0000000..704543a
--- /dev/null
+++ b/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
@@ -0,0 +1,289 @@
+From 44e9dcc48b81bca202a5b31926125a6a59a4c72e Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 14:55:53 +0200
+Subject: [PATCH 15/26] xen/arm: Allocate and free P2M pages from the P2M pool
+
+This commit sets up and tears down the p2m pages pool for non-privileged
+Arm guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`.
+
+- For dom0, P2M pages should come directly from the heap instead of the
+p2m pool, so that the kernel may take advantage of the extended regions.
+
+- For xl guests, the p2m pool is set up in `XEN_DOMCTL_shadow_op` and
+destroyed in `domain_relinquish_resources`. Note that
+domctl->u.shadow_op.mb is updated with the new size when setting the
+p2m pool.
+
+- For dom0less domUs, the p2m pool is set up before allocating memory
+during domain creation. Users can specify the p2m pool size via the
+`xen,domain-p2m-mem-mb` dts property.
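+
+As a rough illustration of the documented default sizing (1MB per guest
+vCPU, 4KB per MB of guest RAM, 512KB for extended regions), assuming
+4KB pages; the helper name below is hypothetical, the real computation
+lives in domain_p2m_pages() further down:
+
+    static unsigned long default_p2m_kb(unsigned long maxmem_kb,
+                                        unsigned int vcpus)
+    {
+        /* 1024KB per vCPU + 4KB per MB of RAM + 512KB extra */
+        return 1024UL * vcpus + 4 * (maxmem_kb / 1024) + 512;
+    }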
+
+To actually allocate/free pages from the p2m pool, this commit adds
+two helper functions, `p2m_alloc_page` and `p2m_free_page`, to
+`struct p2m_domain`. By replacing `alloc_domheap_page` and
+`free_domheap_page` with these two helpers, p2m pages are added to and
+removed from the p2m pool's list rather than the heap.
+
+Since pages from `p2m_alloc_page` are cleaned, take the opportunity
+to remove the redundant `clean_page` in `p2m_create_table`.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: cbea5a1149ca7fd4b7cdbfa3ec2e4f109b601ff7
+master date: 2022-10-11 14:28:44 +0200
+---
+ docs/misc/arm/device-tree/booting.txt | 8 ++++
+ xen/arch/arm/domain.c | 6 +++
+ xen/arch/arm/domain_build.c | 29 ++++++++++++++
+ xen/arch/arm/domctl.c | 23 ++++++++++-
+ xen/arch/arm/p2m.c | 57 +++++++++++++++++++++++++--
+ 5 files changed, 118 insertions(+), 5 deletions(-)
+
+diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
+index 71895663a4de..d92ccc56ffe0 100644
+--- a/docs/misc/arm/device-tree/booting.txt
++++ b/docs/misc/arm/device-tree/booting.txt
+@@ -182,6 +182,14 @@ with the following properties:
+ Both #address-cells and #size-cells need to be specified because
+ both sub-nodes (described shortly) have reg properties.
+
++- xen,domain-p2m-mem-mb
++
++ Optional. A 32-bit integer specifying the number of megabytes of RAM
++ used for the domain P2M pool. This is in-sync with the shadow_memory
++ option in xl.cfg. Leaving this field empty in device tree will lead to
++ the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB
++ per MB of guest RAM plus 512KB for guest extended regions.
++
+ Under the "xen,domain" compatible node, one or more sub-nodes are present
+ for the DomU kernel and ramdisk.
+
+diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
+index 2694c39127c5..a818f33a1afa 100644
+--- a/xen/arch/arm/domain.c
++++ b/xen/arch/arm/domain.c
+@@ -997,6 +997,7 @@ enum {
+ PROG_page,
+ PROG_mapping,
+ PROG_p2m,
++ PROG_p2m_pool,
+ PROG_done,
+ };
+
+@@ -1062,6 +1063,11 @@ int domain_relinquish_resources(struct domain *d)
+ if ( ret )
+ return ret;
+
++ PROGRESS(p2m_pool):
++ ret = p2m_teardown_allocation(d);
++ if( ret )
++ return ret;
++
+ PROGRESS(done):
+ break;
+
+diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
+index d02bacbcd1ed..8aec3755ca5d 100644
+--- a/xen/arch/arm/domain_build.c
++++ b/xen/arch/arm/domain_build.c
+@@ -2833,6 +2833,21 @@ static void __init find_gnttab_region(struct domain *d,
+ kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
+ }
+
++static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
++ unsigned int smp_cpus)
++{
++ /*
++ * Keep in sync with libxl__get_required_paging_memory().
++ * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
++ * plus 128 pages to cover extended regions.
++ */
++ unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
++
++ BUILD_BUG_ON(PAGE_SIZE != SZ_4K);
++
++ return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
++}
++
+ static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
+ {
+ unsigned int i;
+@@ -2924,6 +2939,8 @@ static int __init construct_domU(struct domain *d,
+ struct kernel_info kinfo = {};
+ int rc;
+ u64 mem;
++ u32 p2m_mem_mb;
++ unsigned long p2m_pages;
+
+ rc = dt_property_read_u64(node, "memory", &mem);
+ if ( !rc )
+@@ -2933,6 +2950,18 @@ static int __init construct_domU(struct domain *d,
+ }
+ kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
+
++ rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
++ /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
++ p2m_pages = rc ?
++ p2m_mem_mb << (20 - PAGE_SHIFT) :
++ domain_p2m_pages(mem, d->max_vcpus);
++
++ spin_lock(&d->arch.paging.lock);
++ rc = p2m_set_allocation(d, p2m_pages, NULL);
++ spin_unlock(&d->arch.paging.lock);
++ if ( rc != 0 )
++ return rc;
++
+ printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
+
+ kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
+diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
+index 9bf72e693019..c8fdeb124084 100644
+--- a/xen/arch/arm/domctl.c
++++ b/xen/arch/arm/domctl.c
+@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d,
+ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ {
++ long rc;
++ bool preempted = false;
++
+ if ( unlikely(d == current->domain) )
+ {
+ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
+@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+ switch ( sc->op )
+ {
+ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+- return 0;
++ {
++ /* Allow and handle preemption */
++ spin_lock(&d->arch.paging.lock);
++ rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
++ spin_unlock(&d->arch.paging.lock);
++
++ if ( preempted )
++ /* Not finished. Set up to re-run the call. */
++ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
++ u_domctl);
++ else
++ /* Finished. Return the new allocation. */
++ sc->mb = p2m_get_allocation(d);
++
++ return rc;
++ }
+ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
++ {
++ sc->mb = p2m_get_allocation(d);
+ return 0;
++ }
+ default:
+ {
+ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index d8957dd8727c..b2d856a801af 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
+ return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
+ }
+
++static struct page_info *p2m_alloc_page(struct domain *d)
++{
++ struct page_info *pg;
++
++ spin_lock(&d->arch.paging.lock);
++ /*
++ * For hardware domain, there should be no limit in the number of pages that
++ * can be allocated, so that the kernel may take advantage of the extended
++ * regions. Hence, allocate p2m pages for hardware domains from heap.
++ */
++ if ( is_hardware_domain(d) )
++ {
++ pg = alloc_domheap_page(NULL, 0);
++ if ( pg == NULL )
++ {
++ printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
++ spin_unlock(&d->arch.paging.lock);
++ return NULL;
++ }
++ }
++ else
++ {
++ pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
++ if ( unlikely(!pg) )
++ {
++ spin_unlock(&d->arch.paging.lock);
++ return NULL;
++ }
++ d->arch.paging.p2m_total_pages--;
++ }
++ spin_unlock(&d->arch.paging.lock);
++
++ return pg;
++}
++
++static void p2m_free_page(struct domain *d, struct page_info *pg)
++{
++ spin_lock(&d->arch.paging.lock);
++ if ( is_hardware_domain(d) )
++ free_domheap_page(pg);
++ else
++ {
++ d->arch.paging.p2m_total_pages++;
++ page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
++ }
++ spin_unlock(&d->arch.paging.lock);
++}
++
+ /* Return the size of the pool, rounded up to the nearest MB */
+ unsigned int p2m_get_allocation(struct domain *d)
+ {
+@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
+
+ ASSERT(!p2m_is_valid(*entry));
+
+- page = alloc_domheap_page(NULL, 0);
++ page = p2m_alloc_page(p2m->domain);
+ if ( page == NULL )
+ return -ENOMEM;
+
+@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
+ pg = mfn_to_page(mfn);
+
+ page_list_del(pg, &p2m->pages);
+- free_domheap_page(pg);
++ p2m_free_page(p2m->domain, pg);
+ }
+
+ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
+@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
+ ASSERT(level < target);
+ ASSERT(p2m_is_superpage(*entry, level));
+
+- page = alloc_domheap_page(NULL, 0);
++ page = p2m_alloc_page(p2m->domain);
+ if ( !page )
+ return false;
+
+@@ -1641,7 +1689,7 @@ int p2m_teardown(struct domain *d)
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+ {
+- free_domheap_page(pg);
++ p2m_free_page(p2m->domain, pg);
+ count++;
+ /* Arbitrarily preempt every 512 iterations */
+ if ( !(count % 512) && hypercall_preempt_check() )
+@@ -1665,6 +1713,7 @@ void p2m_final_teardown(struct domain *d)
+ return;
+
+ ASSERT(page_list_empty(&p2m->pages));
++ ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
+
+ if ( p2m->root )
+ free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
+--
+2.37.3
+
diff --git a/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch b/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch
new file mode 100644
index 0000000..6283d47
--- /dev/null
+++ b/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch
@@ -0,0 +1,66 @@
+From 32cb81501c8b858fe9a451650804ec3024a8b364 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 14:56:29 +0200
+Subject: [PATCH 16/26] gnttab: correct locking on transitive grant copy error
+ path
+
+While the comment next to the lock dropping in preparation of
+recursively calling acquire_grant_for_copy() mistakenly talks about the
+rd == td case (excluded a few lines further up), the same concerns apply
+to the calling of release_grant_for_copy() on a subsequent error path.
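+
+The shape of the fix, as a sketch with hypothetical lock helpers (the
+real code uses active_entry_release()/grant_read_unlock() and their
+grant_read_lock()/active_entry_acquire() counterparts):
+
+    unlock_entry(act);           /* drop the inner lock first        */
+    read_unlock(rgt);            /* then the grant table lock        */
+
+    remote_release(td, ref, ro); /* may take the same locks itself   */
+
+    read_lock(rgt);              /* re-acquire in the original order */
+    act = lock_entry(rgt, gref);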
+
+This is CVE-2022-33748 / XSA-411.
+
+Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+master commit: 6e3aab858eef614a21a782a3b73acc88e74690ea
+master date: 2022-10-11 14:29:30 +0200
+---
+ xen/common/grant_table.c | 19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
+
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 4c742cd8fe81..d8ca645b96ff 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2613,9 +2613,8 @@ acquire_grant_for_copy(
+ trans_domid);
+
+ /*
+- * acquire_grant_for_copy() could take the lock on the
+- * remote table (if rd == td), so we have to drop the lock
+- * here and reacquire.
++ * acquire_grant_for_copy() will take the lock on the remote table,
++ * so we have to drop the lock here and reacquire.
+ */
+ active_entry_release(act);
+ grant_read_unlock(rgt);
+@@ -2652,11 +2651,25 @@ acquire_grant_for_copy(
+ act->trans_gref != trans_gref ||
+ !act->is_sub_page)) )
+ {
++ /*
++ * Like above for acquire_grant_for_copy() we need to drop and then
++ * re-acquire the locks here to prevent lock order inversion issues.
++ * Unlike for acquire_grant_for_copy() we don't need to re-check
++ * anything, as release_grant_for_copy() doesn't depend on the grant
++ * table entry: It only updates internal state and the status flags.
++ */
++ active_entry_release(act);
++ grant_read_unlock(rgt);
++
+ release_grant_for_copy(td, trans_gref, readonly);
+ rcu_unlock_domain(td);
++
++ grant_read_lock(rgt);
++ act = active_entry_acquire(rgt, gref);
+ reduce_status_for_pin(rd, act, status, readonly);
+ active_entry_release(act);
+ grant_read_unlock(rgt);
++
+ put_page(*page);
+ *page = NULL;
+ return ERESTART;
+--
+2.37.3
+
diff --git a/0016-linker-lld-do-not-generate-quoted-section-names.patch b/0016-linker-lld-do-not-generate-quoted-section-names.patch
deleted file mode 100644
index 5b3a8cd..0000000
--- a/0016-linker-lld-do-not-generate-quoted-section-names.patch
+++ /dev/null
@@ -1,54 +0,0 @@
-From 88b653f73928117461dc250acd1e830a47a14c2b Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Tue, 7 Jun 2022 14:05:24 +0200
-Subject: [PATCH 16/51] linker/lld: do not generate quoted section names
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-LLVM LD doesn't strip the quotes from the section names, and so the
-resulting binary ends up with section names like:
-
- [ 1] ".text" PROGBITS ffff82d040200000 00008000
- 000000000018cbc1 0000000000000000 AX 0 0 4096
-
-This confuses some tools (like gdb) and prevents proper parsing of the
-binary.
-
-The issue has already been reported and is being fixed in LLD. In
-order to workaround this issue and keep the GNU ld support define
-different DECL_SECTION macros depending on the used ld
-implementation.
-
-Drop the quotes from the definitions of the debug sections in
-DECL_DEBUG{2}, as those quotes are not required for GNU ld either.
-
-Fixes: 6254920587c3 ('x86: quote section names when defining them in linker script')
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 702c9a800eb3ecd4b8595998d37a769d470c5bb0
-master date: 2022-05-02 08:51:45 +0200
----
- xen/arch/x86/xen.lds.S | 6 +++++-
- 1 file changed, 5 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
-index 4c58f3209c3d..bc9b9651b192 100644
---- a/xen/arch/x86/xen.lds.S
-+++ b/xen/arch/x86/xen.lds.S
-@@ -18,7 +18,11 @@ ENTRY(efi_start)
- #else /* !EFI */
-
- #define FORMAT "elf64-x86-64"
--#define DECL_SECTION(x) #x : AT(ADDR(#x) - __XEN_VIRT_START)
-+#ifdef CONFIG_LD_IS_GNU
-+# define DECL_SECTION(x) x : AT(ADDR(#x) - __XEN_VIRT_START)
-+#else
-+# define DECL_SECTION(x) x : AT(ADDR(x) - __XEN_VIRT_START)
-+#endif
-
- ENTRY(start_pa)
-
---
-2.35.1
-
diff --git a/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch b/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch
new file mode 100644
index 0000000..ffbc311
--- /dev/null
+++ b/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch
@@ -0,0 +1,112 @@
+From e85e2a3c17b6cd38de041cdaf14d9efdcdabad1a Mon Sep 17 00:00:00 2001
+From: Anthony PERARD <anthony.perard@citrix.com>
+Date: Tue, 11 Oct 2022 14:59:10 +0200
+Subject: [PATCH 17/26] tools/libxl: Replace deprecated -soundhw on QEMU
+ command line
+
+-soundhw is deprecated since 825ff02911c9 ("audio: add soundhw
+deprecation notice"), QEMU v5.1, and has been removed for the upcoming
+v7.1 by 039a68373c45 ("introduce -audio as a replacement for -soundhw").
+
+Instead we can just add the sound card with "-device", for most options
+that "-soundhw" could handle. "-device" is an option that existed
+before QEMU 1.0, and could already be used to add audio hardware.
+
+The list of possible options for libxl's "soundhw" is taken from
+QEMU 7.0.
+
+The options for "soundhw" are listed in order of preference in the
+manual. The first three (hda, ac97, es1370) are PCI devices and easy to
+test on Linux, while the last four are ISA devices which don't seem to
+work out of the box on Linux.
+
+The sound card 'pcspk' isn't listed even though it used to be accepted
+by '-soundhw', because QEMU crashes when trying to add it to a Xen
+domain. Also, it wouldn't work with "-device"; it might need to be
+"-machine pcspk-audiodev=default" instead.
+
+Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
+Reviewed-by: Jason Andryuk <jandryuk@gmail.com>
+master commit: 62ca138c2c052187783aca3957d3f47c4dcfd683
+master date: 2022-08-18 09:25:50 +0200
+---
+ docs/man/xl.cfg.5.pod.in | 6 +++---
+ tools/libs/light/libxl_dm.c | 19 ++++++++++++++++++-
+ tools/libs/light/libxl_types_internal.idl | 10 ++++++++++
+ 3 files changed, 31 insertions(+), 4 deletions(-)
+
+diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
+index eda1e77ebd06..ab7541f22c3e 100644
+--- a/docs/man/xl.cfg.5.pod.in
++++ b/docs/man/xl.cfg.5.pod.in
+@@ -2545,9 +2545,9 @@ The form serial=DEVICE is also accepted for backwards compatibility.
+
+ =item B<soundhw="DEVICE">
+
+-Select the virtual sound card to expose to the guest. The valid
+-devices are defined by the device model configuration, please see the
+-B<qemu(1)> manpage for details. The default is not to export any sound
++Select the virtual sound card to expose to the guest. The valid devices are
++B<hda>, B<ac97>, B<es1370>, B<adlib>, B<cs4231a>, B<gus>, B<sb16> if they are
++available with the QEMU device model. The default is not to export any sound
+ device.
+
+ =item B<vkb_device=BOOLEAN>
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index 04bf5d85632e..fc264a3a13a6 100644
+--- a/tools/libs/light/libxl_dm.c
++++ b/tools/libs/light/libxl_dm.c
+@@ -1204,6 +1204,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+ uint64_t ram_size;
+ const char *path, *chardev;
+ bool is_stubdom = libxl_defbool_val(b_info->device_model_stubdomain);
++ int rc;
+
+ dm_args = flexarray_make(gc, 16, 1);
+ dm_envs = flexarray_make(gc, 16, 1);
+@@ -1531,7 +1532,23 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+ }
+ }
+ if (b_info->u.hvm.soundhw) {
+- flexarray_vappend(dm_args, "-soundhw", b_info->u.hvm.soundhw, NULL);
++ libxl__qemu_soundhw soundhw;
++
++ rc = libxl__qemu_soundhw_from_string(b_info->u.hvm.soundhw, &soundhw);
++ if (rc) {
++ LOGD(ERROR, guest_domid, "Unknown soundhw option '%s'", b_info->u.hvm.soundhw);
++ return ERROR_INVAL;
++ }
++
++ switch (soundhw) {
++ case LIBXL__QEMU_SOUNDHW_HDA:
++ flexarray_vappend(dm_args, "-device", "intel-hda",
++ "-device", "hda-duplex", NULL);
++ break;
++ default:
++ flexarray_append_pair(dm_args, "-device",
++ (char*)libxl__qemu_soundhw_to_string(soundhw));
++ }
+ }
+ if (!libxl__acpi_defbool_val(b_info)) {
+ flexarray_append(dm_args, "-no-acpi");
+diff --git a/tools/libs/light/libxl_types_internal.idl b/tools/libs/light/libxl_types_internal.idl
+index 3593e21dbb64..caa08d3229cd 100644
+--- a/tools/libs/light/libxl_types_internal.idl
++++ b/tools/libs/light/libxl_types_internal.idl
+@@ -55,3 +55,13 @@ libxl__device_action = Enumeration("device_action", [
+ (1, "ADD"),
+ (2, "REMOVE"),
+ ])
++
++libxl__qemu_soundhw = Enumeration("qemu_soundhw", [
++ (1, "ac97"),
++ (2, "adlib"),
++ (3, "cs4231a"),
++ (4, "es1370"),
++ (5, "gus"),
++ (6, "hda"),
++ (7, "sb16"),
++ ])
+--
+2.37.3
+
diff --git a/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch b/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch
deleted file mode 100644
index bc48a84..0000000
--- a/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch
+++ /dev/null
@@ -1,142 +0,0 @@
-From 982a314bd3000a16c3128afadb36a8ff41029adc Mon Sep 17 00:00:00 2001
-From: Julien Grall <jgrall@amazon.com>
-Date: Tue, 7 Jun 2022 14:06:11 +0200
-Subject: [PATCH 17/51] xen: io: Fix race between sending an I/O and domain
- shutdown
-
-Xen provides hypercalls to shutdown (SCHEDOP_shutdown{,_code}) and
-resume a domain (XEN_DOMCTL_resumedomain). They can be used for checkpoint
-where the expectation is the domain should continue as nothing happened
-afterwards.
-
-hvmemul_do_io() and handle_pio() will act differently if the return
-code of hvm_send_ioreq() (resp. hvmemul_do_pio_buffer()) is X86EMUL_RETRY.
-
-In this case, the I/O state will be reset to STATE_IOREQ_NONE (i.e
-no I/O is pending) and/or the PC will not be advanced.
-
-If the shutdown request happens right after the I/O was sent to the
-IOREQ, then emulation code will end up to re-execute the instruction
-and therefore forward again the same I/O (at least when reading IO port).
-
-This would be problem if the access has a side-effect. A dumb example,
-is a device implementing a counter which is incremented by one for every
-access. When running shutdown/resume in a loop, the value read by the
-OS may not be the old value + 1.
-
-Add an extra boolean in the structure hvm_vcpu_io to indicate whether
-the I/O was suspended. This is then used in place of checking the domain
-is shutting down in hvmemul_do_io() and handle_pio() as they should
-act on suspend (i.e. vcpu_start_shutdown_deferral() returns false) rather
-than shutdown.
-
-Signed-off-by: Julien Grall <jgrall@amazon.com>
-Reviewed-by: Paul Durrant <paul@xen.org>
-master commit: b7e0d8978810b534725e94a321736496928f00a5
-master date: 2022-05-06 17:16:22 +0100
----
- xen/arch/arm/ioreq.c | 3 ++-
- xen/arch/x86/hvm/emulate.c | 3 ++-
- xen/arch/x86/hvm/io.c | 7 ++++---
- xen/common/ioreq.c | 4 ++++
- xen/include/xen/sched.h | 5 +++++
- 5 files changed, 17 insertions(+), 5 deletions(-)
-
-diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
-index 308650b40051..fbccef212bf1 100644
---- a/xen/arch/arm/ioreq.c
-+++ b/xen/arch/arm/ioreq.c
-@@ -80,9 +80,10 @@ enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
- return IO_ABORT;
-
- vio->req = p;
-+ vio->suspended = false;
-
- rc = ioreq_send(s, &p, 0);
-- if ( rc != IO_RETRY || v->domain->is_shutting_down )
-+ if ( rc != IO_RETRY || vio->suspended )
- vio->req.state = STATE_IOREQ_NONE;
- else if ( !ioreq_needs_completion(&vio->req) )
- rc = IO_HANDLED;
-diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
-index 76a2ccfafe23..7da348b5d486 100644
---- a/xen/arch/x86/hvm/emulate.c
-+++ b/xen/arch/x86/hvm/emulate.c
-@@ -239,6 +239,7 @@ static int hvmemul_do_io(
- ASSERT(p.count);
-
- vio->req = p;
-+ vio->suspended = false;
-
- rc = hvm_io_intercept(&p);
-
-@@ -334,7 +335,7 @@ static int hvmemul_do_io(
- else
- {
- rc = ioreq_send(s, &p, 0);
-- if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
-+ if ( rc != X86EMUL_RETRY || vio->suspended )
- vio->req.state = STATE_IOREQ_NONE;
- else if ( !ioreq_needs_completion(&vio->req) )
- rc = X86EMUL_OKAY;
-diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
-index 93f1d1503fa6..80915f27e488 100644
---- a/xen/arch/x86/hvm/io.c
-+++ b/xen/arch/x86/hvm/io.c
-@@ -138,10 +138,11 @@ bool handle_pio(uint16_t port, unsigned int size, int dir)
-
- case X86EMUL_RETRY:
- /*
-- * We should not advance RIP/EIP if the domain is shutting down or
-- * if X86EMUL_RETRY has been returned by an internal handler.
-+ * We should not advance RIP/EIP if the vio was suspended (e.g.
-+ * because the domain is shutting down) or if X86EMUL_RETRY has
-+ * been returned by an internal handler.
- */
-- if ( curr->domain->is_shutting_down || !vcpu_ioreq_pending(curr) )
-+ if ( vio->suspended || !vcpu_ioreq_pending(curr) )
- return false;
- break;
-
-diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c
-index d732dc045df9..42414b750bef 100644
---- a/xen/common/ioreq.c
-+++ b/xen/common/ioreq.c
-@@ -1256,6 +1256,7 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p,
- struct vcpu *curr = current;
- struct domain *d = curr->domain;
- struct ioreq_vcpu *sv;
-+ struct vcpu_io *vio = &curr->io;
-
- ASSERT(s);
-
-@@ -1263,7 +1264,10 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p,
- return ioreq_send_buffered(s, proto_p);
-
- if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
-+ {
-+ vio->suspended = true;
- return IOREQ_STATUS_RETRY;
-+ }
-
- list_for_each_entry ( sv,
- &s->ioreq_vcpu_list,
-diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
-index 28146ee404e6..9671062360ac 100644
---- a/xen/include/xen/sched.h
-+++ b/xen/include/xen/sched.h
-@@ -159,6 +159,11 @@ enum vio_completion {
- struct vcpu_io {
- /* I/O request in flight to device model. */
- enum vio_completion completion;
-+ /*
-+ * Indicate whether the I/O was not handled because the domain
-+ * is about to be paused.
-+ */
-+ bool suspended;
- ioreq_t req;
- };
-
---
-2.35.1
-
diff --git a/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch b/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch
deleted file mode 100644
index b20a99a..0000000
--- a/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From 4890031d224262a6cf43d3bef1af4a16c13db306 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 14:06:51 +0200
-Subject: [PATCH 18/51] build: suppress GNU ld warning about RWX load segments
-
-We cannot really avoid such and we're also not really at risk because of
-them, as we control page table permissions ourselves rather than relying
-on a loader of some sort. Present GNU ld master started warning about
-such, and hence 2.39 is anticipated to have this warning.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Acked-by: Julien Grall <jgrall@amazon.com>
-master commit: 68f5aac012b9ae36ce9b65d9ca9cc9f232191ad3
-master date: 2022-05-18 11:17:19 +0200
----
- xen/Makefile | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/xen/Makefile b/xen/Makefile
-index ce4eca3ee4d7..4d9abe704628 100644
---- a/xen/Makefile
-+++ b/xen/Makefile
-@@ -260,6 +260,8 @@ endif
-
- AFLAGS += -D__ASSEMBLY__
-
-+LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments
-+
- CFLAGS += $(CFLAGS-y)
- # allow extra CFLAGS externally via EXTRA_CFLAGS_XEN_CORE
- CFLAGS += $(EXTRA_CFLAGS_XEN_CORE)
---
-2.35.1
-
diff --git a/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch b/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch
new file mode 100644
index 0000000..d6ade98
--- /dev/null
+++ b/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch
@@ -0,0 +1,44 @@
+From e8882bcfe35520e950ba60acd6e67e65f1ce90a8 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 14:59:26 +0200
+Subject: [PATCH 18/26] x86/CPUID: surface suitable value in EBX of XSTATE
+ subleaf 1
+
+While the SDM isn't very clear about this, our present behavior makes
+Linux 5.19 unhappy. As of commit 8ad7e8f69695 ("x86/fpu/xsave: Support
+XSAVEC in the kernel") they're using this CPUID output also to size
+the compacted area used by XSAVEC. Getting back zero there isn't really
+liked, yet for PV that's the default on capable hardware: XSAVES isn't
+exposed to PV domains.
+
+Considering that the size reported is that of the compacted save area,
+I view Linux's assumption as appropriate (short of the SDM properly
+considering the case). Therefore we need to populate the field also when
+only XSAVEC is supported for a guest.
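+
+For reference, a guest-side sketch of the read in question (plain C
+with GCC inline assembly; illustrative only):
+
+    #include <stdint.h>
+
+    static uint32_t xstate_compacted_size(void)
+    {
+        uint32_t eax, ebx, ecx, edx;
+
+        /* CPUID leaf 0xd, sub-leaf 1: EBX = compacted XSAVE area size. */
+        asm volatile ( "cpuid"
+                       : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+                       : "0" (0xd), "2" (1) );
+
+        return ebx; /* was 0 for XSAVEC-only PV guests before this fix */
+    }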
+
+Fixes: 460b9a4b3630 ("x86/xsaves: enable xsaves/xrstors for hvm guest")
+Fixes: 8d050ed1097c ("x86: don't expose XSAVES capability to PV guests")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: c3bd0b83ea5b7c0da6542687436042eeea1e7909
+master date: 2022-08-24 14:23:59 +0200
+---
+ xen/arch/x86/cpuid.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index ff335f16390d..a647331f4793 100644
+--- a/xen/arch/x86/cpuid.c
++++ b/xen/arch/x86/cpuid.c
+@@ -1060,7 +1060,7 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
+ switch ( subleaf )
+ {
+ case 1:
+- if ( p->xstate.xsaves )
++ if ( p->xstate.xsavec || p->xstate.xsaves )
+ {
+ /*
+ * TODO: Figure out what to do for XSS state. VT-x manages
+--
+2.37.3
+
diff --git a/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch b/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch
deleted file mode 100644
index e4d739b..0000000
--- a/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From 1bc669a568a9f4bdab9e9ddb95823ba370dc0baf Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 14:07:11 +0200
-Subject: [PATCH 19/51] build: silence GNU ld warning about executable stacks
-
-While for C files the compiler is supposed to arrange for emitting
-respective information, for assembly sources we're responsible ourselves.
-Present GNU ld master started warning about such, and hence 2.39 is
-anticipated to have this warning.
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Acked-by: Julien Grall <jgrall@amazon.com>
-master commit: 62d22296a95d259c934ca2f39ac511d729cfbb68
-master date: 2022-05-18 11:18:45 +0200
----
- xen/Makefile | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/xen/Makefile b/xen/Makefile
-index 4d9abe704628..971028eda240 100644
---- a/xen/Makefile
-+++ b/xen/Makefile
-@@ -260,6 +260,8 @@ endif
-
- AFLAGS += -D__ASSEMBLY__
-
-+$(call cc-option-add,AFLAGS,CC,-Wa$(comma)--noexecstack)
-+
- LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments
-
- CFLAGS += $(CFLAGS-y)
---
-2.35.1
-
diff --git a/0019-xen-sched-introduce-cpupool_update_node_affinity.patch b/0019-xen-sched-introduce-cpupool_update_node_affinity.patch
new file mode 100644
index 0000000..957d0fe
--- /dev/null
+++ b/0019-xen-sched-introduce-cpupool_update_node_affinity.patch
@@ -0,0 +1,257 @@
+From d4e971ad12dd27913dffcf96b5de378ea7b476e1 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 14:59:40 +0200
+Subject: [PATCH 19/26] xen/sched: introduce cpupool_update_node_affinity()
+
+For updating the node affinities of all domains in a cpupool, add a new
+function cpupool_update_node_affinity().
+
+In order to avoid multiple allocations of cpumasks, carve out memory
+allocation and freeing from domain_update_node_affinity() into new
+helpers, which can be used by cpupool_update_node_affinity().
+
+Modify domain_update_node_affinity() to take an additional parameter
+for passing the allocated memory in and to allocate and free the memory
+via the new helpers in case NULL was passed.
+
+This will help later to pre-allocate the cpumasks in order to avoid
+allocations in stop-machine context.
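+
+Condensed sketch of the resulting calling convention (illustrative
+only; the full version is in the hunk below):
+
+    void domain_update_node_aff(struct domain *d,
+                                struct affinity_masks *affinity)
+    {
+        struct affinity_masks masks;
+
+        if ( !affinity )                /* no pre-allocated masks given */
+        {
+            if ( !alloc_affinity_masks(&masks) )
+                return;
+            affinity = &masks;
+        }
+
+        /* ... compute affinity->hard / affinity->soft ... */
+
+        if ( affinity == &masks )       /* free only what we allocated  */
+            free_affinity_masks(affinity);
+    }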
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: a83fa1e2b96ace65b45dde6954d67012633a082b
+master date: 2022-09-05 11:42:30 +0100
+---
+ xen/common/sched/core.c | 54 ++++++++++++++++++++++++++------------
+ xen/common/sched/cpupool.c | 39 +++++++++++++++------------
+ xen/common/sched/private.h | 7 +++++
+ xen/include/xen/sched.h | 9 ++++++-
+ 4 files changed, 74 insertions(+), 35 deletions(-)
+
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index f07bd2681fcb..065a83eca912 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -1824,9 +1824,28 @@ int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
+ return ret;
+ }
+
+-void domain_update_node_affinity(struct domain *d)
++bool alloc_affinity_masks(struct affinity_masks *affinity)
+ {
+- cpumask_var_t dom_cpumask, dom_cpumask_soft;
++ if ( !alloc_cpumask_var(&affinity->hard) )
++ return false;
++ if ( !alloc_cpumask_var(&affinity->soft) )
++ {
++ free_cpumask_var(affinity->hard);
++ return false;
++ }
++
++ return true;
++}
++
++void free_affinity_masks(struct affinity_masks *affinity)
++{
++ free_cpumask_var(affinity->soft);
++ free_cpumask_var(affinity->hard);
++}
++
++void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity)
++{
++ struct affinity_masks masks;
+ cpumask_t *dom_affinity;
+ const cpumask_t *online;
+ struct sched_unit *unit;
+@@ -1836,14 +1855,16 @@ void domain_update_node_affinity(struct domain *d)
+ if ( !d->vcpu || !d->vcpu[0] )
+ return;
+
+- if ( !zalloc_cpumask_var(&dom_cpumask) )
+- return;
+- if ( !zalloc_cpumask_var(&dom_cpumask_soft) )
++ if ( !affinity )
+ {
+- free_cpumask_var(dom_cpumask);
+- return;
++ affinity = &masks;
++ if ( !alloc_affinity_masks(affinity) )
++ return;
+ }
+
++ cpumask_clear(affinity->hard);
++ cpumask_clear(affinity->soft);
++
+ online = cpupool_domain_master_cpumask(d);
+
+ spin_lock(&d->node_affinity_lock);
+@@ -1864,22 +1885,21 @@ void domain_update_node_affinity(struct domain *d)
+ */
+ for_each_sched_unit ( d, unit )
+ {
+- cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity);
+- cpumask_or(dom_cpumask_soft, dom_cpumask_soft,
+- unit->cpu_soft_affinity);
++ cpumask_or(affinity->hard, affinity->hard, unit->cpu_hard_affinity);
++ cpumask_or(affinity->soft, affinity->soft, unit->cpu_soft_affinity);
+ }
+ /* Filter out non-online cpus */
+- cpumask_and(dom_cpumask, dom_cpumask, online);
+- ASSERT(!cpumask_empty(dom_cpumask));
++ cpumask_and(affinity->hard, affinity->hard, online);
++ ASSERT(!cpumask_empty(affinity->hard));
+ /* And compute the intersection between hard, online and soft */
+- cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask);
++ cpumask_and(affinity->soft, affinity->soft, affinity->hard);
+
+ /*
+ * If not empty, the intersection of hard, soft and online is the
+ * narrowest set we want. If empty, we fall back to hard&online.
+ */
+- dom_affinity = cpumask_empty(dom_cpumask_soft) ?
+- dom_cpumask : dom_cpumask_soft;
++ dom_affinity = cpumask_empty(affinity->soft) ? affinity->hard
++ : affinity->soft;
+
+ nodes_clear(d->node_affinity);
+ for_each_cpu ( cpu, dom_affinity )
+@@ -1888,8 +1908,8 @@ void domain_update_node_affinity(struct domain *d)
+
+ spin_unlock(&d->node_affinity_lock);
+
+- free_cpumask_var(dom_cpumask_soft);
+- free_cpumask_var(dom_cpumask);
++ if ( affinity == &masks )
++ free_affinity_masks(affinity);
+ }
+
+ typedef long ret_t;
+diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c
+index 8c6e6eb9ccd5..45b6ff99561a 100644
+--- a/xen/common/sched/cpupool.c
++++ b/xen/common/sched/cpupool.c
+@@ -401,6 +401,25 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c)
+ return ret;
+ }
+
++/* Update affinities of all domains in a cpupool. */
++static void cpupool_update_node_affinity(const struct cpupool *c)
++{
++ struct affinity_masks masks;
++ struct domain *d;
++
++ if ( !alloc_affinity_masks(&masks) )
++ return;
++
++ rcu_read_lock(&domlist_read_lock);
++
++ for_each_domain_in_cpupool(d, c)
++ domain_update_node_aff(d, &masks);
++
++ rcu_read_unlock(&domlist_read_lock);
++
++ free_affinity_masks(&masks);
++}
++
+ /*
+ * assign a specific cpu to a cpupool
+ * cpupool_lock must be held
+@@ -408,7 +427,6 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c)
+ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+ {
+ int ret;
+- struct domain *d;
+ const cpumask_t *cpus;
+
+ cpus = sched_get_opt_cpumask(c->gran, cpu);
+@@ -433,12 +451,7 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+
+ rcu_read_unlock(&sched_res_rculock);
+
+- rcu_read_lock(&domlist_read_lock);
+- for_each_domain_in_cpupool(d, c)
+- {
+- domain_update_node_affinity(d);
+- }
+- rcu_read_unlock(&domlist_read_lock);
++ cpupool_update_node_affinity(c);
+
+ return 0;
+ }
+@@ -447,18 +460,14 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c)
+ {
+ int cpu = cpupool_moving_cpu;
+ const cpumask_t *cpus;
+- struct domain *d;
+ int ret;
+
+ if ( c != cpupool_cpu_moving )
+ return -EADDRNOTAVAIL;
+
+- /*
+- * We need this for scanning the domain list, both in
+- * cpu_disable_scheduler(), and at the bottom of this function.
+- */
+ rcu_read_lock(&domlist_read_lock);
+ ret = cpu_disable_scheduler(cpu);
++ rcu_read_unlock(&domlist_read_lock);
+
+ rcu_read_lock(&sched_res_rculock);
+ cpus = get_sched_res(cpu)->cpus;
+@@ -485,11 +494,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c)
+ }
+ rcu_read_unlock(&sched_res_rculock);
+
+- for_each_domain_in_cpupool(d, c)
+- {
+- domain_update_node_affinity(d);
+- }
+- rcu_read_unlock(&domlist_read_lock);
++ cpupool_update_node_affinity(c);
+
+ return ret;
+ }
+diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
+index a870320146ef..2b04b01a0c0a 100644
+--- a/xen/common/sched/private.h
++++ b/xen/common/sched/private.h
+@@ -593,6 +593,13 @@ affinity_balance_cpumask(const struct sched_unit *unit, int step,
+ cpumask_copy(mask, unit->cpu_hard_affinity);
+ }
+
++struct affinity_masks {
++ cpumask_var_t hard;
++ cpumask_var_t soft;
++};
++
++bool alloc_affinity_masks(struct affinity_masks *affinity);
++void free_affinity_masks(struct affinity_masks *affinity);
+ void sched_rm_cpu(unsigned int cpu);
+ const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu);
+ void schedule_dump(struct cpupool *c);
+diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
+index 9671062360ac..3f4225738a40 100644
+--- a/xen/include/xen/sched.h
++++ b/xen/include/xen/sched.h
+@@ -655,8 +655,15 @@ static inline void get_knownalive_domain(struct domain *d)
+ ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
+ }
+
++struct affinity_masks;
++
+ int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
+-void domain_update_node_affinity(struct domain *d);
++void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity);
++
++static inline void domain_update_node_affinity(struct domain *d)
++{
++ domain_update_node_aff(d, NULL);
++}
+
+ /*
+ * To be implemented by each architecture, sanity checking the configuration
+--
+2.37.3
+
diff --git a/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch b/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch
deleted file mode 100644
index baa1e15..0000000
--- a/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch
+++ /dev/null
@@ -1,50 +0,0 @@
-From f1be0b62a03b90a40a03e21f965e4cbb89809bb1 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
- <marmarek@invisiblethingslab.com>
-Date: Tue, 7 Jun 2022 14:07:34 +0200
-Subject: [PATCH 20/51] ns16550: use poll mode if INTERRUPT_LINE is 0xff
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Intel LPSS has INTERRUPT_LINE set to 0xff by default, that is declared
-by the PCI Local Bus Specification Revision 3.0 (from 2004) as
-"unknown"/"no connection". Fallback to poll mode in this case.
-The 0xff handling is x86-specific, the surrounding code is guarded with
-CONFIG_X86 anyway.
-
-Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-master commit: 6a2ea1a2370a0c8a0210accac0ae62e68c185134
-master date: 2022-05-20 12:19:45 +0200
----
- xen/drivers/char/ns16550.c | 13 +++++++++++++
- 1 file changed, 13 insertions(+)
-
-diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c
-index 30596d60d4ed..2d2bd2a02469 100644
---- a/xen/drivers/char/ns16550.c
-+++ b/xen/drivers/char/ns16550.c
-@@ -1221,6 +1221,19 @@ pci_uart_config(struct ns16550 *uart, bool_t skip_amt, unsigned int idx)
- pci_conf_read8(PCI_SBDF(0, b, d, f),
- PCI_INTERRUPT_LINE) : 0;
-
-+#ifdef CONFIG_X86
-+ /*
-+ * PCI Local Bus Specification Revision 3.0 defines 0xff value
-+ * as special only for X86.
-+ */
-+ if ( uart->irq == 0xff )
-+ uart->irq = 0;
-+#endif
-+ if ( !uart->irq )
-+ printk(XENLOG_INFO
-+ "ns16550: %pp: no legacy IRQ, using poll mode\n",
-+ &PCI_SBDF(0, b, d, f));
-+
- return 0;
- }
- }
---
-2.35.1
-
diff --git a/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch b/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch
new file mode 100644
index 0000000..30784c3
--- /dev/null
+++ b/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch
@@ -0,0 +1,263 @@
+From c377ceab0a007690a1e71c81a5232613c99e944d Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:00:05 +0200
+Subject: [PATCH 20/26] xen/sched: carve out memory allocation and freeing from
+ schedule_cpu_rm()
+
+To prepare for not allocating or freeing memory from schedule_cpu_rm(),
+move this functionality to dedicated functions.
+
+For now call those functions from schedule_cpu_rm().
+
+No change of behavior expected.
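+
+Intended usage, sketched (illustrative only; a later change passes the
+pre-allocated data into schedule_cpu_rm() from stop_machine context):
+
+    struct cpu_rm_data *data = alloc_cpu_rm_data(cpu); /* may sleep */
+
+    if ( data )
+    {
+        /* ... removal work, no further allocations required ... */
+        free_cpu_rm_data(data, cpu);                    /* may sleep */
+    }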
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: d42be6f83480b3ada286dc18444331a816be88a3
+master date: 2022-09-05 11:42:30 +0100
+---
+ xen/common/sched/core.c | 143 ++++++++++++++++++++++---------------
+ xen/common/sched/private.h | 11 +++
+ 2 files changed, 98 insertions(+), 56 deletions(-)
+
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index 065a83eca912..2decb1161a63 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -3221,6 +3221,75 @@ out:
+ return ret;
+ }
+
++/*
++ * Allocate all memory needed for free_cpu_rm_data(), as allocations cannot
++ * be made in stop_machine() context.
++ *
++ * Between alloc_cpu_rm_data() and the real cpu removal action the relevant
++ * contents of struct sched_resource can't change, as the cpu in question is
++ * locked against any other movement to or from cpupools, and the data copied
++ * by alloc_cpu_rm_data() is modified only in case the cpu in question is
++ * being moved from or to a cpupool.
++ */
++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu)
++{
++ struct cpu_rm_data *data;
++ const struct sched_resource *sr;
++ unsigned int idx;
++
++ rcu_read_lock(&sched_res_rculock);
++
++ sr = get_sched_res(cpu);
++ data = xmalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1);
++ if ( !data )
++ goto out;
++
++ data->old_ops = sr->scheduler;
++ data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
++ data->ppriv_old = sr->sched_priv;
++
++ for ( idx = 0; idx < sr->granularity - 1; idx++ )
++ {
++ data->sr[idx] = sched_alloc_res();
++ if ( data->sr[idx] )
++ {
++ data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem();
++ if ( !data->sr[idx]->sched_unit_idle )
++ {
++ sched_res_free(&data->sr[idx]->rcu);
++ data->sr[idx] = NULL;
++ }
++ }
++ if ( !data->sr[idx] )
++ {
++ while ( idx > 0 )
++ sched_res_free(&data->sr[--idx]->rcu);
++ XFREE(data);
++ goto out;
++ }
++
++ data->sr[idx]->curr = data->sr[idx]->sched_unit_idle;
++ data->sr[idx]->scheduler = &sched_idle_ops;
++ data->sr[idx]->granularity = 1;
++
++ /* We want the lock not to change when replacing the resource. */
++ data->sr[idx]->schedule_lock = sr->schedule_lock;
++ }
++
++ out:
++ rcu_read_unlock(&sched_res_rculock);
++
++ return data;
++}
++
++void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu)
++{
++ sched_free_udata(mem->old_ops, mem->vpriv_old);
++ sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu);
++
++ xfree(mem);
++}
++
+ /*
+ * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
+ * (the idle scheduler).
+@@ -3229,53 +3298,23 @@ out:
+ */
+ int schedule_cpu_rm(unsigned int cpu)
+ {
+- void *ppriv_old, *vpriv_old;
+- struct sched_resource *sr, **sr_new = NULL;
++ struct sched_resource *sr;
++ struct cpu_rm_data *data;
+ struct sched_unit *unit;
+- struct scheduler *old_ops;
+ spinlock_t *old_lock;
+ unsigned long flags;
+- int idx, ret = -ENOMEM;
++ int idx = 0;
+ unsigned int cpu_iter;
+
++ data = alloc_cpu_rm_data(cpu);
++ if ( !data )
++ return -ENOMEM;
++
+ rcu_read_lock(&sched_res_rculock);
+
+ sr = get_sched_res(cpu);
+- old_ops = sr->scheduler;
+
+- if ( sr->granularity > 1 )
+- {
+- sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
+- if ( !sr_new )
+- goto out;
+- for ( idx = 0; idx < sr->granularity - 1; idx++ )
+- {
+- sr_new[idx] = sched_alloc_res();
+- if ( sr_new[idx] )
+- {
+- sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
+- if ( !sr_new[idx]->sched_unit_idle )
+- {
+- sched_res_free(&sr_new[idx]->rcu);
+- sr_new[idx] = NULL;
+- }
+- }
+- if ( !sr_new[idx] )
+- {
+- for ( idx--; idx >= 0; idx-- )
+- sched_res_free(&sr_new[idx]->rcu);
+- goto out;
+- }
+- sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
+- sr_new[idx]->scheduler = &sched_idle_ops;
+- sr_new[idx]->granularity = 1;
+-
+- /* We want the lock not to change when replacing the resource. */
+- sr_new[idx]->schedule_lock = sr->schedule_lock;
+- }
+- }
+-
+- ret = 0;
++ ASSERT(sr->granularity);
+ ASSERT(sr->cpupool != NULL);
+ ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
+ ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
+@@ -3283,10 +3322,6 @@ int schedule_cpu_rm(unsigned int cpu)
+ /* See comment in schedule_cpu_add() regarding lock switching. */
+ old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+
+- vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
+- ppriv_old = sr->sched_priv;
+-
+- idx = 0;
+ for_each_cpu ( cpu_iter, sr->cpus )
+ {
+ per_cpu(sched_res_idx, cpu_iter) = 0;
+@@ -3300,27 +3335,27 @@ int schedule_cpu_rm(unsigned int cpu)
+ else
+ {
+ /* Initialize unit. */
+- unit = sr_new[idx]->sched_unit_idle;
+- unit->res = sr_new[idx];
++ unit = data->sr[idx]->sched_unit_idle;
++ unit->res = data->sr[idx];
+ unit->is_running = true;
+ sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
+ sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
+
+ /* Adjust cpu masks of resources (old and new). */
+ cpumask_clear_cpu(cpu_iter, sr->cpus);
+- cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
++ cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus);
+ cpumask_set_cpu(cpu_iter, &sched_res_mask);
+
+ /* Init timer. */
+- init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
++ init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
+
+ /* Last resource initializations and insert resource pointer. */
+- sr_new[idx]->master_cpu = cpu_iter;
+- set_sched_res(cpu_iter, sr_new[idx]);
++ data->sr[idx]->master_cpu = cpu_iter;
++ set_sched_res(cpu_iter, data->sr[idx]);
+
+ /* Last action: set the new lock pointer. */
+ smp_mb();
+- sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
++ data->sr[idx]->schedule_lock = &sched_free_cpu_lock;
+
+ idx++;
+ }
+@@ -3336,16 +3371,12 @@ int schedule_cpu_rm(unsigned int cpu)
+ /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
+ spin_unlock_irqrestore(old_lock, flags);
+
+- sched_deinit_pdata(old_ops, ppriv_old, cpu);
++ sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu);
+
+- sched_free_udata(old_ops, vpriv_old);
+- sched_free_pdata(old_ops, ppriv_old, cpu);
+-
+-out:
+ rcu_read_unlock(&sched_res_rculock);
+- xfree(sr_new);
++ free_cpu_rm_data(data, cpu);
+
+- return ret;
++ return 0;
+ }
+
+ struct scheduler *scheduler_get_default(void)
+diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
+index 2b04b01a0c0a..e286849a1312 100644
+--- a/xen/common/sched/private.h
++++ b/xen/common/sched/private.h
+@@ -600,6 +600,15 @@ struct affinity_masks {
+
+ bool alloc_affinity_masks(struct affinity_masks *affinity);
+ void free_affinity_masks(struct affinity_masks *affinity);
++
++/* Memory allocation related data for schedule_cpu_rm(). */
++struct cpu_rm_data {
++ const struct scheduler *old_ops;
++ void *ppriv_old;
++ void *vpriv_old;
++ struct sched_resource *sr[];
++};
++
+ void sched_rm_cpu(unsigned int cpu);
+ const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu);
+ void schedule_dump(struct cpupool *c);
+@@ -608,6 +617,8 @@ struct scheduler *scheduler_alloc(unsigned int sched_id);
+ void scheduler_free(struct scheduler *sched);
+ int cpu_disable_scheduler(unsigned int cpu);
+ int schedule_cpu_add(unsigned int cpu, struct cpupool *c);
++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu);
++void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu);
+ int schedule_cpu_rm(unsigned int cpu);
+ int sched_move_domain(struct domain *d, struct cpupool *c);
+ struct cpupool *cpupool_get_by_id(unsigned int poolid);
+--
+2.37.3
+
diff --git a/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch b/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch
deleted file mode 100644
index 1312bda..0000000
--- a/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch
+++ /dev/null
@@ -1,56 +0,0 @@
-From 8e11ec8fbf6f933f8854f4bc54226653316903f2 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 14:08:06 +0200
-Subject: [PATCH 21/51] PCI: don't allow "pci-phantom=" to mark real devices as
- phantom functions
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-IOMMU code mapping / unmapping devices and interrupts will misbehave if
-a wrong command line option declared a function "phantom" when there's a
-real device at that position. Warn about this and adjust the specified
-stride (in the worst case ignoring the option altogether).
-
-Requested-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-master commit: 444b555dc9e09fa3ce90f066e0c88dec9b47f422
-master date: 2022-05-20 12:20:35 +0200
----
- xen/drivers/passthrough/pci.c | 19 ++++++++++++++++++-
- 1 file changed, 18 insertions(+), 1 deletion(-)
-
-diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
-index 395958698e6a..e0491c908f10 100644
---- a/xen/drivers/passthrough/pci.c
-+++ b/xen/drivers/passthrough/pci.c
-@@ -382,7 +382,24 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
- phantom_devs[i].slot == PCI_SLOT(devfn) &&
- phantom_devs[i].stride > PCI_FUNC(devfn) )
- {
-- pdev->phantom_stride = phantom_devs[i].stride;
-+ pci_sbdf_t sbdf = pdev->sbdf;
-+ unsigned int stride = phantom_devs[i].stride;
-+
-+ while ( (sbdf.fn += stride) > PCI_FUNC(devfn) )
-+ {
-+ if ( pci_conf_read16(sbdf, PCI_VENDOR_ID) == 0xffff &&
-+ pci_conf_read16(sbdf, PCI_DEVICE_ID) == 0xffff )
-+ continue;
-+ stride <<= 1;
-+ printk(XENLOG_WARNING
-+ "%pp looks to be a real device; bumping %04x:%02x:%02x stride to %u\n",
-+ &sbdf, phantom_devs[i].seg,
-+ phantom_devs[i].bus, phantom_devs[i].slot,
-+ stride);
-+ sbdf = pdev->sbdf;
-+ }
-+ if ( PCI_FUNC(stride) )
-+ pdev->phantom_stride = stride;
- break;
- }
- }
---
-2.35.1
-
diff --git a/0021-xen-sched-fix-cpu-hotplug.patch b/0021-xen-sched-fix-cpu-hotplug.patch
new file mode 100644
index 0000000..ea0b732
--- /dev/null
+++ b/0021-xen-sched-fix-cpu-hotplug.patch
@@ -0,0 +1,307 @@
+From 4f3204c2bc66db18c61600dd3e08bf1fd9584a1b Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:00:19 +0200
+Subject: [PATCH 21/26] xen/sched: fix cpu hotplug
+
+CPU unplugging calls schedule_cpu_rm() via stop_machine_run() with
+interrupts disabled, so any memory allocation or freeing must be
+avoided.
+
+Since commit 5047cd1d5dea ("xen/common: Use enhanced
+ASSERT_ALLOC_CONTEXT in xmalloc()") this restriction is being enforced
+via an assertion, which will now fail.
+
+Fix this by allocating needed memory before entering stop_machine_run()
+and freeing any memory only after having finished stop_machine_run().
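+
+Simplified sketch of the resulting notifier sequencing (illustrative
+only; error handling and cpupool_cpu_add() on CPU_DOWN_FAILED omitted):
+
+    case CPU_DOWN_PREPARE:           /* process context: may allocate */
+        mem = alloc_cpu_rm_data(cpu, true);
+        break;
+    case CPU_DYING:                  /* stop_machine: no allocations  */
+        cpupool_cpu_remove(cpu, mem);
+        break;
+    case CPU_DEAD:                   /* process context again: free   */
+    case CPU_DOWN_FAILED:
+        free_cpu_rm_data(mem, cpu);
+        mem = NULL;
+        break;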
+
+Fixes: 1ec410112cdd ("xen/sched: support differing granularity in schedule_cpu_[add/rm]()")
+Reported-by: Gao Ruifeng <ruifeng.gao@intel.com>
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: d84473689611eed32fd90b27e614f28af767fa3f
+master date: 2022-09-05 11:42:30 +0100
+---
+ xen/common/sched/core.c | 25 +++++++++++---
+ xen/common/sched/cpupool.c | 69 +++++++++++++++++++++++++++++---------
+ xen/common/sched/private.h | 5 +--
+ 3 files changed, 77 insertions(+), 22 deletions(-)
+
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index 2decb1161a63..900aab8f66a7 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -3231,7 +3231,7 @@ out:
+ * by alloc_cpu_rm_data() is modified only in case the cpu in question is
+ * being moved from or to a cpupool.
+ */
+-struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu)
++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc)
+ {
+ struct cpu_rm_data *data;
+ const struct sched_resource *sr;
+@@ -3244,6 +3244,17 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu)
+ if ( !data )
+ goto out;
+
++ if ( aff_alloc )
++ {
++ if ( !alloc_affinity_masks(&data->affinity) )
++ {
++ XFREE(data);
++ goto out;
++ }
++ }
++ else
++ memset(&data->affinity, 0, sizeof(data->affinity));
++
+ data->old_ops = sr->scheduler;
+ data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
+ data->ppriv_old = sr->sched_priv;
+@@ -3264,6 +3275,7 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu)
+ {
+ while ( idx > 0 )
+ sched_res_free(&data->sr[--idx]->rcu);
++ free_affinity_masks(&data->affinity);
+ XFREE(data);
+ goto out;
+ }
+@@ -3286,6 +3298,7 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu)
+ {
+ sched_free_udata(mem->old_ops, mem->vpriv_old);
+ sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu);
++ free_affinity_masks(&mem->affinity);
+
+ xfree(mem);
+ }
+@@ -3296,17 +3309,18 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu)
+ * The cpu is already marked as "free" and not valid any longer for its
+ * cpupool.
+ */
+-int schedule_cpu_rm(unsigned int cpu)
++int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *data)
+ {
+ struct sched_resource *sr;
+- struct cpu_rm_data *data;
+ struct sched_unit *unit;
+ spinlock_t *old_lock;
+ unsigned long flags;
+ int idx = 0;
+ unsigned int cpu_iter;
++ bool free_data = !data;
+
+- data = alloc_cpu_rm_data(cpu);
++ if ( !data )
++ data = alloc_cpu_rm_data(cpu, false);
+ if ( !data )
+ return -ENOMEM;
+
+@@ -3374,7 +3388,8 @@ int schedule_cpu_rm(unsigned int cpu)
+ sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu);
+
+ rcu_read_unlock(&sched_res_rculock);
+- free_cpu_rm_data(data, cpu);
++ if ( free_data )
++ free_cpu_rm_data(data, cpu);
+
+ return 0;
+ }
+diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c
+index 45b6ff99561a..b5a948639aad 100644
+--- a/xen/common/sched/cpupool.c
++++ b/xen/common/sched/cpupool.c
+@@ -402,22 +402,28 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c)
+ }
+
+ /* Update affinities of all domains in a cpupool. */
+-static void cpupool_update_node_affinity(const struct cpupool *c)
++static void cpupool_update_node_affinity(const struct cpupool *c,
++ struct affinity_masks *masks)
+ {
+- struct affinity_masks masks;
++ struct affinity_masks local_masks;
+ struct domain *d;
+
+- if ( !alloc_affinity_masks(&masks) )
+- return;
++ if ( !masks )
++ {
++ if ( !alloc_affinity_masks(&local_masks) )
++ return;
++ masks = &local_masks;
++ }
+
+ rcu_read_lock(&domlist_read_lock);
+
+ for_each_domain_in_cpupool(d, c)
+- domain_update_node_aff(d, &masks);
++ domain_update_node_aff(d, masks);
+
+ rcu_read_unlock(&domlist_read_lock);
+
+- free_affinity_masks(&masks);
++ if ( masks == &local_masks )
++ free_affinity_masks(masks);
+ }
+
+ /*
+@@ -451,15 +457,17 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
+
+ rcu_read_unlock(&sched_res_rculock);
+
+- cpupool_update_node_affinity(c);
++ cpupool_update_node_affinity(c, NULL);
+
+ return 0;
+ }
+
+-static int cpupool_unassign_cpu_finish(struct cpupool *c)
++static int cpupool_unassign_cpu_finish(struct cpupool *c,
++ struct cpu_rm_data *mem)
+ {
+ int cpu = cpupool_moving_cpu;
+ const cpumask_t *cpus;
++ struct affinity_masks *masks = mem ? &mem->affinity : NULL;
+ int ret;
+
+ if ( c != cpupool_cpu_moving )
+@@ -482,7 +490,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c)
+ */
+ if ( !ret )
+ {
+- ret = schedule_cpu_rm(cpu);
++ ret = schedule_cpu_rm(cpu, mem);
+ if ( ret )
+ cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
+ else
+@@ -494,7 +502,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c)
+ }
+ rcu_read_unlock(&sched_res_rculock);
+
+- cpupool_update_node_affinity(c);
++ cpupool_update_node_affinity(c, masks);
+
+ return ret;
+ }
+@@ -558,7 +566,7 @@ static long cpupool_unassign_cpu_helper(void *info)
+ cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu);
+ spin_lock(&cpupool_lock);
+
+- ret = cpupool_unassign_cpu_finish(c);
++ ret = cpupool_unassign_cpu_finish(c, NULL);
+
+ spin_unlock(&cpupool_lock);
+ debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret);
+@@ -701,7 +709,7 @@ static int cpupool_cpu_add(unsigned int cpu)
+ * This function is called in stop_machine context, so we can be sure no
+ * non-idle vcpu is active on the system.
+ */
+-static void cpupool_cpu_remove(unsigned int cpu)
++static void cpupool_cpu_remove(unsigned int cpu, struct cpu_rm_data *mem)
+ {
+ int ret;
+
+@@ -709,7 +717,7 @@ static void cpupool_cpu_remove(unsigned int cpu)
+
+ if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) )
+ {
+- ret = cpupool_unassign_cpu_finish(cpupool0);
++ ret = cpupool_unassign_cpu_finish(cpupool0, mem);
+ BUG_ON(ret);
+ }
+ cpumask_clear_cpu(cpu, &cpupool_free_cpus);
+@@ -775,7 +783,7 @@ static void cpupool_cpu_remove_forced(unsigned int cpu)
+ {
+ ret = cpupool_unassign_cpu_start(c, master_cpu);
+ BUG_ON(ret);
+- ret = cpupool_unassign_cpu_finish(c);
++ ret = cpupool_unassign_cpu_finish(c, NULL);
+ BUG_ON(ret);
+ }
+ }
+@@ -993,12 +1001,24 @@ void dump_runq(unsigned char key)
+ static int cpu_callback(
+ struct notifier_block *nfb, unsigned long action, void *hcpu)
+ {
++ static struct cpu_rm_data *mem;
++
+ unsigned int cpu = (unsigned long)hcpu;
+ int rc = 0;
+
+ switch ( action )
+ {
+ case CPU_DOWN_FAILED:
++ if ( system_state <= SYS_STATE_active )
++ {
++ if ( mem )
++ {
++ free_cpu_rm_data(mem, cpu);
++ mem = NULL;
++ }
++ rc = cpupool_cpu_add(cpu);
++ }
++ break;
+ case CPU_ONLINE:
+ if ( system_state <= SYS_STATE_active )
+ rc = cpupool_cpu_add(cpu);
+@@ -1006,12 +1026,31 @@ static int cpu_callback(
+ case CPU_DOWN_PREPARE:
+ /* Suspend/Resume don't change assignments of cpus to cpupools. */
+ if ( system_state <= SYS_STATE_active )
++ {
+ rc = cpupool_cpu_remove_prologue(cpu);
++ if ( !rc )
++ {
++ ASSERT(!mem);
++ mem = alloc_cpu_rm_data(cpu, true);
++ rc = mem ? 0 : -ENOMEM;
++ }
++ }
+ break;
+ case CPU_DYING:
+ /* Suspend/Resume don't change assignments of cpus to cpupools. */
+ if ( system_state <= SYS_STATE_active )
+- cpupool_cpu_remove(cpu);
++ {
++ ASSERT(mem);
++ cpupool_cpu_remove(cpu, mem);
++ }
++ break;
++ case CPU_DEAD:
++ if ( system_state <= SYS_STATE_active )
++ {
++ ASSERT(mem);
++ free_cpu_rm_data(mem, cpu);
++ mem = NULL;
++ }
+ break;
+ case CPU_RESUME_FAILED:
+ cpupool_cpu_remove_forced(cpu);
+diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
+index e286849a1312..0126a4bb9ed3 100644
+--- a/xen/common/sched/private.h
++++ b/xen/common/sched/private.h
+@@ -603,6 +603,7 @@ void free_affinity_masks(struct affinity_masks *affinity);
+
+ /* Memory allocation related data for schedule_cpu_rm(). */
+ struct cpu_rm_data {
++ struct affinity_masks affinity;
+ const struct scheduler *old_ops;
+ void *ppriv_old;
+ void *vpriv_old;
+@@ -617,9 +618,9 @@ struct scheduler *scheduler_alloc(unsigned int sched_id);
+ void scheduler_free(struct scheduler *sched);
+ int cpu_disable_scheduler(unsigned int cpu);
+ int schedule_cpu_add(unsigned int cpu, struct cpupool *c);
+-struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu);
++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc);
+ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu);
+-int schedule_cpu_rm(unsigned int cpu);
++int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *mem);
+ int sched_move_domain(struct domain *d, struct cpupool *c);
+ struct cpupool *cpupool_get_by_id(unsigned int poolid);
+ void cpupool_put(struct cpupool *pool);
+--
+2.37.3
+
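The sched hunks above follow a preallocate-early/use-late discipline: the cpu_rm_data scratch state is allocated in CPU_DOWN_PREPARE, where an allocation failure can still veto the hot-unplug, is consumed in CPU_DYING (stop_machine context, where allocating is unsafe), and is released again in CPU_DEAD or CPU_DOWN_FAILED. A minimal standalone sketch of that shape, with illustrative names rather than Xen APIs:

```c
/*
 * Minimal sketch (not Xen code) of the pattern above: allocate scratch
 * state in CPU_DOWN_PREPARE, where failure can still veto the unplug,
 * consume it in CPU_DYING, where allocation would be unsafe, and free
 * it in CPU_DEAD / CPU_DOWN_FAILED.  All names are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct rm_data { int cpu; /* + scheduler scratch state in reality */ };

static struct rm_data *mem;   /* mirrors the static in cpu_callback() */

static int down_prepare(int cpu)
{
    mem = malloc(sizeof(*mem));   /* may fail: -ENOMEM vetoes unplug */
    if ( !mem )
        return -1;
    mem->cpu = cpu;
    return 0;
}

static void dying(void)
{
    /* stop_machine context: no allocation, use the preallocated data. */
    printf("removing cpu %d with preallocated data\n", mem->cpu);
}

static void dead_or_failed(void)
{
    free(mem);
    mem = NULL;
}

int main(void)
{
    if ( !down_prepare(3) )
    {
        dying();
        dead_or_failed();
    }
    return 0;
}
```

A single static slot suffices because CPUs go through this notifier sequence one at a time.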
diff --git a/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch b/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch
new file mode 100644
index 0000000..03f485a
--- /dev/null
+++ b/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch
@@ -0,0 +1,58 @@
+From 2b694dd2932be78431b14257f23b738f2fc8f6a1 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 15:00:33 +0200
+Subject: [PATCH 22/26] Config.mk: correct PIE-related option(s) in
+ EMBEDDED_EXTRA_CFLAGS
+
+I haven't been able to find evidence of "-nopie" ever having been a
+supported compiler option. The correct spelling is "-no-pie".
+Furthermore like "-pie" this is an option which is solely passed to the
+linker. The compiler only recognizes "-fpie" / "-fPIE" / "-fno-pie", and
+it doesn't infer these options from "-pie" / "-no-pie".
+
+Add the compiler recognized form, but for the possible case of the
+variable also being used somewhere for linking keep the linker option as
+well (with corrected spelling).
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+
+Build: Drop -no-pie from EMBEDDED_EXTRA_CFLAGS
+
+This breaks all Clang builds, as demonstrated by Gitlab CI.
+
+Contrary to the description in ecd6b9759919, -no-pie is not even an option
+passed to the linker. GCC's actual behaviour is to inhibit the passing of
+-pie to the linker, as well as selecting different crt0 artefacts to be linked.
+
+EMBEDDED_EXTRA_CFLAGS is not used for $(CC)-doing-linking, and not liable to
+gain such a usecase.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+Tested-by: Stefano Stabellini <sstabellini@kernel.org>
+Fixes: ecd6b9759919 ("Config.mk: correct PIE-related option(s) in EMBEDDED_EXTRA_CFLAGS")
+master commit: ecd6b9759919fa6335b0be1b5fc5cce29a30c4f1
+master date: 2022-09-08 09:25:26 +0200
+master commit: 13a7c0074ac8fb31f6c0485429b7a20a1946cb22
+master date: 2022-09-27 15:40:42 -0700
+---
+ Config.mk | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/Config.mk b/Config.mk
+index 46de3cd1e0e1..6f95067b8de6 100644
+--- a/Config.mk
++++ b/Config.mk
+@@ -197,7 +197,7 @@ endif
+ APPEND_LDFLAGS += $(foreach i, $(APPEND_LIB), -L$(i))
+ APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i))
+
+-EMBEDDED_EXTRA_CFLAGS := -nopie -fno-stack-protector -fno-stack-protector-all
++EMBEDDED_EXTRA_CFLAGS := -fno-pie -fno-stack-protector -fno-stack-protector-all
+ EMBEDDED_EXTRA_CFLAGS += -fno-exceptions -fno-asynchronous-unwind-tables
+
+ XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles
+--
+2.37.3
+
diff --git a/0022-x86-pv-Clean-up-_get_page_type.patch b/0022-x86-pv-Clean-up-_get_page_type.patch
deleted file mode 100644
index 0270beb..0000000
--- a/0022-x86-pv-Clean-up-_get_page_type.patch
+++ /dev/null
@@ -1,180 +0,0 @@
-From b152dfbc3ad71a788996440b18174d995c3bffc9 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:27:19 +0200
-Subject: [PATCH 22/51] x86/pv: Clean up _get_page_type()
-
-Various fixes for clarity, ahead of making complicated changes.
-
- * Split the overflow check out of the if/else chain for type handling, as
- it's somewhat unrelated.
- * Comment the main if/else chain to explain what is going on. Adjust one
- ASSERT() and state the bit layout for validate-locked and partial states.
- * Correct the comment about TLB flushing, as it's backwards. The problem
- case is when writeable mappings are retained to a page becoming read-only,
- as it allows the guest to bypass Xen's safety checks for updates.
- * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not
- valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat
- all reads as explicitly volatile. The only thing preventing the validated
- wait-loop being infinite is the compiler barrier hidden in cpu_relax().
- * Replace one page_get_owner(page) with the already-calculated 'd' already in
- scope.
-
-No functional change.
-
-This is part of XSA-401 / CVE-2022-26362.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-master commit: 9186e96b199e4f7e52e033b238f9fe869afb69c7
-master date: 2022-06-09 14:20:36 +0200
----
- xen/arch/x86/mm.c | 72 +++++++++++++++++++++++++++++++++++++++--------
- 1 file changed, 61 insertions(+), 11 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 4ee2de11051d..79ad7fdd2b82 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -2906,16 +2906,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
- static int _get_page_type(struct page_info *page, unsigned long type,
- bool preemptible)
- {
-- unsigned long nx, x, y = page->u.inuse.type_info;
-+ unsigned long nx, x;
- int rc = 0;
-
- ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
- ASSERT(!in_irq());
-
-- for ( ; ; )
-+ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
- {
- x = y;
- nx = x + 1;
-+
- if ( unlikely((nx & PGT_count_mask) == 0) )
- {
- gdprintk(XENLOG_WARNING,
-@@ -2923,8 +2924,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- mfn_x(page_to_mfn(page)));
- return -EINVAL;
- }
-- else if ( unlikely((x & PGT_count_mask) == 0) )
-+
-+ if ( unlikely((x & PGT_count_mask) == 0) )
- {
-+ /*
-+ * Typeref 0 -> 1.
-+ *
-+ * Type changes are permitted when the typeref is 0. If the type
-+ * actually changes, the page needs re-validating.
-+ */
- struct domain *d = page_get_owner(page);
-
- if ( d && shadow_mode_enabled(d) )
-@@ -2935,8 +2943,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- {
- /*
- * On type change we check to flush stale TLB entries. It is
-- * vital that no other CPUs are left with mappings of a frame
-- * which is about to become writeable to the guest.
-+ * vital that no other CPUs are left with writeable mappings
-+ * to a frame which is intending to become pgtable/segdesc.
- */
- cpumask_t *mask = this_cpu(scratch_cpumask);
-
-@@ -2948,7 +2956,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(!cpumask_empty(mask)) &&
- /* Shadow mode: track only writable pages. */
-- (!shadow_mode_enabled(page_get_owner(page)) ||
-+ (!shadow_mode_enabled(d) ||
- ((nx & PGT_type_mask) == PGT_writable_page)) )
- {
- perfc_incr(need_flush_tlb_flush);
-@@ -2979,7 +2987,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
- else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
- {
-- /* Don't log failure if it could be a recursive-mapping attempt. */
-+ /*
-+ * else, we're trying to take a new reference, of the wrong type.
-+ *
-+ * This (being able to prohibit use of the wrong type) is what the
-+ * typeref system exists for, but skip printing the failure if it
-+ * looks like a recursive mapping, as subsequent logic might
-+ * ultimately permit the attempt.
-+ */
- if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
- (type == PGT_l1_page_table) )
- return -EINVAL;
-@@ -2998,18 +3013,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
- else if ( unlikely(!(x & PGT_validated)) )
- {
-+ /*
-+ * else, the count is non-zero, and we're grabbing the right type;
-+ * but the page hasn't been validated yet.
-+ *
-+ * The page is in one of two states (depending on PGT_partial),
-+ * and should have exactly one reference.
-+ */
-+ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
-+
- if ( !(x & PGT_partial) )
- {
-- /* Someone else is updating validation of this page. Wait... */
-+ /*
-+ * The page has been left in the "validate locked" state
-+ * (i.e. PGT_[type] | 1) which means that a concurrent caller
-+ * of _get_page_type() is in the middle of validation.
-+ *
-+ * Spin waiting for the concurrent user to complete (partial
-+ * or fully validated), then restart our attempt to acquire a
-+ * type reference.
-+ */
- do {
- if ( preemptible && hypercall_preempt_check() )
- return -EINTR;
- cpu_relax();
-- } while ( (y = page->u.inuse.type_info) == x );
-+ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
- continue;
- }
-- /* Type ref count was left at 1 when PGT_partial got set. */
-- ASSERT((x & PGT_count_mask) == 1);
-+
-+ /*
-+ * The page has been left in the "partial" state
-+ * (i.e., PGT_[type] | PGT_partial | 1).
-+ *
-+ * Rather than bumping the type count, we need to try to grab the
-+ * validation lock; if we succeed, we need to validate the page,
-+ * then drop the general ref associated with the PGT_partial bit.
-+ *
-+ * We grab the validation lock by setting nx to (PGT_[type] | 1)
-+ * (i.e., non-zero type count, neither PGT_validated nor
-+ * PGT_partial set).
-+ */
- nx = x & ~PGT_partial;
- }
-
-@@ -3058,6 +3101,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
-
- out:
-+ /*
-+ * Did we drop the PGT_partial bit when acquiring the typeref? If so,
-+ * drop the general reference that went along with it.
-+ *
-+ * N.B. validate_page() may have re-set PGT_partial, not reflected in
-+ * nx, but will have taken an extra ref when doing so.
-+ */
- if ( (x & PGT_partial) && !(nx & PGT_partial) )
- put_page(page);
-
---
-2.35.1
-
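The ACCESS_ONCE() changes in the patch removed above exist because nothing else forces the compiler to re-read type_info inside the wait loop; only the barrier hidden in cpu_relax() kept the loop from being optimised into an infinite spin. A compile-only sketch of the idiom, assuming GCC on x86 (typeof and inline asm are GNU extensions):

```c
/*
 * Compile-only sketch (not Xen code) of the ACCESS_ONCE() idiom the
 * removed patch introduced.  Without a volatile read, the compiler may
 * hoist the load of type_info out of the loop, turning the "wait for
 * the concurrent validator" spin into an infinite loop; the original
 * code was only saved by the barrier hidden in cpu_relax().
 */
#include <stdint.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

struct page { unsigned long type_info; };

/* Spin until another CPU moves type_info away from the value 'x'. */
unsigned long wait_for_change(struct page *pg, unsigned long x)
{
    unsigned long y;

    do {
        __asm__ volatile ("pause" ::: "memory");   /* ~ cpu_relax() */
    } while ( (y = ACCESS_ONCE(pg->type_info)) == x );

    return y;
}
```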
diff --git a/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch b/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch
new file mode 100644
index 0000000..45f7509
--- /dev/null
+++ b/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch
@@ -0,0 +1,41 @@
+From 49510071ee93905378e54664778760ed3908d447 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:00:59 +0200
+Subject: [PATCH 23/26] tools/xenstore: minor fix of the migration stream doc
+
+Drop mentioning the non-existent read-only socket in the migration
+stream description document.
+
+The related record field was removed in commit 8868a0e3f674 ("docs:
+update the xenstore migration stream documentation").
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+master commit: ace1d2eff80d3d66c37ae765dae3e3cb5697e5a4
+master date: 2022-09-08 09:25:58 +0200
+---
+ docs/designs/xenstore-migration.md | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/docs/designs/xenstore-migration.md b/docs/designs/xenstore-migration.md
+index 5f1155273ec3..78530bbb0ef4 100644
+--- a/docs/designs/xenstore-migration.md
++++ b/docs/designs/xenstore-migration.md
+@@ -129,11 +129,9 @@ xenstored state that needs to be restored.
+ | `evtchn-fd` | The file descriptor used to communicate with |
+ | | the event channel driver |
+
+-xenstored will resume in the original process context. Hence `rw-socket-fd` and
+-`ro-socket-fd` simply specify the file descriptors of the sockets. Sockets
+-are not always used, however, and so -1 will be used to denote an unused
+-socket.
+-
++xenstored will resume in the original process context. Hence `rw-socket-fd`
++simply specifies the file descriptor of the socket. Sockets are not always
++used, however, and so -1 will be used to denote an unused socket.
+
+ \pagebreak
+
+--
+2.37.3
+
diff --git a/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch b/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch
deleted file mode 100644
index 1e3febd..0000000
--- a/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch
+++ /dev/null
@@ -1,201 +0,0 @@
-From 8dab3f79b122e69cbcdebca72cdc14f004ee2193 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:27:37 +0200
-Subject: [PATCH 23/51] x86/pv: Fix ABAC cmpxchg() race in _get_page_type()
-
-_get_page_type() suffers from a race condition where it incorrectly assumes
-that because 'x' was read and a subsequent a cmpxchg() succeeds, the type
-cannot have changed in-between. Consider:
-
-CPU A:
- 1. Creates an L2e referencing pg
- `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
- 2. Issues flush_tlb_mask()
-CPU B:
- 3. Creates a writeable mapping of pg
- `-> _get_page_type(pg, PGT_writable_page), count increases to 1
- 4. Writes into new mapping, creating a TLB entry for pg
- 5. Removes the writeable mapping of pg
- `-> _put_page_type(pg), count goes back down to 0
-CPU A:
- 7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table
-
-CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
-suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred
-until after the guest is prohibited from creating new writeable mappings,
-which is after step 7.
-
-Defer all safety actions until after the cmpxchg() has successfully taken the
-intended typeref, because that is what prevents concurrent users from using
-the old type.
-
-Also remove the early validation for writeable and shared pages. This removes
-race conditions where one half of a parallel mapping attempt can return
-successfully before:
- * The IOMMU pagetables are in sync with the new page type
- * Writeable mappings to shared pages have been torn down
-
-This is part of XSA-401 / CVE-2022-26362.
-
-Reported-by: Jann Horn <jannh@google.com>
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-master commit: 8cc5036bc385112a82f1faff27a0970e6440dfed
-master date: 2022-06-09 14:21:04 +0200
----
- xen/arch/x86/mm.c | 116 ++++++++++++++++++++++++++--------------------
- 1 file changed, 67 insertions(+), 49 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 79ad7fdd2b82..c6429b0f749a 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -2933,56 +2933,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- * Type changes are permitted when the typeref is 0. If the type
- * actually changes, the page needs re-validating.
- */
-- struct domain *d = page_get_owner(page);
--
-- if ( d && shadow_mode_enabled(d) )
-- shadow_prepare_page_type_change(d, page, type);
-
- ASSERT(!(x & PGT_pae_xen_l2));
- if ( (x & PGT_type_mask) != type )
- {
-- /*
-- * On type change we check to flush stale TLB entries. It is
-- * vital that no other CPUs are left with writeable mappings
-- * to a frame which is intending to become pgtable/segdesc.
-- */
-- cpumask_t *mask = this_cpu(scratch_cpumask);
--
-- BUG_ON(in_irq());
-- cpumask_copy(mask, d->dirty_cpumask);
--
-- /* Don't flush if the timestamp is old enough */
-- tlbflush_filter(mask, page->tlbflush_timestamp);
--
-- if ( unlikely(!cpumask_empty(mask)) &&
-- /* Shadow mode: track only writable pages. */
-- (!shadow_mode_enabled(d) ||
-- ((nx & PGT_type_mask) == PGT_writable_page)) )
-- {
-- perfc_incr(need_flush_tlb_flush);
-- /*
-- * If page was a page table make sure the flush is
-- * performed using an IPI in order to avoid changing the
-- * type of a page table page under the feet of
-- * spurious_page_fault().
-- */
-- flush_mask(mask,
-- (x & PGT_type_mask) &&
-- (x & PGT_type_mask) <= PGT_root_page_table
-- ? FLUSH_TLB | FLUSH_FORCE_IPI
-- : FLUSH_TLB);
-- }
--
-- /* We lose existing type and validity. */
- nx &= ~(PGT_type_mask | PGT_validated);
- nx |= type;
--
-- /*
-- * No special validation needed for writable pages.
-- * Page tables and GDT/LDT need to be scanned for validity.
-- */
-- if ( type == PGT_writable_page || type == PGT_shared_page )
-- nx |= PGT_validated;
- }
- }
- else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
-@@ -3063,6 +3019,56 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- return -EINTR;
- }
-
-+ /*
-+ * One typeref has been taken and is now globally visible.
-+ *
-+ * The page is either in the "validate locked" state (PGT_[type] | 1) or
-+ * fully validated (PGT_[type] | PGT_validated | >0).
-+ */
-+
-+ if ( unlikely((x & PGT_count_mask) == 0) )
-+ {
-+ struct domain *d = page_get_owner(page);
-+
-+ if ( d && shadow_mode_enabled(d) )
-+ shadow_prepare_page_type_change(d, page, type);
-+
-+ if ( (x & PGT_type_mask) != type )
-+ {
-+ /*
-+ * On type change we check to flush stale TLB entries. It is
-+ * vital that no other CPUs are left with writeable mappings
-+ * to a frame which is intending to become pgtable/segdesc.
-+ */
-+ cpumask_t *mask = this_cpu(scratch_cpumask);
-+
-+ BUG_ON(in_irq());
-+ cpumask_copy(mask, d->dirty_cpumask);
-+
-+ /* Don't flush if the timestamp is old enough */
-+ tlbflush_filter(mask, page->tlbflush_timestamp);
-+
-+ if ( unlikely(!cpumask_empty(mask)) &&
-+ /* Shadow mode: track only writable pages. */
-+ (!shadow_mode_enabled(d) ||
-+ ((nx & PGT_type_mask) == PGT_writable_page)) )
-+ {
-+ perfc_incr(need_flush_tlb_flush);
-+ /*
-+ * If page was a page table make sure the flush is
-+ * performed using an IPI in order to avoid changing the
-+ * type of a page table page under the feet of
-+ * spurious_page_fault().
-+ */
-+ flush_mask(mask,
-+ (x & PGT_type_mask) &&
-+ (x & PGT_type_mask) <= PGT_root_page_table
-+ ? FLUSH_TLB | FLUSH_FORCE_IPI
-+ : FLUSH_TLB);
-+ }
-+ }
-+ }
-+
- if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) !=
- (type == PGT_writable_page)) )
- {
-@@ -3091,13 +3097,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(!(nx & PGT_validated)) )
- {
-- if ( !(x & PGT_partial) )
-+ /*
-+ * No special validation needed for writable or shared pages. Page
-+ * tables and GDT/LDT need to have their contents audited.
-+ *
-+ * per validate_page(), non-atomic updates are fine here.
-+ */
-+ if ( type == PGT_writable_page || type == PGT_shared_page )
-+ page->u.inuse.type_info |= PGT_validated;
-+ else
- {
-- page->nr_validated_ptes = 0;
-- page->partial_flags = 0;
-- page->linear_pt_count = 0;
-+ if ( !(x & PGT_partial) )
-+ {
-+ page->nr_validated_ptes = 0;
-+ page->partial_flags = 0;
-+ page->linear_pt_count = 0;
-+ }
-+
-+ rc = validate_page(page, type, preemptible);
- }
-- rc = validate_page(page, type, preemptible);
- }
-
- out:
---
-2.35.1
-
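The race fixed by the patch removed above is the classic check-then-act flaw: a successful cmpxchg() proves the word matches *now*, not that it never changed since it was read (A -> B -> A). The remedy is to take the reference first and perform the safety actions only once it is globally visible. A hedged C11 sketch of that ordering, with illustrative names:

```c
/*
 * C11 sketch (not Xen code) of the ordering fix described above.
 * Safety actions must follow the cmpxchg, once the reference is
 * globally visible; doing them before the exchange races with other
 * CPUs taking and dropping references in the meantime.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_ulong type_info;

bool take_typeref(unsigned long x, unsigned long nx)
{
    /*
     * UNSAFE here: flushing stale TLB entries before the cmpxchg races
     * with another CPU taking and dropping a writeable mapping in the
     * window between the read of 'x' and the exchange.
     */
    if ( !atomic_compare_exchange_strong(&type_info, &x, nx) )
        return false;     /* lost the race: caller re-reads and retries */

    /*
     * SAFE from this point on: the typeref is visible, so no new
     * mapping of the old type can appear while we flush and validate.
     */
    /* flush_stale_tlbs(); validate(); */
    return true;
}
```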
diff --git a/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch b/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch
deleted file mode 100644
index 409b72f..0000000
--- a/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch
+++ /dev/null
@@ -1,53 +0,0 @@
-From 9cfd796ae05421ded8e4f70b2c55352491cfa841 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:27:53 +0200
-Subject: [PATCH 24/51] x86/page: Introduce _PAGE_* constants for memory types
-
-... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
-constants. These are going to be needed by forthcoming logic.
-
-No functional change.
-
-This is part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 1be8707c75bf4ba68447c74e1618b521dd432499
-master date: 2022-06-09 14:21:38 +0200
----
- xen/include/asm-x86/page.h | 12 ++++++++++--
- 1 file changed, 10 insertions(+), 2 deletions(-)
-
-diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
-index 1d080cffbe84..2e542050f65a 100644
---- a/xen/include/asm-x86/page.h
-+++ b/xen/include/asm-x86/page.h
-@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
-
- #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
-
-+/* Memory types, encoded under Xen's choice of MSR_PAT. */
-+#define _PAGE_WB ( 0)
-+#define _PAGE_WT ( _PAGE_PWT)
-+#define _PAGE_UCM ( _PAGE_PCD )
-+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT)
-+#define _PAGE_WC (_PAGE_PAT )
-+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT)
-+
- /*
- * Debug option: Ensure that granted mappings are not implicitly unmapped.
- * WARNING: This will need to be disabled to run OSes that use the spare PTE
-@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
- #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
- #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
- _PAGE_DIRTY | _PAGE_RW)
--#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
--#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
-+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
-+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC)
- #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
-
- #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
---
-2.35.1
-
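For reference, the `_PAGE_*` constants in the patch removed above are names for the three PTE bits PWT (bit 3), PCD (bit 4) and PAT (bit 7), which together index MSR_PAT; the memory-type names only hold under Xen's particular PAT programming. A small sketch decoding the index (the table reflects that programming only as far as the patch shows it):

```c
/*
 * Sketch (not Xen code) of what the _PAGE_* constants above encode:
 * PWT, PCD and PAT form a 3-bit index into MSR_PAT.
 */
#include <stdio.h>

#define _PAGE_PWT (1u << 3)
#define _PAGE_PCD (1u << 4)
#define _PAGE_PAT (1u << 7)

static const char *memtype(unsigned int l1f)
{
    unsigned int idx = !!(l1f & _PAGE_PWT)
                       | !!(l1f & _PAGE_PCD) << 1
                       | !!(l1f & _PAGE_PAT) << 2;
    /* Entries 0-5 match the patch; 6 and 7 are not shown there. */
    static const char *const names[8] =
        { "WB", "WT", "UC-", "UC", "WC", "WP", "?", "?" };

    return names[idx];
}

int main(void)
{
    printf("PAT|PWT -> %s\n", memtype(_PAGE_PAT | _PAGE_PWT)); /* WP  */
    printf("PCD     -> %s\n", memtype(_PAGE_PCD));             /* UC- */
    return 0;
}
```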
diff --git a/0024-xen-gnttab-fix-gnttab_acquire_resource.patch b/0024-xen-gnttab-fix-gnttab_acquire_resource.patch
new file mode 100644
index 0000000..898503f
--- /dev/null
+++ b/0024-xen-gnttab-fix-gnttab_acquire_resource.patch
@@ -0,0 +1,69 @@
+From b9560762392c01b3ee84148c07be8017cb42dbc9 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:01:22 +0200
+Subject: [PATCH 24/26] xen/gnttab: fix gnttab_acquire_resource()
+
+Commit 9dc46386d89d ("gnttab: work around "may be used uninitialized"
+warning") was wrong, as vaddrs can legitimately be NULL in case
+XENMEM_resource_grant_table_id_status was specified for a grant table
+v1. This would result in crashes in debug builds due to
+ASSERT_UNREACHABLE() triggering.
+
+Only check vaddrs for being NULL in the rc == 0 case.
+
+Expand the tests in tools/tests/resource to tickle this path, and verify that
+using XENMEM_resource_grant_table_id_status on a v1 grant table fails.
+
+Fixes: 9dc46386d89d ("gnttab: work around "may be used uninitialized" warning")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com> # xen
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 52daa6a8483e4fbd6757c9d1b791e23931791608
+master date: 2022-09-09 16:28:38 +0100
+---
+ tools/tests/resource/test-resource.c | 15 +++++++++++++++
+ xen/common/grant_table.c | 2 +-
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c
+index 0557f8a1b585..37dfff4dcd20 100644
+--- a/tools/tests/resource/test-resource.c
++++ b/tools/tests/resource/test-resource.c
+@@ -106,6 +106,21 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames,
+ if ( rc )
+ return fail(" Fail: Unmap grant table %d - %s\n",
+ errno, strerror(errno));
++
++ /*
++ * Verify that an attempt to map the status frames fails, as the domain is
++ * in gnttab v1 mode.
++ */
++ res = xenforeignmemory_map_resource(
++ fh, domid, XENMEM_resource_grant_table,
++ XENMEM_resource_grant_table_id_status, 0, 1,
++ (void **)&gnttab, PROT_READ | PROT_WRITE, 0);
++
++ if ( res )
++ {
++ fail(" Fail: Managed to map gnttab v2 status frames in v1 mode\n");
++ xenforeignmemory_unmap_resource(fh, res);
++ }
+ }
+
+ static void test_domain_configurations(void)
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index d8ca645b96ff..76272b3c8add 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -4142,7 +4142,7 @@ int gnttab_acquire_resource(
+ * on non-error paths, and hence it needs setting to NULL at the top of the
+ * function. Leave some runtime safety.
+ */
+- if ( !vaddrs )
++ if ( !rc && !vaddrs )
+ {
+ ASSERT_UNREACHABLE();
+ rc = -ENODATA;
+--
+2.37.3
+
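The gnttab fix above illustrates a general pattern: an out-parameter is only meaningful when the callee reported success, so a defensive NULL check must be gated on rc == 0 or it misfires on legitimate error paths. A minimal sketch with hypothetical names:

```c
/*
 * Sketch (hypothetical names, not Xen code) of the pattern behind the
 * fix above: only validate outputs on the success path.
 */
#include <assert.h>
#include <errno.h>
#include <stddef.h>

static int acquire(int want_status, void **vaddrs)
{
    if ( want_status )          /* e.g. v2 status frames on a v1 table */
        return -EINVAL;         /* error path: *vaddrs stays NULL */
    *vaddrs = (void *)0x1000;   /* success path fills the output */
    return 0;
}

int main(void)
{
    void *vaddrs = NULL;
    int rc = acquire(1, &vaddrs);

    if ( !rc && !vaddrs )       /* was: if ( !vaddrs ), firing on errors */
        rc = -ENODATA;

    assert(rc == -EINVAL);      /* no bogus -ENODATA any more */
    return 0;
}
```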
diff --git a/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch b/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch
deleted file mode 100644
index 0a24a0a..0000000
--- a/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch
+++ /dev/null
@@ -1,223 +0,0 @@
-From 74193f4292d9cfc2874866e941d9939d8f33fcef Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:28:23 +0200
-Subject: [PATCH 25/51] x86: Don't change the cacheability of the directmap
-
-Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
-in response to guest mapping requests") attempted to keep the cacheability
-consistent between different mappings of the same page.
-
-The reason wasn't described in the changelog, but it is understood to be in
-regards to a concern over machine check exceptions, owing to errata when using
-mixed cacheabilities. It did this primarily by updating Xen's mapping of the
-page in the direct map when the guest mapped a page with reduced cacheability.
-
-Unfortunately, the logic didn't actually prevent mixed cacheability from
-occurring:
- * A guest could map a page normally, and then map the same page with
- different cacheability; nothing prevented this.
- * The cacheability of the directmap was always latest-takes-precedence in
- terms of guest requests.
- * Grant-mapped frames with lesser cacheability didn't adjust the page's
- cacheattr settings.
- * The map_domain_page() function still unconditionally created WB mappings,
- irrespective of the page's cacheattr settings.
-
-Additionally, update_xen_mappings() had a bug where the alias calculation was
-wrong for mfn's which were .init content, which should have been treated as
-fully guest pages, not Xen pages.
-
-Worse yet, the logic introduced a vulnerability whereby necessary
-pagetable/segdesc adjustments made by Xen in the validation logic could become
-non-coherent between the cache and main memory. The CPU could subsequently
-operate on the stale value in the cache, rather than the safe value in main
-memory.
-
-The directmap contains primarily mappings of RAM. PAT/MTRR conflict
-resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
-cacheability resolves to being coherent. The special case is WC mappings,
-which are non-coherent against MTRR=WB regions (except for fully-coherent
-CPUs).
-
-Xen must not have any WC cacheability in the directmap, to prevent Xen's
-actions from creating non-coherency. (Guest actions creating non-coherency
-are dealt with in subsequent patches.) As all memory types for MTRR=WB
-ranges inter-operate coherently, leave Xen's directmap mappings as WB.
-
-Only PV guests with access to devices can use reduced-cacheability mappings to
-begin with, and they're trusted not to mount DoSs against the system anyway.
-
-Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
-Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
-count. Retain the check in get_page_from_l1e() for special_pages() because a
-guest has no business using reduced cacheability on these.
-
-This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0
-
-This is CVE-2022-26363, part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-master commit: ae09597da34aee6bc5b76475c5eea6994457e854
-master date: 2022-06-09 14:22:08 +0200
----
- xen/arch/x86/mm.c | 84 ++++------------------------------------
- xen/include/asm-x86/mm.h | 23 +++++------
- 2 files changed, 17 insertions(+), 90 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index c6429b0f749a..ab32d13a1a0d 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn)
- return (page_get_owner(page) == dom_io);
- }
-
--static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
--{
-- int err = 0;
-- bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
-- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
-- unsigned long xen_va =
-- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
--
-- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) )
-- return 0;
--
-- if ( unlikely(alias) && cacheattr )
-- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
-- if ( !err )
-- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
-- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
-- if ( unlikely(alias) && !cacheattr && !err )
-- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
--
-- return err;
--}
--
- #ifndef NDEBUG
- struct mmio_emul_range_ctxt {
- const struct domain *d;
-@@ -1009,47 +987,14 @@ get_page_from_l1e(
- goto could_not_pin;
- }
-
-- if ( pte_flags_to_cacheattr(l1f) !=
-- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
-+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) )
- {
-- unsigned long x, nx, y = page->count_info;
-- unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
-- int err;
--
-- if ( is_special_page(page) )
-- {
-- if ( write )
-- put_page_type(page);
-- put_page(page);
-- gdprintk(XENLOG_WARNING,
-- "Attempt to change cache attributes of Xen heap page\n");
-- return -EACCES;
-- }
--
-- do {
-- x = y;
-- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
-- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
--
-- err = update_xen_mappings(mfn, cacheattr);
-- if ( unlikely(err) )
-- {
-- cacheattr = y & PGC_cacheattr_mask;
-- do {
-- x = y;
-- nx = (x & ~PGC_cacheattr_mask) | cacheattr;
-- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
--
-- if ( write )
-- put_page_type(page);
-- put_page(page);
--
-- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
-- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
-- mfn, get_gpfn_from_mfn(mfn),
-- l1e_get_intpte(l1e), l1e_owner->domain_id);
-- return err;
-- }
-+ if ( write )
-+ put_page_type(page);
-+ put_page(page);
-+ gdprintk(XENLOG_WARNING,
-+ "Attempt to change cache attributes of Xen heap page\n");
-+ return -EACCES;
- }
-
- return 0;
-@@ -2467,24 +2412,9 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
- */
- static int cleanup_page_mappings(struct page_info *page)
- {
-- unsigned int cacheattr =
-- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
- int rc = 0;
- unsigned long mfn = mfn_x(page_to_mfn(page));
-
-- /*
-- * If we've modified xen mappings as a result of guest cache
-- * attributes, restore them to the "normal" state.
-- */
-- if ( unlikely(cacheattr) )
-- {
-- page->count_info &= ~PGC_cacheattr_mask;
--
-- BUG_ON(is_special_page(page));
--
-- rc = update_xen_mappings(mfn, 0);
-- }
--
- /*
- * If this may be in a PV domain's IOMMU, remove it.
- *
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index cb9052749963..8a9a43bb0a9d 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -69,25 +69,22 @@
- /* Set when is using a page as a page table */
- #define _PGC_page_table PG_shift(3)
- #define PGC_page_table PG_mask(1, 3)
-- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
--#define PGC_cacheattr_base PG_shift(6)
--#define PGC_cacheattr_mask PG_mask(7, 6)
- /* Page is broken? */
--#define _PGC_broken PG_shift(7)
--#define PGC_broken PG_mask(1, 7)
-+#define _PGC_broken PG_shift(4)
-+#define PGC_broken PG_mask(1, 4)
- /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
--#define PGC_state PG_mask(3, 9)
--#define PGC_state_inuse PG_mask(0, 9)
--#define PGC_state_offlining PG_mask(1, 9)
--#define PGC_state_offlined PG_mask(2, 9)
--#define PGC_state_free PG_mask(3, 9)
-+#define PGC_state PG_mask(3, 6)
-+#define PGC_state_inuse PG_mask(0, 6)
-+#define PGC_state_offlining PG_mask(1, 6)
-+#define PGC_state_offlined PG_mask(2, 6)
-+#define PGC_state_free PG_mask(3, 6)
- #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
- /* Page is not reference counted (see below for caveats) */
--#define _PGC_extra PG_shift(10)
--#define PGC_extra PG_mask(1, 10)
-+#define _PGC_extra PG_shift(7)
-+#define PGC_extra PG_mask(1, 7)
-
- /* Count of references to this frame. */
--#define PGC_count_width PG_shift(10)
-+#define PGC_count_width PG_shift(7)
- #define PGC_count_mask ((1UL<<PGC_count_width)-1)
-
- /*
---
-2.35.1
-
diff --git a/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch b/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch
new file mode 100644
index 0000000..849ef60
--- /dev/null
+++ b/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch
@@ -0,0 +1,59 @@
+From 3f4da85ca8816f6617529c80850eaddd80ea0f1f Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 15:01:36 +0200
+Subject: [PATCH 25/26] x86: wire up VCPUOP_register_vcpu_time_memory_area for
+ 32-bit guests
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Forever since its introduction, VCPUOP_register_vcpu_time_memory_area
+was available only to native domains. Linux, for example, would attempt
+to use it irrespective of guest bitness (including in its so-called
+PVHVM mode) as long as it finds XEN_PVCLOCK_TSC_STABLE_BIT set (which we
+set only for clocksource=tsc, which in turn needs engaging via command
+line option).
+
+Fixes: a5d39947cb89 ("Allow guests to register secondary vcpu_time_info")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: b726541d94bd0a80b5864d17a2cd2e6d73a3fe0a
+master date: 2022-09-29 14:47:45 +0200
+---
+ xen/arch/x86/x86_64/domain.c | 20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c
+index c46dccc25a54..d51d99344796 100644
+--- a/xen/arch/x86/x86_64/domain.c
++++ b/xen/arch/x86/x86_64/domain.c
+@@ -54,6 +54,26 @@ arch_compat_vcpu_op(
+ break;
+ }
+
++ case VCPUOP_register_vcpu_time_memory_area:
++ {
++ struct compat_vcpu_register_time_memory_area area = { .addr.p = 0 };
++
++ rc = -EFAULT;
++ if ( copy_from_guest(&area.addr.h, arg, 1) )
++ break;
++
++ if ( area.addr.h.c != area.addr.p ||
++ !compat_handle_okay(area.addr.h, 1) )
++ break;
++
++ rc = 0;
++ guest_from_compat_handle(v->arch.time_info_guest, area.addr.h);
++
++ force_update_vcpu_system_time(v);
++
++ break;
++ }
++
+ case VCPUOP_get_physid:
+ rc = arch_do_vcpu_op(cmd, v, arg);
+ break;
+--
+2.37.3
+
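The `area.addr.h.c != area.addr.p` check above verifies that the guest-supplied address round-trips through the 32-bit compat representation, i.e. that the upper half of the 64-bit view is zero. A rough standalone sketch of the same check (illustrative layout, not Xen's actual compat machinery; assumes little-endian, as on x86):

```c
/*
 * Standalone sketch (illustrative layout) of the compat round-trip
 * check in the hunk above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

union compat_addr {
    uint32_t c;   /* what the 32-bit guest actually provided */
    uint64_t p;   /* the full-width view the hypervisor uses */
};

static bool compat_addr_ok(union compat_addr addr)
{
    return addr.c == addr.p;   /* high 32 bits must be clear */
}

int main(void)
{
    union compat_addr ok  = { .p = 0x0000000080001000ull };
    union compat_addr bad = { .p = 0x0000000180001000ull };

    printf("ok:  %d\n", compat_addr_ok(ok));    /* 1 */
    printf("bad: %d\n", compat_addr_ok(bad));   /* 0 */
    return 0;
}
```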
diff --git a/0026-x86-Split-cache_flush-out-of-cache_writeback.patch b/0026-x86-Split-cache_flush-out-of-cache_writeback.patch
deleted file mode 100644
index 50f70f4..0000000
--- a/0026-x86-Split-cache_flush-out-of-cache_writeback.patch
+++ /dev/null
@@ -1,294 +0,0 @@
-From 8eafa2d871ae51d461256e4a14175e24df330c70 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:28:48 +0200
-Subject: [PATCH 26/51] x86: Split cache_flush() out of cache_writeback()
-
-Subsequent changes will want a fully flushing version.
-
-Use the new helper rather than opencoding it in flush_area_local(). This
-resolves an outstanding issue where the conditional sfence is on the wrong
-side of the clflushopt loop. clflushopt is ordered with respect to older
-stores, not to younger stores.
-
-Rename gnttab_cache_flush()'s helper to avoid colliding in name.
-grant_table.c can see the prototype from cache.h so the build fails
-otherwise.
-
-This is part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 9a67ffee3371506e1cbfdfff5b90658d4828f6a2
-master date: 2022-06-09 14:22:38 +0200
----
- xen/arch/x86/flushtlb.c | 84 ++++++++++++++++++++++++---
- xen/common/grant_table.c | 4 +-
- xen/drivers/passthrough/vtd/extern.h | 1 -
- xen/drivers/passthrough/vtd/iommu.c | 53 +----------------
- xen/drivers/passthrough/vtd/x86/vtd.c | 5 --
- xen/include/asm-x86/cache.h | 7 +++
- 6 files changed, 88 insertions(+), 66 deletions(-)
-
-diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
-index 25798df50f54..0c912b8669f8 100644
---- a/xen/arch/x86/flushtlb.c
-+++ b/xen/arch/x86/flushtlb.c
-@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- if ( flags & FLUSH_CACHE )
- {
- const struct cpuinfo_x86 *c = &current_cpu_data;
-- unsigned long i, sz = 0;
-+ unsigned long sz = 0;
-
- if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
- sz = 1UL << (order + PAGE_SHIFT);
-@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- c->x86_clflush_size && c->x86_cache_size && sz &&
- ((sz >> 10) < c->x86_cache_size) )
- {
-- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-- for ( i = 0; i < sz; i += c->x86_clflush_size )
-- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
-- " clflush %0",
-- "data16 clflush %0", /* clflushopt */
-- X86_FEATURE_CLFLUSHOPT,
-- "m" (((const char *)va)[i]));
-+ cache_flush(va, sz);
- flags &= ~FLUSH_CACHE;
- }
- else
-@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- return flags;
- }
-
-+void cache_flush(const void *addr, unsigned int size)
-+{
-+ /*
-+ * This function may be called before current_cpu_data is established.
-+ * Hence a fallback is needed to prevent the loop below becoming infinite.
-+ */
-+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
-+ const void *end = addr + size;
-+
-+ addr -= (unsigned long)addr & (clflush_size - 1);
-+ for ( ; addr < end; addr += clflush_size )
-+ {
-+ /*
-+ * Note regarding the "ds" prefix use: it's faster to do a clflush
-+ * + prefix than a clflush + nop, and hence the prefix is added instead
-+ * of letting the alternative framework fill the gap by appending nops.
-+ */
-+ alternative_io("ds; clflush %[p]",
-+ "data16 clflush %[p]", /* clflushopt */
-+ X86_FEATURE_CLFLUSHOPT,
-+ /* no outputs */,
-+ [p] "m" (*(const char *)(addr)));
-+ }
-+
-+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-+}
-+
-+void cache_writeback(const void *addr, unsigned int size)
-+{
-+ unsigned int clflush_size;
-+ const void *end = addr + size;
-+
-+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
-+ if ( !boot_cpu_has(X86_FEATURE_CLWB) )
-+ return cache_flush(addr, size);
-+
-+ /*
-+ * This function may be called before current_cpu_data is established.
-+ * Hence a fallback is needed to prevent the loop below becoming infinite.
-+ */
-+ clflush_size = current_cpu_data.x86_clflush_size ?: 16;
-+ addr -= (unsigned long)addr & (clflush_size - 1);
-+ for ( ; addr < end; addr += clflush_size )
-+ {
-+/*
-+ * The arguments to a macro must not include preprocessor directives. Doing so
-+ * results in undefined behavior, so we have to create some defines here in
-+ * order to avoid it.
-+ */
-+#if defined(HAVE_AS_CLWB)
-+# define CLWB_ENCODING "clwb %[p]"
-+#elif defined(HAVE_AS_XSAVEOPT)
-+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
-+#else
-+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
-+#endif
-+
-+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
-+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
-+# define INPUT BASE_INPUT
-+#else
-+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
-+#endif
-+
-+ asm volatile (CLWB_ENCODING :: INPUT(addr));
-+
-+#undef INPUT
-+#undef BASE_INPUT
-+#undef CLWB_ENCODING
-+ }
-+
-+ asm volatile ("sfence" ::: "memory");
-+}
-+
- unsigned int guest_flush_tlb_flags(const struct domain *d)
- {
- bool shadow = paging_mode_shadow(d);
-diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
-index 66f8ce71741c..4c742cd8fe81 100644
---- a/xen/common/grant_table.c
-+++ b/xen/common/grant_table.c
-@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
- return 0;
- }
-
--static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
-+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
- {
- struct domain *d, *owner;
- struct page_info *page;
-@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
- return -EFAULT;
- for ( ; ; )
- {
-- int ret = cache_flush(&op, cur_ref);
-+ int ret = _cache_flush(&op, cur_ref);
-
- if ( ret < 0 )
- return ret;
-diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
-index 01e010a10d61..401079299725 100644
---- a/xen/drivers/passthrough/vtd/extern.h
-+++ b/xen/drivers/passthrough/vtd/extern.h
-@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
- struct pci_dev *pdev,
- u16 did, u16 size, u64 addr);
-
--unsigned int get_cache_line_size(void);
- void flush_all_cache(void);
-
- uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
-index 8975c1de61bc..bc377c9bcfa4 100644
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -31,6 +31,7 @@
- #include <xen/pci.h>
- #include <xen/pci_regs.h>
- #include <xen/keyhandler.h>
-+#include <asm/cache.h>
- #include <asm/msi.h>
- #include <asm/nops.h>
- #include <asm/irq.h>
-@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d,
- }
- }
-
--static void sync_cache(const void *addr, unsigned int size)
--{
-- static unsigned long clflush_size = 0;
-- const void *end = addr + size;
--
-- if ( clflush_size == 0 )
-- clflush_size = get_cache_line_size();
--
-- addr -= (unsigned long)addr & (clflush_size - 1);
-- for ( ; addr < end; addr += clflush_size )
--/*
-- * The arguments to a macro must not include preprocessor directives. Doing so
-- * results in undefined behavior, so we have to create some defines here in
-- * order to avoid it.
-- */
--#if defined(HAVE_AS_CLWB)
--# define CLWB_ENCODING "clwb %[p]"
--#elif defined(HAVE_AS_XSAVEOPT)
--# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
--#else
--# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
--#endif
--
--#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
--#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
--# define INPUT BASE_INPUT
--#else
--# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
--#endif
-- /*
-- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
-- * + prefix than a clflush + nop, and hence the prefix is added instead
-- * of letting the alternative framework fill the gap by appending nops.
-- */
-- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
-- "data16 clflush %[p]", /* clflushopt */
-- X86_FEATURE_CLFLUSHOPT,
-- CLWB_ENCODING,
-- X86_FEATURE_CLWB, /* no outputs */,
-- INPUT(addr));
--#undef INPUT
--#undef BASE_INPUT
--#undef CLWB_ENCODING
--
-- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
-- "sfence", X86_FEATURE_CLWB);
--}
--
- /* Allocate page table, return its machine address */
- uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
- {
-@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
- clear_page(vaddr);
-
- if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache )
-- sync_cache(vaddr, PAGE_SIZE);
-+ cache_writeback(vaddr, PAGE_SIZE);
- unmap_domain_page(vaddr);
- cur_pg++;
- }
-@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
- iommu->nr_pt_levels = agaw_to_level(agaw);
-
- if ( !ecap_coherent(iommu->ecap) )
-- vtd_ops.sync_cache = sync_cache;
-+ vtd_ops.sync_cache = cache_writeback;
-
- /* allocate domain id bitmap */
- iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
-diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
-index 6681dccd6970..55f0faa521cb 100644
---- a/xen/drivers/passthrough/vtd/x86/vtd.c
-+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
-@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va)
- unmap_domain_page(va);
- }
-
--unsigned int get_cache_line_size(void)
--{
-- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
--}
--
- void flush_all_cache()
- {
- wbinvd();
-diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
-index 1f7173d8c72c..e4770efb22b9 100644
---- a/xen/include/asm-x86/cache.h
-+++ b/xen/include/asm-x86/cache.h
-@@ -11,4 +11,11 @@
-
- #define __read_mostly __section(".data.read_mostly")
-
-+#ifndef __ASSEMBLY__
-+
-+void cache_flush(const void *addr, unsigned int size);
-+void cache_writeback(const void *addr, unsigned int size);
-+
-+#endif
-+
- #endif
---
-2.35.1
-
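The cache_flush() introduced by the patch removed above has a shape worth spelling out: align the start address down to a cache line, issue one flush per line, then fence (the SFENCE is only needed for the CLFLUSHOPT alternative, since plain CLFLUSH is ordered more strongly). A simplified sketch using SSE2 intrinsics instead of the alternatives framework, with a hardcoded line size:

```c
/*
 * Simplified sketch (not Xen code) of the cache_flush() loop shape in
 * the removed patch.  SSE2 intrinsics stand in for the alternatives
 * framework, and the line size is hardcoded instead of being read
 * from CPUID.
 */
#include <stdint.h>
#include <emmintrin.h>          /* _mm_clflush, _mm_sfence */

void flush_range(const void *addr, unsigned int size)
{
    unsigned int line = 64;     /* x86_clflush_size on current CPUs */
    const char *p = addr;
    const char *end = p + size;

    p -= (uintptr_t)p & (line - 1);   /* align down to a line boundary */
    for ( ; p < end; p += line )
        _mm_clflush(p);

    /*
     * The original emits SFENCE only when the CLFLUSHOPT alternative
     * is patched in; unconditional here for simplicity.
     */
    _mm_sfence();
}
```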
diff --git a/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch b/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch
new file mode 100644
index 0000000..0f33747
--- /dev/null
+++ b/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch
@@ -0,0 +1,97 @@
+From 1bce7fb1f702da4f7a749c6f1457ecb20bf74fca Mon Sep 17 00:00:00 2001
+From: Tamas K Lengyel <tamas.lengyel@intel.com>
+Date: Tue, 11 Oct 2022 15:01:48 +0200
+Subject: [PATCH 26/26] x86/vpmu: Fix race-condition in vpmu_load
+
+The vPMU code base attempts to perform an optimization on saving/reloading the
+PMU context by keeping track of what vCPU ran on each pCPU. When a pCPU is
+getting scheduled, the code checks whether the previous vCPU is the current
+one and, if it isn't, attempts a call to vpmu_save_force. Unfortunately, if
+the previous vCPU is already being scheduled to run on another pCPU, its
+state will already be runnable, which results in an ASSERT failure.
+
+Fix this by always performing a PMU context save in vpmu_save when called from
+vpmu_switch_from, and doing a vpmu_load when called from vpmu_switch_to.
+
+While this presents a minimal overhead in case the same vCPU is getting
+rescheduled on the same pCPU, the ASSERT failure is avoided and the code is a
+lot easier to reason about.
+
+Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: defa4e51d20a143bdd4395a075bf0933bb38a9a4
+master date: 2022-09-30 09:53:49 +0200
+---
+ xen/arch/x86/cpu/vpmu.c | 42 ++++-------------------------------------
+ 1 file changed, 4 insertions(+), 38 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/vpmu.c b/xen/arch/x86/cpu/vpmu.c
+index 16e91a3694fe..b6c2ec3cd047 100644
+--- a/xen/arch/x86/cpu/vpmu.c
++++ b/xen/arch/x86/cpu/vpmu.c
+@@ -368,58 +368,24 @@ void vpmu_save(struct vcpu *v)
+ vpmu->last_pcpu = pcpu;
+ per_cpu(last_vcpu, pcpu) = v;
+
++ vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
++
+ if ( vpmu->arch_vpmu_ops )
+ if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0) )
+ vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+
++ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE);
++
+ apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
+ }
+
+ int vpmu_load(struct vcpu *v, bool_t from_guest)
+ {
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+- int pcpu = smp_processor_id();
+- struct vcpu *prev = NULL;
+
+ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
+ return 0;
+
+- /* First time this VCPU is running here */
+- if ( vpmu->last_pcpu != pcpu )
+- {
+- /*
+- * Get the context from last pcpu that we ran on. Note that if another
+- * VCPU is running there it must have saved this VPCU's context before
+- * startig to run (see below).
+- * There should be no race since remote pcpu will disable interrupts
+- * before saving the context.
+- */
+- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+- {
+- on_selected_cpus(cpumask_of(vpmu->last_pcpu),
+- vpmu_save_force, (void *)v, 1);
+- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+- }
+- }
+-
+- /* Prevent forced context save from remote CPU */
+- local_irq_disable();
+-
+- prev = per_cpu(last_vcpu, pcpu);
+-
+- if ( prev != v && prev )
+- {
+- vpmu = vcpu_vpmu(prev);
+-
+- /* Someone ran here before us */
+- vpmu_save_force(prev);
+- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+-
+- vpmu = vcpu_vpmu(v);
+- }
+-
+- local_irq_enable();
+-
+ /* Only when PMU is counting, we load PMU context immediately. */
+ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) ||
+ (!has_vlapic(vpmu_vcpu(vpmu)->domain) &&
+--
+2.37.3
+
diff --git a/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch b/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch
deleted file mode 100644
index 060bc99..0000000
--- a/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch
+++ /dev/null
@@ -1,95 +0,0 @@
-From c4815be949aae6583a9a22897beb96b095b4f1a2 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:29:13 +0200
-Subject: [PATCH 27/51] x86/amd: Work around CLFLUSH ordering on older parts
-
-On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
-including reads and writes to the address, and LFENCE/SFENCE instructions.
-
-This creates a multitude of problematic corner cases, laid out in the manual.
-Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
-
-This is part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 062868a5a8b428b85db589fa9a6d6e43969ffeb9
-master date: 2022-06-09 14:23:07 +0200
----
- xen/arch/x86/cpu/amd.c | 8 ++++++++
- xen/arch/x86/flushtlb.c | 13 ++++++++++++-
- xen/include/asm-x86/cpufeatures.h | 1 +
- 3 files changed, 21 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
-index a8e37dbb1f5c..b3b9a0df5fed 100644
---- a/xen/arch/x86/cpu/amd.c
-+++ b/xen/arch/x86/cpu/amd.c
-@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c)
- if (!cpu_has_lfence_dispatch)
- __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
-
-+ /*
-+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
-+ * everything, including reads and writes to address, and
-+ * LFENCE/SFENCE instructions.
-+ */
-+ if (!cpu_has_clflushopt)
-+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
-+
- switch(c->x86)
- {
- case 0xf ... 0x11:
-diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
-index 0c912b8669f8..dcbb4064012e 100644
---- a/xen/arch/x86/flushtlb.c
-+++ b/xen/arch/x86/flushtlb.c
-@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- return flags;
- }
-
-+/*
-+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
-+ * including reads and writes to address, and LFENCE/SFENCE instructions.
-+ *
-+ * This function only works safely after alternatives have run. Luckily, at
-+ * the time of writing, we don't flush the caches that early.
-+ */
- void cache_flush(const void *addr, unsigned int size)
- {
- /*
-@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size)
- unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
- const void *end = addr + size;
-
-+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
-+
- addr -= (unsigned long)addr & (clflush_size - 1);
- for ( ; addr < end; addr += clflush_size )
- {
-@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size)
- [p] "m" (*(const char *)(addr)));
- }
-
-- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-+ alternative_2("",
-+ "sfence", X86_FEATURE_CLFLUSHOPT,
-+ "mfence", X86_BUG_CLFLUSH_MFENCE);
- }
-
- void cache_writeback(const void *addr, unsigned int size)
-diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
-index 7413febd7ad8..ff3157d52d13 100644
---- a/xen/include/asm-x86/cpufeatures.h
-+++ b/xen/include/asm-x86/cpufeatures.h
-@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch
-
- #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
- #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */
-+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
-
- /* Total number of capability words, inc synth and bug words. */
- #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
---
-2.35.1
-
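The AMD workaround removed above changes only the fencing, not the loop: on pre-CLFLUSHOPT AMD parts CLFLUSH is weakly ordered even against accesses to the same line, so the flush loop gets an MFENCE on both sides. A sketch of that bracketing, with a plain bool standing in for the X86_BUG_CLFLUSH_MFENCE alternative the real code patches in at boot:

```c
/*
 * Sketch (not Xen code) of the fencing change above: the loop is as
 * in cache_flush(); only the fences around it differ.
 */
#include <stdbool.h>
#include <stdint.h>
#include <emmintrin.h>          /* _mm_clflush, _mm_mfence */

void fenced_flush(const void *addr, unsigned int size,
                  bool bug_clflush_mfence)
{
    const char *p = addr;
    const char *end = p + size;

    if ( bug_clflush_mfence )
        _mm_mfence();                    /* order vs. older accesses */

    for ( p -= (uintptr_t)p & 63; p < end; p += 64 )
        _mm_clflush(p);

    if ( bug_clflush_mfence )
        _mm_mfence();                    /* order vs. younger accesses */
}
```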
diff --git a/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch b/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
deleted file mode 100644
index af60348..0000000
--- a/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
+++ /dev/null
@@ -1,160 +0,0 @@
-From dc020d8d1ba420e2dd0e7a40f5045db897f3c4f4 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:29:38 +0200
-Subject: [PATCH 28/51] x86/pv: Track and flush non-coherent mappings of RAM
-
-There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
-devices that make non-coherent writes. The Linux sound subsystem makes
-extensive use of this technique.
-
-For such usecases, the guest's DMA buffer is mapped and consistently used as
-WC, and Xen doesn't interact with the buffer.
-
-However, a mischievous guest can use WC mappings to deliberately create
-non-coherency between the cache and RAM, and use this to trick Xen into
-validating a pagetable which isn't actually safe.
-
-Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
-it whenever a non-coherent writeable mapping is created. If the page is used
-as anything other than PGT_writable_page, force a cache flush before
-validation. Also force a cache flush before the page is returned to the heap.
-
-This is CVE-2022-26364, part of XSA-402.
-
-Reported-by: Jann Horn <jannh@google.com>
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: c1c9cae3a9633054b177c5de21ad7268162b2f2c
-master date: 2022-06-09 14:23:37 +0200
----
- xen/arch/x86/mm.c | 38 +++++++++++++++++++++++++++++++++++
- xen/arch/x86/pv/grant_table.c | 21 +++++++++++++++++++
- xen/include/asm-x86/mm.h | 6 +++++-
- 3 files changed, 64 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index ab32d13a1a0d..bab9624fabb7 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -997,6 +997,15 @@ get_page_from_l1e(
- return -EACCES;
- }
-
-+ /*
-+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
-+ * flush later if the target is used as anything but a PGT_writeable page.
-+ * We care about all writeable mappings, including foreign mappings.
-+ */
-+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
-+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
-+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
-+
- return 0;
-
- could_not_pin:
-@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
- }
- }
-
-+ /*
-+ * Flush the cache if there were previously non-coherent writeable
-+ * mappings of this page. This forces the page to be coherent before it
-+ * is freed back to the heap.
-+ */
-+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
-+ {
-+ void *addr = __map_domain_page(page);
-+
-+ cache_flush(addr, PAGE_SIZE);
-+ unmap_domain_page(addr);
-+ }
-+
- return rc;
- }
-
-@@ -3027,6 +3049,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(!(nx & PGT_validated)) )
- {
-+ /*
-+ * Flush the cache if there were previously non-coherent mappings of
-+ * this page, and we're trying to use it as anything other than a
-+ * writeable page. This forces the page to be coherent before we
-+ * validate its contents for safety.
-+ */
-+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
-+ {
-+ void *addr = __map_domain_page(page);
-+
-+ cache_flush(addr, PAGE_SIZE);
-+ unmap_domain_page(addr);
-+
-+ page->u.inuse.type_info &= ~PGT_non_coherent;
-+ }
-+
- /*
- * No special validation needed for writable or shared pages. Page
- * tables and GDT/LDT need to have their contents audited.
-diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
-index 0325618c9883..81c72e61ed55 100644
---- a/xen/arch/x86/pv/grant_table.c
-+++ b/xen/arch/x86/pv/grant_table.c
-@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
-
- ol1e = *pl1e;
- if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
-+ {
-+ /*
-+ * We always create mappings in this path. However, our caller,
-+ * map_grant_ref(), only passes potentially non-zero cache_flags for
-+ * MMIO frames, so this path doesn't create non-coherent mappings of
-+ * RAM frames and there's no need to calculate PGT_non_coherent.
-+ */
-+ ASSERT(!cache_flags || is_iomem_page(frame));
-+
- rc = GNTST_okay;
-+ }
-
- out_unlock:
- page_unlock(page);
-@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
- l1e_get_flags(ol1e), addr, grant_pte_flags);
-
- if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
-+ {
-+ /*
-+ * Generally, replace_grant_pv_mapping() is used to destroy mappings
-+ * (nl1e = l1e_empty()), but it can be a present mapping on the
-+ * GNTABOP_unmap_and_replace path.
-+ *
-+ * In such cases, the PTE is fully transplanted from its old location
-+ * via steal_linear_addr(), so we need not perform PGT_non_coherent
-+ * checking here.
-+ */
- rc = GNTST_okay;
-+ }
-
- out_unlock:
- page_unlock(page);
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index 8a9a43bb0a9d..7464167ae192 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -53,8 +53,12 @@
- #define _PGT_partial PG_shift(8)
- #define PGT_partial PG_mask(1, 8)
-
-+/* Has this page been mapped writeable with a non-coherent memory type? */
-+#define _PGT_non_coherent PG_shift(9)
-+#define PGT_non_coherent PG_mask(1, 9)
-+
- /* Count of uses of this frame as its current type. */
--#define PGT_count_width PG_shift(8)
-+#define PGT_count_width PG_shift(9)
- #define PGT_count_mask ((1UL<<PGT_count_width)-1)
-
- /* Are the 'type mask' bits identical? */
---
-2.35.1
-
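The mechanism the deleted patch carried is a single type-info bit, set when a writeable non-coherent mapping is created and consumed (test-and-clear) at the two points where coherency matters. A minimal self-contained C sketch of that state machine, with a plain unsigned long standing in for page->u.inuse.type_info and the bit position chosen purely for illustration:

    #include <stdbool.h>

    #define PGT_non_coherent (1UL << 9)   /* illustrative position only */

    struct page { unsigned long type_info; };

    /* Called when a writeable, non-coherent (e.g. WC) mapping is created. */
    static void note_non_coherent(struct page *pg)
    {
        pg->type_info |= PGT_non_coherent;
    }

    /*
     * Called before validating the page as anything but a writable page,
     * and before freeing it back to the heap: flush once, clear the marker.
     */
    static bool flush_if_non_coherent(struct page *pg)
    {
        if ( !(pg->type_info & PGT_non_coherent) )
            return false;

        /* In Xen this is __map_domain_page() + cache_flush() + unmap. */
        pg->type_info &= ~PGT_non_coherent;
        return true;
    }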
diff --git a/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch b/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch
deleted file mode 100644
index 90ce4cf..0000000
--- a/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch
+++ /dev/null
@@ -1,37 +0,0 @@
-From 0b4e62847c5af1a59eea8d17093feccd550d1c26 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Fri, 10 Jun 2022 10:28:28 +0200
-Subject: [PATCH 29/51] x86/mm: account for PGT_pae_xen_l2 in recently added
- assertion
-
-While PGT_pae_xen_l2 will be zapped once the type refcount of an L2 page
-reaches zero, it'll be retained as long as the type refcount is non-
-zero. Hence any checking against the requested type needs to either zap
-the bit from the type or include it in the used mask.
-
-Fixes: 9186e96b199e ("x86/pv: Clean up _get_page_type()")
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: c2095ac76be0f4a1940346c9ffb49fb967345060
-master date: 2022-06-10 10:21:06 +0200
----
- xen/arch/x86/mm.c | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index bab9624fabb7..c1b9a3bb102a 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -2928,7 +2928,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- * The page is in one of two states (depending on PGT_partial),
- * and should have exactly one reference.
- */
-- ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
-+ ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) ==
-+ (type | 1));
-
- if ( !(x & PGT_partial) )
- {
---
-2.35.1
-
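The rule behind this one-line fix generalises: whenever a masked value is compared against a requested type, every bit that can legitimately accompany the type must either be stripped from the type or included in the mask. A self-contained C illustration of the buggy and fixed forms, with an invented bit layout (Xen uses its PG_shift()/PG_mask() helpers):

    #include <assert.h>

    #define PGT_type_mask   (0xfUL << 60)  /* illustrative layout only */
    #define PGT_pae_xen_l2  (1UL << 59)
    #define PGT_count_mask  0xffUL

    static void check_type(unsigned long x, unsigned long type)
    {
        /*
         * Buggy form: if 'type' legitimately carries PGT_pae_xen_l2, the
         * mask strips that bit from 'x', so equality can never hold:
         *
         *   assert( (x & (PGT_type_mask | PGT_count_mask)) == (type | 1) );
         *
         * Fixed form: widen the mask, so the bit is compared on both
         * sides instead of being dropped from one of them.
         */
        assert( (x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) ==
                (type | 1) );
    }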
diff --git a/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch b/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch
deleted file mode 100644
index af25b5c..0000000
--- a/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch
+++ /dev/null
@@ -1,258 +0,0 @@
-From 0e80f9f61168d4e4f008da75762cee0118f802ed Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Mon, 13 Jun 2022 16:19:01 +0100
-Subject: [PATCH 30/51] x86/spec-ctrl: Make VERW flushing runtime conditional
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Currently, VERW flushing to mitigate MDS is boot time conditional per domain
-type. However, to provide mitigations for DRPW (CVE-2022-21166), we need to
-conditionally use VERW based on the trustworthiness of the guest, and the
-devices passed through.
-
-Remove the PV/HVM alternatives and instead issue a VERW on the return-to-guest
-path depending on the SCF_verw bit in cpuinfo spec_ctrl_flags.
-
-Introduce spec_ctrl_init_domain() and d->arch.verw to calculate the VERW
-disposition at domain creation time, and context switch the SCF_verw bit.
-
-For now, VERW flushing is used and controlled exactly as before, but later
-patches will add per-domain cases too.
-
-No change in behaviour.
-
-This is part of XSA-404.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-(cherry picked from commit e06b95c1d44ab80da255219fc9f1e2fc423edcb6)
----
- docs/misc/xen-command-line.pandoc | 5 ++---
- xen/arch/x86/domain.c | 12 ++++++++++--
- xen/arch/x86/hvm/vmx/entry.S | 2 +-
- xen/arch/x86/spec_ctrl.c | 30 +++++++++++++++++------------
- xen/include/asm-x86/cpufeatures.h | 3 +--
- xen/include/asm-x86/domain.h | 3 +++
- xen/include/asm-x86/spec_ctrl.h | 2 ++
- xen/include/asm-x86/spec_ctrl_asm.h | 16 +++++++++++++--
- 8 files changed, 51 insertions(+), 22 deletions(-)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index 1d08fb7e9aa6..d5cb09f86541 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2258,9 +2258,8 @@ in place for guests to use.
- Use of a positive boolean value for either of these options is invalid.
-
- The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine
--grained control over the alternative blocks used by Xen. These impact Xen's
--ability to protect itself, and Xen's ability to virtualise support for guests
--to use.
-+grained control over the primitives by Xen. These impact Xen's ability to
-+protect itself, and Xen's ability to virtualise support for guests to use.
-
- * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests
- respectively.
-diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
-index ef1812dc1402..1fe6644a71ae 100644
---- a/xen/arch/x86/domain.c
-+++ b/xen/arch/x86/domain.c
-@@ -863,6 +863,8 @@ int arch_domain_create(struct domain *d,
-
- d->arch.msr_relaxed = config->arch.misc_flags & XEN_X86_MSR_RELAXED;
-
-+ spec_ctrl_init_domain(d);
-+
- return 0;
-
- fail:
-@@ -2017,14 +2019,15 @@ static void __context_switch(void)
- void context_switch(struct vcpu *prev, struct vcpu *next)
- {
- unsigned int cpu = smp_processor_id();
-+ struct cpu_info *info = get_cpu_info();
- const struct domain *prevd = prev->domain, *nextd = next->domain;
- unsigned int dirty_cpu = read_atomic(&next->dirty_cpu);
-
- ASSERT(prev != next);
- ASSERT(local_irq_is_enabled());
-
-- get_cpu_info()->use_pv_cr3 = false;
-- get_cpu_info()->xen_cr3 = 0;
-+ info->use_pv_cr3 = false;
-+ info->xen_cr3 = 0;
-
- if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN )
- {
-@@ -2088,6 +2091,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
- *last_id = next_id;
- }
- }
-+
-+ /* Update the top-of-stack block with the VERW disposition. */
-+ info->spec_ctrl_flags &= ~SCF_verw;
-+ if ( nextd->arch.verw )
-+ info->spec_ctrl_flags |= SCF_verw;
- }
-
- sched_context_switched(prev, next);
-diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
-index 49651f3c435a..5f5de45a1309 100644
---- a/xen/arch/x86/hvm/vmx/entry.S
-+++ b/xen/arch/x86/hvm/vmx/entry.S
-@@ -87,7 +87,7 @@ UNLIKELY_END(realmode)
-
- /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
- /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */
-- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM
-+ DO_SPEC_CTRL_COND_VERW
-
- mov VCPU_hvm_guest_cr2(%rbx),%rax
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index c19464da70ce..21730aa03071 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -36,8 +36,8 @@ static bool __initdata opt_msr_sc_pv = true;
- static bool __initdata opt_msr_sc_hvm = true;
- static int8_t __initdata opt_rsb_pv = -1;
- static bool __initdata opt_rsb_hvm = true;
--static int8_t __initdata opt_md_clear_pv = -1;
--static int8_t __initdata opt_md_clear_hvm = -1;
-+static int8_t __read_mostly opt_md_clear_pv = -1;
-+static int8_t __read_mostly opt_md_clear_hvm = -1;
-
- /* Cmdline controls for Xen's speculative settings. */
- static enum ind_thunk {
-@@ -932,6 +932,13 @@ static __init void mds_calculations(uint64_t caps)
- }
- }
-
-+void spec_ctrl_init_domain(struct domain *d)
-+{
-+ bool pv = is_pv_domain(d);
-+
-+ d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
-+}
-+
- void __init init_speculation_mitigations(void)
- {
- enum ind_thunk thunk = THUNK_DEFAULT;
-@@ -1196,21 +1203,20 @@ void __init init_speculation_mitigations(void)
- boot_cpu_has(X86_FEATURE_MD_CLEAR));
-
- /*
-- * Enable MDS defences as applicable. The PV blocks need using all the
-- * time, and the Idle blocks need using if either PV or HVM defences are
-- * used.
-+ * Enable MDS defences as applicable. The Idle blocks need using if
-+ * either PV or HVM defences are used.
- *
- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with
-- * equivelent semantics to avoid needing to perform both flushes on the
-- * HVM path. The HVM blocks don't need activating if our hypervisor told
-- * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves.
-+ * equivalent semantics to avoid needing to perform both flushes on the
-+ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH.
-+ *
-+ * After calculating the appropriate idle setting, simplify
-+ * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
-+ * guests", so spec_ctrl_init_domain() can calculate suitable settings.
- */
-- if ( opt_md_clear_pv )
-- setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV);
- if ( opt_md_clear_pv || opt_md_clear_hvm )
- setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
-- if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush )
-- setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM);
-+ opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
-
- /*
- * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT
-diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
-index ff3157d52d13..bd45a144ee78 100644
---- a/xen/include/asm-x86/cpufeatures.h
-+++ b/xen/include/asm-x86/cpufeatures.h
-@@ -35,8 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM
- XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
- XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
- XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
--XEN_CPUFEATURE(SC_VERW_PV, X86_SYNTH(23)) /* VERW used by Xen for PV */
--XEN_CPUFEATURE(SC_VERW_HVM, X86_SYNTH(24)) /* VERW used by Xen for HVM */
-+/* Bits 23,24 unused. */
- XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */
- XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
- XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
-diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
-index 92d54de0b9a1..2398a1d99da9 100644
---- a/xen/include/asm-x86/domain.h
-+++ b/xen/include/asm-x86/domain.h
-@@ -319,6 +319,9 @@ struct arch_domain
- uint32_t pci_cf8;
- uint8_t cmos_idx;
-
-+ /* Use VERW on return-to-guest for its flushing side effect. */
-+ bool verw;
-+
- union {
- struct pv_domain pv;
- struct hvm_domain hvm;
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index f76029523610..751355f471f4 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -24,6 +24,7 @@
- #define SCF_use_shadow (1 << 0)
- #define SCF_ist_wrmsr (1 << 1)
- #define SCF_ist_rsb (1 << 2)
-+#define SCF_verw (1 << 3)
-
- #ifndef __ASSEMBLY__
-
-@@ -32,6 +33,7 @@
- #include <asm/msr-index.h>
-
- void init_speculation_mitigations(void);
-+void spec_ctrl_init_domain(struct domain *d);
-
- extern bool opt_ibpb;
- extern bool opt_ssbd;
-diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
-index 02b3b18ce69f..5a590bac44aa 100644
---- a/xen/include/asm-x86/spec_ctrl_asm.h
-+++ b/xen/include/asm-x86/spec_ctrl_asm.h
-@@ -136,6 +136,19 @@
- #endif
- .endm
-
-+.macro DO_SPEC_CTRL_COND_VERW
-+/*
-+ * Requires %rsp=cpuinfo
-+ *
-+ * Issue a VERW for its flushing side effect, if indicated. This is a Spectre
-+ * v1 gadget, but the IRET/VMEntry is serialising.
-+ */
-+ testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp)
-+ jz .L\@_verw_skip
-+ verw CPUINFO_verw_sel(%rsp)
-+.L\@_verw_skip:
-+.endm
-+
- .macro DO_SPEC_CTRL_ENTRY maybexen:req
- /*
- * Requires %rsp=regs (also cpuinfo if !maybexen)
-@@ -231,8 +244,7 @@
- #define SPEC_CTRL_EXIT_TO_PV \
- ALTERNATIVE "", \
- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \
-- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \
-- X86_FEATURE_SC_VERW_PV
-+ DO_SPEC_CTRL_COND_VERW
-
- /*
- * Use in IST interrupt/exception context. May interrupt Xen or PV context.
---
-2.35.1
-
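The context-switch hunk above reduces to a read-modify-write of one flag byte from per-domain state. A self-contained sketch of that disposition update, with hypothetical struct names mirroring Xen's cpu_info and arch_domain:

    #include <stdbool.h>
    #include <stdint.h>

    #define SCF_verw (1u << 3)            /* as defined by the patch */

    struct cpu_info_sketch { uint8_t spec_ctrl_flags; };
    struct domain_sketch   { bool verw; };

    static void update_verw_disposition(struct cpu_info_sketch *info,
                                        const struct domain_sketch *nextd)
    {
        info->spec_ctrl_flags &= ~SCF_verw;     /* clear stale disposition */
        if ( nextd->verw )
            info->spec_ctrl_flags |= SCF_verw;  /* request VERW on exit */
    }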
diff --git a/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch b/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch
deleted file mode 100644
index 3b91fb5..0000000
--- a/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch
+++ /dev/null
@@ -1,98 +0,0 @@
-From a83108736db0ddaa5855f5abda6dcc8ae4fe25e9 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Mon, 20 Sep 2021 18:47:49 +0100
-Subject: [PATCH 31/51] x86/spec-ctrl: Enumeration for MMIO Stale Data controls
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The three *_NO bits indicate non-susceptibility to the SSDP, FBSDP and PSDP
-data movement primitives.
-
-FB_CLEAR indicates that the VERW instruction has re-gained its Fill Buffer
-flushing side effect. This is only enumerated on parts where VERW had
-previously lost its flushing side effect due to the MDS/TAA vulnerabilities
-being fixed in hardware.
-
-FB_CLEAR_CTRL is available on a subset of FB_CLEAR parts where the Fill Buffer
-clearing side effect of VERW can be turned off for performance reasons.
-
-This is part of XSA-404.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-(cherry picked from commit 2ebe8fe9b7e0d36e9ec3cfe4552b2b197ef0dcec)
----
- xen/arch/x86/spec_ctrl.c | 11 ++++++++---
- xen/include/asm-x86/msr-index.h | 6 ++++++
- 2 files changed, 14 insertions(+), 3 deletions(-)
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 21730aa03071..d285538bde9f 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -323,7 +323,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- * Hardware read-only information, stating immunity to certain issues, or
- * suggestions of which mitigation to use.
- */
-- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s\n",
-+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
- (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "",
- (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "",
- (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
-@@ -332,13 +332,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
- (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "",
-+ (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "",
-+ (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "",
-+ (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "",
- (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "",
- (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "",
- (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "",
- (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "");
-
- /* Hardware features which need driving to mitigate issues. */
-- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
-+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
- (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ||
- (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "",
- (e8b & cpufeat_mask(X86_FEATURE_IBRS)) ||
-@@ -353,7 +356,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR)) ? " MD_CLEAR" : "",
- (_7d0 & cpufeat_mask(X86_FEATURE_SRBDS_CTRL)) ? " SRBDS_CTRL" : "",
- (e8b & cpufeat_mask(X86_FEATURE_VIRT_SSBD)) ? " VIRT_SSBD" : "",
-- (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "");
-+ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
-+ (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "",
-+ (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "");
-
- /* Compiled-in support which pertains to mitigations. */
- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
-diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
-index 31964b88af7a..72bc32ba04ff 100644
---- a/xen/include/asm-x86/msr-index.h
-+++ b/xen/include/asm-x86/msr-index.h
-@@ -66,6 +66,11 @@
- #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
- #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
- #define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8)
-+#define ARCH_CAPS_SBDR_SSDP_NO (_AC(1, ULL) << 13)
-+#define ARCH_CAPS_FBSDP_NO (_AC(1, ULL) << 14)
-+#define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15)
-+#define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17)
-+#define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18)
-
- #define MSR_FLUSH_CMD 0x0000010b
- #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
-@@ -83,6 +88,7 @@
- #define MCU_OPT_CTRL_RNGDS_MITG_DIS (_AC(1, ULL) << 0)
- #define MCU_OPT_CTRL_RTM_ALLOW (_AC(1, ULL) << 1)
- #define MCU_OPT_CTRL_RTM_LOCKED (_AC(1, ULL) << 2)
-+#define MCU_OPT_CTRL_FB_CLEAR_DIS (_AC(1, ULL) << 3)
-
- #define MSR_RTIT_OUTPUT_BASE 0x00000560
- #define MSR_RTIT_OUTPUT_MASK 0x00000561
---
-2.35.1
-
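The enumeration itself is pure bit-testing of MSR_ARCH_CAPABILITIES; a self-contained sketch that decodes the five new bits from a raw value, with positions as added to msr-index.h above:

    #include <stdint.h>
    #include <stdio.h>

    #define ARCH_CAPS_SBDR_SSDP_NO  (1ULL << 13)
    #define ARCH_CAPS_FBSDP_NO      (1ULL << 14)
    #define ARCH_CAPS_PSDP_NO       (1ULL << 15)
    #define ARCH_CAPS_FB_CLEAR      (1ULL << 17)
    #define ARCH_CAPS_FB_CLEAR_CTRL (1ULL << 18)

    static void print_mmio_caps(uint64_t caps)
    {
        printf("MMIO Stale Data bits:%s%s%s%s%s\n",
               (caps & ARCH_CAPS_SBDR_SSDP_NO)  ? " SBDR_SSDP_NO"  : "",
               (caps & ARCH_CAPS_FBSDP_NO)      ? " FBSDP_NO"      : "",
               (caps & ARCH_CAPS_PSDP_NO)       ? " PSDP_NO"       : "",
               (caps & ARCH_CAPS_FB_CLEAR)      ? " FB_CLEAR"      : "",
               (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "");
    }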
diff --git a/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch b/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch
deleted file mode 100644
index c63891a..0000000
--- a/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch
+++ /dev/null
@@ -1,187 +0,0 @@
-From 2e82446cb252f6c8ac697e81f4155872c69afde4 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Mon, 13 Jun 2022 19:18:32 +0100
-Subject: [PATCH 32/51] x86/spec-ctrl: Add spec-ctrl=unpriv-mmio
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Per Xen's support statement, PCI passthrough should be to trusted domains
-because the overall system security depends on factors outside of Xen's
-control.
-
-As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR.
-
-However, users who have risk assessed their configuration may be happy with
-the risk of DoS, but unhappy with the risk of cross-domain data leakage. Such
-users should enable this option.
-
-On CPUs vulnerable to MDS, the existing mitigations are the best we can do to
-mitigate MMIO cross-domain data leakage.
-
-On CPUs fixed against MDS but vulnerable to MMIO stale data leakage, this option:
-
- * On CPUs susceptible to FBSDP, mitigates cross-domain fill buffer leakage
- using FB_CLEAR.
- * On CPUs susceptible to SBDR, mitigates RNG data recovery by engaging the
- srb-lock, previously used to mitigate SRBDS.
-
-Both mitigations require microcode from IPU 2022.1, May 2022.
-
-This is part of XSA-404.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-(cherry picked from commit 8c24b70fedcb52633b2370f834d8a2be3f7fa38e)
----
- docs/misc/xen-command-line.pandoc | 14 +++++++--
- xen/arch/x86/spec_ctrl.c | 48 ++++++++++++++++++++++++-------
- 2 files changed, 48 insertions(+), 14 deletions(-)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index d5cb09f86541..a642e43476a2 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2235,7 +2235,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
- ### spec-ctrl (x86)
- > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
- > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
--> l1d-flush,branch-harden,srb-lock}=<bool> ]`
-+> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]`
-
- Controls for speculative execution sidechannel mitigations. By default, Xen
- will pick the most appropriate mitigations based on compiled in support,
-@@ -2314,8 +2314,16 @@ Xen will enable this mitigation.
- On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force
-or prevent Xen from protecting the Special Register Buffer from leaking stale
- data. By default, Xen will enable this mitigation, except on parts where MDS
--is fixed and TAA is fixed/mitigated (in which case, there is believed to be no
--way for an attacker to obtain the stale data).
-+is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO
-+mappings (in which case, there is believed to be no way for an attacker to
-+obtain stale data).
-+
-+The `unpriv-mmio=` boolean indicates whether the system has (or will have)
-+less than fully privileged domains granted access to MMIO devices. By
-+default, this option is disabled. If enabled, Xen will use the `FB_CLEAR`
-+and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode
-+release to mitigate cross-domain leakage of data via the MMIO Stale Data
-+vulnerabilities.
-
- ### sync_console
- > `= <boolean>`
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index d285538bde9f..099113ba41e6 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */
- static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. */
-
- static int8_t __initdata opt_srb_lock = -1;
-+static bool __initdata opt_unpriv_mmio;
-+static bool __read_mostly opt_fb_clear_mmio;
-
- static int __init parse_spec_ctrl(const char *s)
- {
-@@ -184,6 +186,8 @@ static int __init parse_spec_ctrl(const char *s)
- opt_branch_harden = val;
- else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
- opt_srb_lock = val;
-+ else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
-+ opt_unpriv_mmio = val;
- else
- rc = -EINVAL;
-
-@@ -392,7 +396,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-",
- opt_ibpb ? " IBPB" : "",
- opt_l1d_flush ? " L1D_FLUSH" : "",
-- opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "",
-+ opt_md_clear_pv || opt_md_clear_hvm ||
-+ opt_fb_clear_mmio ? " VERW" : "",
- opt_branch_harden ? " BRANCH_HARDEN" : "");
-
- /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
-@@ -941,7 +946,9 @@ void spec_ctrl_init_domain(struct domain *d)
- {
- bool pv = is_pv_domain(d);
-
-- d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
-+ d->arch.verw =
-+ (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
-+ (opt_fb_clear_mmio && is_iommu_enabled(d));
- }
-
- void __init init_speculation_mitigations(void)
-@@ -1195,6 +1202,18 @@ void __init init_speculation_mitigations(void)
-
- mds_calculations(caps);
-
-+ /*
-+ * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
-+ * reintroduced the VERW fill buffer flushing side effect because of a
-+ * susceptibility to FBSDP.
-+ *
-+ * If unprivileged guests have (or will have) MMIO mappings, we can
-+ * mitigate cross-domain leakage of fill buffer data by issuing VERW on
-+ * the return-to-guest path.
-+ */
-+ if ( opt_unpriv_mmio )
-+ opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR;
-+
- /*
- * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
- * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
-@@ -1208,18 +1227,20 @@ void __init init_speculation_mitigations(void)
- boot_cpu_has(X86_FEATURE_MD_CLEAR));
-
- /*
-- * Enable MDS defences as applicable. The Idle blocks need using if
-- * either PV or HVM defences are used.
-+ * Enable MDS/MMIO defences as applicable. The Idle blocks need using if
-+ * either the PV or HVM MDS defences are used, or if we may give MMIO
-+ * access to untrusted guests.
- *
- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with
- * equivalent semantics to avoid needing to perform both flushes on the
-- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH.
-+ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for
-+ * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.)
- *
- * After calculating the appropriate idle setting, simplify
- * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
- * guests", so spec_ctrl_init_domain() can calculate suitable settings.
- */
-- if ( opt_md_clear_pv || opt_md_clear_hvm )
-+ if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio )
- setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
- opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
-
-@@ -1284,14 +1305,19 @@ void __init init_speculation_mitigations(void)
- * On some SRBDS-affected hardware, it may be safe to relax srb-lock by
- * default.
- *
-- * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known
-- * way to access the Fill Buffer. If TSX isn't available (inc. SKU
-- * reasons on some models), or TSX is explicitly disabled, then there is
-- * no need for the extra overhead to protect RDRAND/RDSEED.
-+ * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale RNG
-+ * data becomes available to other contexts. To recover the data, an
-+ * attacker needs to use:
-+ * - SBDS (MDS or TAA to sample the core's fill buffer)
-+ * - SBDR (Architecturally retrieve stale transaction buffer contents)
-+ * - DRPW (Architecturally latch stale fill buffer data)
-+ *
-+ * On MDS_NO parts, and with TAA_NO or TSX unavailable/disabled, and there
-+ * is no unprivileged MMIO access, the RNG data doesn't need protecting.
- */
- if ( cpu_has_srbds_ctrl )
- {
-- if ( opt_srb_lock == -1 &&
-+ if ( opt_srb_lock == -1 && !opt_unpriv_mmio &&
- (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO &&
- (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) )
- opt_srb_lock = 0;
---
-2.35.1
-
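Deployment-wise the new control is just another entry in the boot-time `spec-ctrl=` list; a hypothetical boot line fragment for a host that grants MMIO access to untrusted guests:

    spec-ctrl=unpriv-mmio

On FB_CLEAR-capable microcode this enables the VERW-based fill buffer mitigation for IOMMU-enabled domains, and it also keeps srb-lock engaged by default rather than relaxing it on MDS_NO parts.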
diff --git a/0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch b/0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch
deleted file mode 100644
index 07f488d..0000000
--- a/0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch
+++ /dev/null
@@ -1,52 +0,0 @@
-From 460b08d6c6c16b3f32aa138e772b759ae02a4479 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 12 Jul 2022 11:10:34 +0200
-Subject: [PATCH 33/51] IOMMU/x86: work around bogus gcc12 warning in
- hvm_gsi_eoi()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-As per [1] the expansion of the pirq_dpci() macro causes a -Waddress
-controlled warning (enabled implicitly in our builds, if not by default)
-tying the middle part of the involved conditional expression to the
-surrounding boolean context. Work around this by introducing a local
-inline function in the affected source file.
-
-Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Acked-by: Roger Pau Monné <roger.pau@citrix.com>
-
-[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102967
-master commit: 80ad8db8a4d9bb24952f0aea788ce6f47566fa76
-master date: 2022-06-15 10:19:32 +0200
----
- xen/drivers/passthrough/x86/hvm.c | 12 ++++++++++++
- 1 file changed, 12 insertions(+)
-
-diff --git a/xen/drivers/passthrough/x86/hvm.c b/xen/drivers/passthrough/x86/hvm.c
-index 0b37cd145b60..ba0f6c53d742 100644
---- a/xen/drivers/passthrough/x86/hvm.c
-+++ b/xen/drivers/passthrough/x86/hvm.c
-@@ -25,6 +25,18 @@
- #include <asm/hvm/support.h>
- #include <asm/io_apic.h>
-
-+/*
-+ * Gcc12 takes issue with pirq_dpci() being used in boolean context (see gcc
-+ * bug 102967). While we can't replace the macro definition in the header by an
-+ * inline function, we can do so here.
-+ */
-+static inline struct hvm_pirq_dpci *_pirq_dpci(struct pirq *pirq)
-+{
-+ return pirq_dpci(pirq);
-+}
-+#undef pirq_dpci
-+#define pirq_dpci(pirq) _pirq_dpci(pirq)
-+
- static DEFINE_PER_CPU(struct list_head, dpci_list);
-
- /*
---
-2.35.1
-
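The same workaround applies to any macro whose expansion puts an address in the middle of a conditional expression used in boolean context: wrap it once in a real function so the compiler sees a value rather than an address. A self-contained sketch of the pattern, with an illustrative macro standing in for Xen's pirq_dpci():

    #include <stdbool.h>
    #include <stddef.h>

    struct obj { int data[4]; };

    /*
     * Illustrative macro: the middle operand is an address, which gcc12's
     * -Waddress can tie to the surrounding boolean context and flag as
     * always-true (gcc bug 102967).
     */
    #define obj_data(o) ((o) ? &(o)->data[0] : NULL)

    /* Redirect uses through a real function; semantics are unchanged. */
    static inline int *_obj_data(struct obj *o)
    {
        return obj_data(o);
    }
    #undef obj_data
    #define obj_data(o) _obj_data(o)

    static bool has_data(struct obj *o)
    {
        return obj_data(o) != NULL;     /* no longer a bare address test */
    }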
diff --git a/0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch b/0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch
deleted file mode 100644
index ac71ab8..0000000
--- a/0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-From 5cb8142076ce1ce53eafd7e00acb4d0eac4e7784 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
- <marmarek@invisiblethingslab.com>
-Date: Tue, 12 Jul 2022 11:11:35 +0200
-Subject: [PATCH 34/51] ehci-dbgp: fix selecting n-th ehci controller
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The ehci<n> number was parsed but ignored.
-
-Fixes: 322ecbe4ac85 ("console: add EHCI debug port based serial console")
-Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: d6d0cb659fda64430d4649f8680c5cead32da8fd
-master date: 2022-06-16 14:23:37 +0100
----
- xen/drivers/char/ehci-dbgp.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xen/drivers/char/ehci-dbgp.c b/xen/drivers/char/ehci-dbgp.c
-index c893d246defa..66b4811af24a 100644
---- a/xen/drivers/char/ehci-dbgp.c
-+++ b/xen/drivers/char/ehci-dbgp.c
-@@ -1478,7 +1478,7 @@ void __init ehci_dbgp_init(void)
- unsigned int num = 0;
-
- if ( opt_dbgp[4] )
-- simple_strtoul(opt_dbgp + 4, &e, 10);
-+ num = simple_strtoul(opt_dbgp + 4, &e, 10);
-
- dbgp->cap = find_dbgp(dbgp, num);
- if ( !dbgp->cap )
---
-2.35.1
-
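The bug class, calling a conversion function and discarding its result, is easy to reproduce outside Xen; a self-contained sketch with standard strtoul() standing in for Xen's simple_strtoul() (the endptr argument is dropped for brevity):

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Before the fix the call read 'simple_strtoul(opt_dbgp + 4, &e, 10);'
     * with the result thrown away, so 'num' stayed 0 and every dbgp=ehci<n>
     * option selected the first controller.
     */
    static unsigned int parse_index(const char *opt)
    {
        unsigned int num = 0;

        if ( opt[4] )                            /* digits follow "ehci" */
            num = strtoul(opt + 4, NULL, 10);    /* keep the parsed value */

        return num;
    }

    int main(void)
    {
        printf("%u\n", parse_index("ehci2"));    /* prints 2 */
        return 0;
    }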
diff --git a/0035-tools-xenstored-Harden-corrupt.patch b/0035-tools-xenstored-Harden-corrupt.patch
deleted file mode 100644
index bb0f7f1..0000000
--- a/0035-tools-xenstored-Harden-corrupt.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From 81ee3d08351be1ef2a14d371993604098d6a4673 Mon Sep 17 00:00:00 2001
-From: Julien Grall <jgrall@amazon.com>
-Date: Tue, 12 Jul 2022 11:12:13 +0200
-Subject: [PATCH 35/51] tools/xenstored: Harden corrupt()
-
-At the moment, corrupt() is neither checking for allocation failure
-nor freeing the allocated memory.
-
-Harden the code by printing ENOMEM if the allocation failed and
-free 'str' after the last use.
-
-This is not considered to be a security issue because corrupt() should
-only be called when Xenstored thinks the database is corrupted. Note
-that the trigger (i.e. a guest reliably provoking the call) would be
-a security issue.
-
-Fixes: 06d17943f0cd ("Added a basic integrity checker, and some basic ability to recover from store")
-Signed-off-by: Julien Grall <jgrall@amazon.com>
-Reviewed-by: Juergen Gross <jgross@suse.com>
-master commit: db3382dd4f468c763512d6bf91c96773395058fb
-master date: 2022-06-23 13:44:10 +0100
----
- tools/xenstore/xenstored_core.c | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
-index 91d093a12ea6..0c8ee276f837 100644
---- a/tools/xenstore/xenstored_core.c
-+++ b/tools/xenstore/xenstored_core.c
-@@ -2087,7 +2087,10 @@ void corrupt(struct connection *conn, const char *fmt, ...)
- va_end(arglist);
-
- log("corruption detected by connection %i: err %s: %s",
-- conn ? (int)conn->id : -1, strerror(saved_errno), str);
-+ conn ? (int)conn->id : -1, strerror(saved_errno),
-+ str ?: "ENOMEM");
-+
-+ talloc_free(str);
-
- check_store();
- }
---
-2.35.1
-
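The two rules applied here, tolerate a NULL result at the use site and free after the last use, carry over to any printf-style allocator. A self-contained sketch using glibc's vasprintf() in place of xenstored's talloc_vasprintf():

    #define _GNU_SOURCE
    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void report(const char *fmt, ...)
    {
        char *str = NULL;
        va_list ap;

        va_start(ap, fmt);
        if ( vasprintf(&str, fmt, ap) < 0 )
            str = NULL;                 /* contents undefined on failure */
        va_end(ap);

        fprintf(stderr, "corruption: %s\n", str ?: "ENOMEM");

        free(str);                      /* after last use; free(NULL) is ok */
    }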
diff --git a/0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch b/0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch
deleted file mode 100644
index 8bc0768..0000000
--- a/0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch
+++ /dev/null
@@ -1,93 +0,0 @@
-From 09d533f4c80b7eaf9fb4e36ebba8259580857a9d Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 12 Jul 2022 11:12:46 +0200
-Subject: [PATCH 36/51] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle with
- legacy IBRS
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Back at the time of the original Spectre-v2 fixes, it was recommended to clear
-MSR_SPEC_CTRL when going idle. This is because of the side effects on the
-sibling thread caused by the microcode IBRS and STIBP implementations which
-were retrofitted to existing CPUs.
-
-However, there are no relevant cross-thread impacts for the hardware
-IBRS/STIBP implementations, so this logic should not be used on Intel CPUs
-supporting eIBRS, or any AMD CPUs; doing so only adds unnecessary latency to
-the idle path.
-
-Furthermore, there's no point playing with MSR_SPEC_CTRL in the idle paths if
-SMT is disabled for other reasons.
-
-Fixes: 8d03080d2a33 ("x86/spec-ctrl: Cease using thunk=lfence on AMD")
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-master commit: ffc7694e0c99eea158c32aa164b7d1e1bb1dc46b
-master date: 2022-06-30 18:07:13 +0100
----
- xen/arch/x86/spec_ctrl.c | 10 ++++++++--
- xen/include/asm-x86/cpufeatures.h | 2 +-
- xen/include/asm-x86/spec_ctrl.h | 5 +++--
- 3 files changed, 12 insertions(+), 5 deletions(-)
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 099113ba41e6..1ed5ceda8b46 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -1150,8 +1150,14 @@ void __init init_speculation_mitigations(void)
- /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */
- init_shadow_spec_ctrl_state();
-
-- /* If Xen is using any MSR_SPEC_CTRL settings, adjust the idle path. */
-- if ( default_xen_spec_ctrl )
-+ /*
-+ * For microcoded IBRS only (i.e. Intel, pre eIBRS), it is recommended to
-+ * clear MSR_SPEC_CTRL before going idle, to avoid impacting sibling
-+ * threads. Activate this if SMT is enabled, and Xen is using a non-zero
-+ * MSR_SPEC_CTRL setting.
-+ */
-+ if ( boot_cpu_has(X86_FEATURE_IBRSB) && !(caps & ARCH_CAPS_IBRS_ALL) &&
-+ hw_smt_enabled && default_xen_spec_ctrl )
- setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE);
-
- xpti_init_default(caps);
-diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
-index bd45a144ee78..493d338a085e 100644
---- a/xen/include/asm-x86/cpufeatures.h
-+++ b/xen/include/asm-x86/cpufeatures.h
-@@ -33,7 +33,7 @@ XEN_CPUFEATURE(SC_MSR_HVM, X86_SYNTH(17)) /* MSR_SPEC_CTRL used by Xen fo
- XEN_CPUFEATURE(SC_RSB_PV, X86_SYNTH(18)) /* RSB overwrite needed for PV */
- XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM */
- XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
--XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
-+XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */
- XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
- /* Bits 23,24 unused. */
- XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index 751355f471f4..7e83e0179fb9 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -78,7 +78,8 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info)
- uint32_t val = 0;
-
- /*
-- * Branch Target Injection:
-+ * It is recommended in some cases to clear MSR_SPEC_CTRL when going idle,
-+ * to avoid impacting sibling threads.
- *
- * Latch the new shadow value, then enable shadowing, then update the MSR.
- * There are no SMP issues here; only local processor ordering concerns.
-@@ -114,7 +115,7 @@ static always_inline void spec_ctrl_exit_idle(struct cpu_info *info)
- uint32_t val = info->xen_spec_ctrl;
-
- /*
-- * Branch Target Injection:
-+ * Restore MSR_SPEC_CTRL on exit from idle.
- *
- * Disable shadowing before updating the MSR. There are no SMP issues
- * here; only local processor ordering concerns.
---
-2.35.1
-
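The resulting predicate is worth spelling out: idle-time MSR_SPEC_CTRL clearing is now gated on all three of legacy (non-eIBRS) IBRS, SMT being active, and a non-zero runtime setting. As a self-contained sketch, with the inputs passed explicitly rather than read from boot_cpu_has()/caps:

    #include <stdbool.h>
    #include <stdint.h>

    static bool want_idle_spec_ctrl_clear(bool has_ibrsb, bool ibrs_all,
                                          bool smt_enabled,
                                          uint32_t xen_spec_ctrl)
    {
        /*
         * Only microcoded IBRS (Intel, pre-eIBRS) has cross-thread side
         * effects worth the extra WRMSRs on the idle path.
         */
        return has_ibrsb && !ibrs_all && smt_enabled && xen_spec_ctrl;
    }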
diff --git a/0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch b/0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch
deleted file mode 100644
index 156aa58..0000000
--- a/0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch
+++ /dev/null
@@ -1,234 +0,0 @@
-From db6ca8176ccc4ff7dfe3c06969af9ebfab0d7b04 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 12 Jul 2022 11:13:33 +0200
-Subject: [PATCH 37/51] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow
- hardware STIBP hint
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-STIBP and PSFD are slightly weird bits, because they're both implied by other
-bits in MSR_SPEC_CTRL. Add fine grain controls for them, and take the
-implications into account when setting IBRS/SSBD.
-
-Rearrange the IBPB text/variables/logic to keep all the MSR_SPEC_CTRL bits
-together, for consistency.
-
-However, AMD have a hardware hint CPUID bit recommending that STIBP be set
-unilaterally. This is advertised on Zen3, so follow the recommendation.
-Furthermore, in such cases, set STIBP behind the guest's back for now. This
-has negligible overhead for the guest, but saves a WRMSR on vmentry. This is
-the only default change.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-master commit: fef244b179c06fcdfa581f7d57fa6e578c49ff50
-master date: 2022-06-30 18:07:13 +0100
----
- docs/misc/xen-command-line.pandoc | 21 +++++++---
- xen/arch/x86/hvm/svm/vmcb.c | 9 +++++
- xen/arch/x86/spec_ctrl.c | 67 ++++++++++++++++++++++++++-----
- 3 files changed, 82 insertions(+), 15 deletions(-)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index a642e43476a2..46e9c58d35cd 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2234,8 +2234,9 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
-
- ### spec-ctrl (x86)
- > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
--> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
--> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]`
-+> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
-+> eager-fpu,l1d-flush,branch-harden,srb-lock,
-+> unpriv-mmio}=<bool> ]`
-
- Controls for speculative execution sidechannel mitigations. By default, Xen
- will pick the most appropriate mitigations based on compiled in support,
-@@ -2285,9 +2286,10 @@ On hardware supporting IBRS (Indirect Branch Restricted Speculation), the
- If Xen is not using IBRS itself, functionality is still set up so IBRS can be
- virtualised for guests.
-
--On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
--option can be used to force (the default) or prevent Xen from issuing branch
--prediction barriers on vcpu context switches.
-+On hardware supporting STIBP (Single Thread Indirect Branch Predictors), the
-+`stibp=` option can be used to force or prevent Xen using the feature itself.
-+By default, Xen will use STIBP when IBRS is in use (IBRS implies STIBP), and
-+when hardware hints recommend using it as a blanket setting.
-
- On hardware supporting SSBD (Speculative Store Bypass Disable), the `ssbd=`
- option can be used to force or prevent Xen using the feature itself. On AMD
-@@ -2295,6 +2297,15 @@ hardware, this is a global option applied at boot, and not virtualised for
- guest use. On Intel hardware, the feature is virtualised for guests,
- independently of Xen's choice of setting.
-
-+On hardware supporting PSFD (Predictive Store Forwarding Disable), the `psfd=`
-+option can be used to force or prevent Xen using the feature itself. By
-+default, Xen will not use PSFD. PSFD is implied by SSBD, and SSBD is off by
-+default.
-+
-+On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=`
-+option can be used to force (the default) or prevent Xen from issuing branch
-+prediction barriers on vcpu context switches.
-+
- On all hardware, the `eager-fpu=` option can be used to force or prevent Xen
- from using fully eager FPU context switches. This is currently implemented as
- a global control. By default, Xen will choose to use fully eager context
-diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c
-index 565e997155f2..ef7224eb5dd7 100644
---- a/xen/arch/x86/hvm/svm/vmcb.c
-+++ b/xen/arch/x86/hvm/svm/vmcb.c
-@@ -29,6 +29,7 @@
- #include <asm/hvm/support.h>
- #include <asm/hvm/svm/svm.h>
- #include <asm/hvm/svm/svmdebug.h>
-+#include <asm/spec_ctrl.h>
-
- struct vmcb_struct *alloc_vmcb(void)
- {
-@@ -176,6 +177,14 @@ static int construct_vmcb(struct vcpu *v)
- vmcb->_pause_filter_thresh = SVM_PAUSETHRESH_INIT;
- }
-
-+ /*
-+ * When default_xen_spec_ctrl is simply SPEC_CTRL_STIBP, default this behind
-+ * the back of the VM too. Our SMT topology isn't accurate, the overhead
-+ * is negligible, and doing this saves a WRMSR on the vmentry path.
-+ */
-+ if ( default_xen_spec_ctrl == SPEC_CTRL_STIBP )
-+ v->arch.msrs->spec_ctrl.raw = SPEC_CTRL_STIBP;
-+
- return 0;
- }
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 1ed5ceda8b46..dfdd45c358c4 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -48,9 +48,13 @@ static enum ind_thunk {
- THUNK_LFENCE,
- THUNK_JMP,
- } opt_thunk __initdata = THUNK_DEFAULT;
-+
- static int8_t __initdata opt_ibrs = -1;
-+int8_t __initdata opt_stibp = -1;
-+bool __read_mostly opt_ssbd;
-+int8_t __initdata opt_psfd = -1;
-+
- bool __read_mostly opt_ibpb = true;
--bool __read_mostly opt_ssbd = false;
- int8_t __read_mostly opt_eager_fpu = -1;
- int8_t __read_mostly opt_l1d_flush = -1;
- static bool __initdata opt_branch_harden = true;
-@@ -172,12 +176,20 @@ static int __init parse_spec_ctrl(const char *s)
- else
- rc = -EINVAL;
- }
-+
-+ /* Bits in MSR_SPEC_CTRL. */
- else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 )
- opt_ibrs = val;
-- else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
-- opt_ibpb = val;
-+ else if ( (val = parse_boolean("stibp", s, ss)) >= 0 )
-+ opt_stibp = val;
- else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 )
- opt_ssbd = val;
-+ else if ( (val = parse_boolean("psfd", s, ss)) >= 0 )
-+ opt_psfd = val;
-+
-+ /* Misc settings. */
-+ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
-+ opt_ibpb = val;
- else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
- opt_eager_fpu = val;
- else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
-@@ -376,7 +388,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- "\n");
-
- /* Settings for Xen's protection, irrespective of guests. */
-- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s, Other:%s%s%s%s%s\n",
-+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n",
- thunk == THUNK_NONE ? "N/A" :
- thunk == THUNK_RETPOLINE ? "RETPOLINE" :
- thunk == THUNK_LFENCE ? "LFENCE" :
-@@ -390,6 +402,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (!boot_cpu_has(X86_FEATURE_SSBD) &&
- !boot_cpu_has(X86_FEATURE_AMD_SSBD)) ? "" :
- (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
-+ (!boot_cpu_has(X86_FEATURE_PSFD) &&
-+ !boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ? "" :
-+ (default_xen_spec_ctrl & SPEC_CTRL_PSFD) ? " PSFD+" : " PSFD-",
- !(caps & ARCH_CAPS_TSX_CTRL) ? "" :
- (opt_tsx & 1) ? " TSX+" : " TSX-",
- !cpu_has_srbds_ctrl ? "" :
-@@ -979,10 +994,7 @@ void __init init_speculation_mitigations(void)
- if ( !has_spec_ctrl )
- printk(XENLOG_WARNING "?!? CET active, but no MSR_SPEC_CTRL?\n");
- else if ( opt_ibrs == -1 )
-- {
- opt_ibrs = ibrs = true;
-- default_xen_spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_STIBP;
-- }
-
- if ( opt_thunk == THUNK_DEFAULT || opt_thunk == THUNK_RETPOLINE )
- thunk = THUNK_JMP;
-@@ -1086,14 +1098,49 @@ void __init init_speculation_mitigations(void)
- setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
- }
-
-- /* If we have IBRS available, see whether we should use it. */
-+ /* Figure out default_xen_spec_ctrl. */
- if ( has_spec_ctrl && ibrs )
-- default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
-+ {
-+ /* IBRS implies STIBP. */
-+ if ( opt_stibp == -1 )
-+ opt_stibp = 1;
-+
-+ default_xen_spec_ctrl |= SPEC_CTRL_IBRS;
-+ }
-+
-+ /*
-+ * Use STIBP by default if the hardware hint is set. Otherwise, leave it
-+ * off as it is a severe performance penalty on pre-eIBRS Intel hardware
-+ * where it was retrofitted in microcode.
-+ */
-+ if ( opt_stibp == -1 )
-+ opt_stibp = !!boot_cpu_has(X86_FEATURE_STIBP_ALWAYS);
-+
-+ if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) ||
-+ boot_cpu_has(X86_FEATURE_AMD_STIBP)) )
-+ default_xen_spec_ctrl |= SPEC_CTRL_STIBP;
-
-- /* If we have SSBD available, see whether we should use it. */
- if ( opt_ssbd && (boot_cpu_has(X86_FEATURE_SSBD) ||
- boot_cpu_has(X86_FEATURE_AMD_SSBD)) )
-+ {
-+ /* SSBD implies PSFD */
-+ if ( opt_psfd == -1 )
-+ opt_psfd = 1;
-+
- default_xen_spec_ctrl |= SPEC_CTRL_SSBD;
-+ }
-+
-+ /*
-+ * Don't use PSFD by default. AMD designed the predictor to
-+ * auto-clear on privilege change. PSFD is implied by SSBD, which is
-+ * off by default.
-+ */
-+ if ( opt_psfd == -1 )
-+ opt_psfd = 0;
-+
-+ if ( opt_psfd && (boot_cpu_has(X86_FEATURE_PSFD) ||
-+ boot_cpu_has(X86_FEATURE_INTEL_PSFD)) )
-+ default_xen_spec_ctrl |= SPEC_CTRL_PSFD;
-
- /*
- * PV guests can create RSB entries for any linear address they control,
---
-2.35.1
-
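The tri-state pattern used for both new knobs (-1 picks a default, 0/1 are explicit) plus implication resolution can be shown in isolation; a self-contained sketch of just the STIBP defaulting logic:

    #include <stdbool.h>

    /* -1: no explicit choice; 0/1: explicit off/on (cmdline convention). */
    static int resolve_stibp(int opt_stibp, bool ibrs, bool stibp_always)
    {
        if ( opt_stibp == -1 && ibrs )
            opt_stibp = 1;              /* IBRS implies STIBP */

        if ( opt_stibp == -1 )
            opt_stibp = stibp_always;   /* follow the hardware hint */

        return opt_stibp;
    }

PSFD follows the same shape, with SSBD as the implying bit and a default of off.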
diff --git a/0038-libxc-fix-compilation-error-with-gcc13.patch b/0038-libxc-fix-compilation-error-with-gcc13.patch
deleted file mode 100644
index 8056742..0000000
--- a/0038-libxc-fix-compilation-error-with-gcc13.patch
+++ /dev/null
@@ -1,33 +0,0 @@
-From cd3d6b4cd46cd05590805b4a6c0b6654af60106e Mon Sep 17 00:00:00 2001
-From: Charles Arnold <carnold@suse.com>
-Date: Tue, 12 Jul 2022 11:14:07 +0200
-Subject: [PATCH 38/51] libxc: fix compilation error with gcc13
-
-xc_psr.c:161:5: error: conflicting types for 'xc_psr_cmt_get_data'
-due to enum/integer mismatch;
-
-Signed-off-by: Charles Arnold <carnold@suse.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Acked-by: Anthony PERARD <anthony.perard@citrix.com>
-master commit: 8eeae8c2b4efefda8e946461e86cf2ae9c18e5a9
-master date: 2022-07-06 13:06:40 +0200
----
- tools/include/xenctrl.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/include/xenctrl.h b/tools/include/xenctrl.h
-index 07b96e6671a5..893ae39e4a95 100644
---- a/tools/include/xenctrl.h
-+++ b/tools/include/xenctrl.h
-@@ -2516,7 +2516,7 @@ int xc_psr_cmt_get_l3_event_mask(xc_interface *xch, uint32_t *event_mask);
- int xc_psr_cmt_get_l3_cache_size(xc_interface *xch, uint32_t cpu,
- uint32_t *l3_cache_size);
- int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid, uint32_t cpu,
-- uint32_t psr_cmt_type, uint64_t *monitor_data,
-+ xc_psr_cmt_type type, uint64_t *monitor_data,
- uint64_t *tsc);
- int xc_psr_cmt_enabled(xc_interface *xch);
-
---
-2.35.1
-
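gcc13 diagnoses enum/integer mismatches between a declaration and its definition that older compilers tolerated, and -Werror turns the diagnostic into a build failure. A minimal self-contained reproduction of the class of error, with illustrative types:

    #include <stdint.h>

    enum kind { KIND_A, KIND_B };

    /* Declaration with a plain integer type, as xenctrl.h had... */
    int get_data(uint32_t type, uint64_t *out);

    /*
     * ...followed by a definition using the enum: gcc13 reports
     * "conflicting types ... due to enum/integer mismatch". The fix is
     * to make the declaration use the enum type too.
     */
    int get_data(enum kind type, uint64_t *out)
    {
        *out = (type == KIND_A) ? 1 : 2;
        return 0;
    }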
diff --git a/0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch b/0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch
deleted file mode 100644
index 1797a8f..0000000
--- a/0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 61b9c2ceeb94b0cdaff01023cc5523b1f13e66e2 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 12 Jul 2022 11:14:34 +0200
-Subject: [PATCH 39/51] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio
- sub-option
-
-This was an oversight from when unpriv-mmio was introduced.
-
-Fixes: 8c24b70fedcb ("x86/spec-ctrl: Add spec-ctrl=unpriv-mmio")
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 4cdb519d797c19ebb8fadc5938cdb47479d5a21b
-master date: 2022-07-11 15:21:35 +0100
----
- xen/arch/x86/spec_ctrl.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index dfdd45c358c4..ae74943c1053 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -122,6 +122,7 @@ static int __init parse_spec_ctrl(const char *s)
- opt_l1d_flush = 0;
- opt_branch_harden = false;
- opt_srb_lock = 0;
-+ opt_unpriv_mmio = false;
- }
- else if ( val > 0 )
- rc = -EINVAL;
---
-2.35.1
-
diff --git a/0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch b/0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch
deleted file mode 100644
index 3512590..0000000
--- a/0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch
+++ /dev/null
@@ -1,87 +0,0 @@
-From eec5b02403a9df2523527caad24f17af5060fbe7 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 12 Jul 2022 11:15:03 +0200
-Subject: [PATCH 40/51] xen/cmdline: Extend parse_boolean() to signal a name
- match
-
-This will help parsing a sub-option which has boolean and non-boolean options
-available.
-
-First, rework 'int val' into 'bool has_neg_prefix'. This inverts its value,
-but the resulting logic is far easier to follow.
-
-Second, reject anything of the form 'no-$FOO=', which excludes ambiguous
-constructs such as 'no-$foo=yes' that have never been valid.
-
-This just leaves the case where everything is otherwise fine, but parse_bool()
-can't interpret the provided string.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Juergen Gross <jgross@suse.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 382326cac528dd1eb0d04efd5c05363c453e29f4
-master date: 2022-07-11 15:21:35 +0100
----
- xen/common/kernel.c | 20 ++++++++++++++++----
- xen/include/xen/lib.h | 3 ++-
- 2 files changed, 18 insertions(+), 5 deletions(-)
-
-diff --git a/xen/common/kernel.c b/xen/common/kernel.c
-index e119e5401f9d..7ed96521f97a 100644
---- a/xen/common/kernel.c
-+++ b/xen/common/kernel.c
-@@ -272,9 +272,9 @@ int parse_bool(const char *s, const char *e)
- int parse_boolean(const char *name, const char *s, const char *e)
- {
- size_t slen, nlen;
-- int val = !!strncmp(s, "no-", 3);
-+ bool has_neg_prefix = !strncmp(s, "no-", 3);
-
-- if ( !val )
-+ if ( has_neg_prefix )
- s += 3;
-
- slen = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s);
-@@ -286,11 +286,23 @@ int parse_boolean(const char *name, const char *s, const char *e)
-
- /* Exact, unadorned name? Result depends on the 'no-' prefix. */
- if ( slen == nlen )
-- return val;
-+ return !has_neg_prefix;
-+
-+ /* Inexact match with a 'no-' prefix? Not valid. */
-+ if ( has_neg_prefix )
-+ return -1;
-
- /* =$SOMETHING? Defer to the regular boolean parsing. */
- if ( s[nlen] == '=' )
-- return parse_bool(&s[nlen + 1], e);
-+ {
-+ int b = parse_bool(&s[nlen + 1], e);
-+
-+ if ( b >= 0 )
-+ return b;
-+
-+ /* Not a boolean, but the name matched. Signal specially. */
-+ return -2;
-+ }
-
- /* Unrecognised. Give up. */
- return -1;
-diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
-index c6987973bf88..2296044caf79 100644
---- a/xen/include/xen/lib.h
-+++ b/xen/include/xen/lib.h
-@@ -80,7 +80,8 @@ int parse_bool(const char *s, const char *e);
- /**
- * Given a specific name, parses a string of the form:
- * [no-]$NAME[=...]
-- * returning 0 or 1 for a recognised boolean, or -1 for an error.
-+ * returning 0 or 1 for a recognised boolean. Returns -1 for general errors,
-+ * and -2 for "not a boolean, but $NAME= matches".
- */
- int parse_boolean(const char *name, const char *s, const char *e);
-
---
-2.35.1
-
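With the extension in place, the full set of results for a sub-option named "rsb" becomes (worked from the code above):

    "rsb"          ->  1   (bare name)
    "no-rsb"       ->  0   (negated name)
    "rsb=yes"      ->  1   (deferred to parse_bool())
    "rsb=pv"       -> -2   (name matched, value not a boolean)
    "no-rsb=yes"   -> -1   (newly rejected as ambiguous)
    "rsbs"         -> -1   (no match)

The -2 case is what the next patch builds on to accept pv/hvm sub-suboptions.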
diff --git a/0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch b/0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch
deleted file mode 100644
index 9964bb9..0000000
--- a/0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch
+++ /dev/null
@@ -1,137 +0,0 @@
-From f066c8bb3e5686141cef6fa1dc86ea9f37c5388a Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 12 Jul 2022 11:15:37 +0200
-Subject: [PATCH 41/51] x86/spec-ctrl: Add fine-grained cmdline suboptions for
- primitives
-
-Support controlling the PV/HVM suboption of msr-sc/rsb/md-clear, which
-previously wasn't possible.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 27357c394ba6e1571a89105b840ce1c6f026485c
-master date: 2022-07-11 15:21:35 +0100
----
- docs/misc/xen-command-line.pandoc | 12 ++++--
- xen/arch/x86/spec_ctrl.c | 66 ++++++++++++++++++++++++++-----
- 2 files changed, 66 insertions(+), 12 deletions(-)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index 46e9c58d35cd..1bbdb55129cc 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2233,7 +2233,8 @@ not be able to control the state of the mitigation.
- By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
-
- ### spec-ctrl (x86)
--> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
-+> `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
-+> {msr-sc,rsb,md-clear}=<bool>|{pv,hvm}=<bool>,
- > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
- > eager-fpu,l1d-flush,branch-harden,srb-lock,
- > unpriv-mmio}=<bool> ]`
-@@ -2258,12 +2259,17 @@ in place for guests to use.
-
- Use of a positive boolean value for either of these options is invalid.
-
--The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine
-+The `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` options offer fine
- grained control over the primitives by Xen. These impact Xen's ability to
--protect itself, and Xen's ability to virtualise support for guests to use.
-+protect itself, and/or Xen's ability to virtualise support for guests to use.
-
- * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests
- respectively.
-+* Each other option can be used either as a plain boolean
-+ (e.g. `spec-ctrl=rsb` to control both the PV and HVM sub-options), or with
-+ `pv=` or `hvm=` subsuboptions (e.g. `spec-ctrl=rsb=no-hvm` to disable HVM
-+ RSB only).
-+
- * `msr-sc=` offers control over Xen's support for manipulating `MSR_SPEC_CTRL`
- on entry and exit. These blocks are necessary to virtualise support for
- guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc.
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index ae74943c1053..9507e5da60a9 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -147,20 +147,68 @@ static int __init parse_spec_ctrl(const char *s)
- opt_rsb_hvm = val;
- opt_md_clear_hvm = val;
- }
-- else if ( (val = parse_boolean("msr-sc", s, ss)) >= 0 )
-+ else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 )
- {
-- opt_msr_sc_pv = val;
-- opt_msr_sc_hvm = val;
-+ switch ( val )
-+ {
-+ case 0:
-+ case 1:
-+ opt_msr_sc_pv = opt_msr_sc_hvm = val;
-+ break;
-+
-+ case -2:
-+ s += strlen("msr-sc=");
-+ if ( (val = parse_boolean("pv", s, ss)) >= 0 )
-+ opt_msr_sc_pv = val;
-+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
-+ opt_msr_sc_hvm = val;
-+ else
-+ default:
-+ rc = -EINVAL;
-+ break;
-+ }
- }
-- else if ( (val = parse_boolean("rsb", s, ss)) >= 0 )
-+ else if ( (val = parse_boolean("rsb", s, ss)) != -1 )
- {
-- opt_rsb_pv = val;
-- opt_rsb_hvm = val;
-+ switch ( val )
-+ {
-+ case 0:
-+ case 1:
-+ opt_rsb_pv = opt_rsb_hvm = val;
-+ break;
-+
-+ case -2:
-+ s += strlen("rsb=");
-+ if ( (val = parse_boolean("pv", s, ss)) >= 0 )
-+ opt_rsb_pv = val;
-+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
-+ opt_rsb_hvm = val;
-+ else
-+ default:
-+ rc = -EINVAL;
-+ break;
-+ }
- }
-- else if ( (val = parse_boolean("md-clear", s, ss)) >= 0 )
-+ else if ( (val = parse_boolean("md-clear", s, ss)) != -1 )
- {
-- opt_md_clear_pv = val;
-- opt_md_clear_hvm = val;
-+ switch ( val )
-+ {
-+ case 0:
-+ case 1:
-+ opt_md_clear_pv = opt_md_clear_hvm = val;
-+ break;
-+
-+ case -2:
-+ s += strlen("md-clear=");
-+ if ( (val = parse_boolean("pv", s, ss)) >= 0 )
-+ opt_md_clear_pv = val;
-+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
-+ opt_md_clear_hvm = val;
-+ else
-+ default:
-+ rc = -EINVAL;
-+ break;
-+ }
- }
-
- /* Xen's speculative sidechannel mitigation settings. */
---
-2.35.1
-
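
The suboption grammar removed above (and reused later for `ibpb-entry=`) rests on a three-way contract from parse_boolean(). Below is a minimal user-space model of that contract; the signature is simplified and the harness is hypothetical (the real Xen prototype also takes an end-of-token pointer):

#include <stdio.h>
#include <string.h>

/*
 * Minimal model of the parse_boolean() contract used above: 1/0 for
 * "name", "no-name" or "name=<bool>", -2 for "name=<anything else>",
 * and -1 when the token is a different option entirely.  Sketch only,
 * not the hypervisor implementation.
 */
static int parse_boolean(const char *name, const char *s)
{
    size_t nlen = strlen(name);

    if (!strncmp(s, "no-", 3) && !strcmp(s + 3, name))
        return 0;

    if (strncmp(s, name, nlen))
        return -1;                      /* different option */

    if (s[nlen] == '\0')
        return 1;                       /* plain "name" */

    if (s[nlen] != '=')
        return -1;                      /* e.g. "rsbx" vs "rsb" */

    if (!strcmp(s + nlen + 1, "1") || !strcmp(s + nlen + 1, "yes"))
        return 1;
    if (!strcmp(s + nlen + 1, "0") || !strcmp(s + nlen + 1, "no"))
        return 0;

    return -2;                          /* "name=<complex>": caller recurses */
}

int main(void)
{
    const char *tokens[] = { "rsb", "no-rsb", "rsb=0", "rsb=no-hvm", "msr-sc" };

    for (unsigned int i = 0; i < sizeof(tokens) / sizeof(*tokens); i++)
        printf("%-10s -> %d\n", tokens[i], parse_boolean("rsb", tokens[i]));
    return 0;
}

On -2 the caller advances past the "name=" prefix and re-runs parse_boolean() for "pv"/"hvm", which is exactly the `case -2:` branch visible in the hunk above.
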
diff --git a/0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch b/0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch
deleted file mode 100644
index eea790a..0000000
--- a/0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-From 14fd97e3de939a63a6e467f240efb49fe226a5dc Mon Sep 17 00:00:00 2001
-From: Anthony PERARD <anthony.perard@citrix.com>
-Date: Tue, 12 Jul 2022 11:16:10 +0200
-Subject: [PATCH 42/51] tools/helpers: fix build of xen-init-dom0 with -Werror
-
-Missing prototype of asprintf() without _GNU_SOURCE.
-
-Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
-Reviewed-by: Henry Wang <Henry.Wang@arm.com>
-master commit: d693b22733044d68e9974766b5c9e6259c9b1708
-master date: 2022-07-12 08:38:35 +0200
----
- tools/helpers/xen-init-dom0.c | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/tools/helpers/xen-init-dom0.c b/tools/helpers/xen-init-dom0.c
-index c99224a4b607..b4861c9e8041 100644
---- a/tools/helpers/xen-init-dom0.c
-+++ b/tools/helpers/xen-init-dom0.c
-@@ -1,3 +1,5 @@
-+#define _GNU_SOURCE
-+
- #include <stdlib.h>
- #include <stdint.h>
- #include <string.h>
---
-2.35.1
-
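
The xen-init-dom0 fix is a one-definition change; here is a standalone reproduction of the failure mode it addresses (the path string is an arbitrary example, not taken from the tool):

#define _GNU_SOURCE   /* must precede the first libc header, as in the patch */

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char *path = NULL;

    /* Without _GNU_SOURCE, glibc's stdio.h hides this prototype and a
     * -Werror build fails on the implicit declaration. */
    if (asprintf(&path, "/local/domain/%u", 0u) < 0)
        return 1;

    puts(path);
    free(path);
    return 0;
}
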
diff --git a/0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch b/0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch
deleted file mode 100644
index 0c2470a..0000000
--- a/0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-From 744accad1b73223b3261e3e678e16e030d83b179 Mon Sep 17 00:00:00 2001
-From: Anthony PERARD <anthony.perard@citrix.com>
-Date: Tue, 12 Jul 2022 11:16:30 +0200
-Subject: [PATCH 43/51] libxl: check return value of libxl__xs_directory in
- name2bdf
-
-libxl__xs_directory() can potentially return NULL without setting `n`.
-As `n` isn't initialised, we need to check libxl__xs_directory()
-return value before checking `n`. Otherwise, `n` might be non-zero
-with `bdfs` NULL which would lead to a segv.
-
-Fixes: 57bff091f4 ("libxl: add 'name' field to 'libxl_device_pci' in the IDL...")
-Reported-by: "G.R." <firemeteor@users.sourceforge.net>
-Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
-Reviewed-by: Juergen Gross <jgross@suse.com>
-Tested-by: "G.R." <firemeteor@users.sourceforge.net>
-master commit: d778089ac70e5b8e3bdea0c85fc8c0b9ed0eaf2f
-master date: 2022-07-12 08:38:51 +0200
----
- tools/libs/light/libxl_pci.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/libs/light/libxl_pci.c b/tools/libs/light/libxl_pci.c
-index 4bbbfe9f168f..ce3bf7c0ae81 100644
---- a/tools/libs/light/libxl_pci.c
-+++ b/tools/libs/light/libxl_pci.c
-@@ -859,7 +859,7 @@ static int name2bdf(libxl__gc *gc, libxl_device_pci *pci)
- int rc = ERROR_NOTFOUND;
-
- bdfs = libxl__xs_directory(gc, XBT_NULL, PCI_INFO_PATH, &n);
-- if (!n)
-+ if (!bdfs || !n)
- goto out;
-
- for (i = 0; i < n; i++) {
---
-2.35.1
-
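
The name2bdf() fix guards a classic out-parameter hazard. A self-contained sketch of the pattern, with list_dir() as a hypothetical stand-in for libxl__xs_directory():

#include <stdio.h>

/* Hypothetical stand-in for libxl__xs_directory(): on failure it returns
 * NULL and leaves *num untouched, which is the behaviour the fix guards
 * against. */
static char **list_dir(unsigned int *num)
{
    (void)num;          /* simulated failure: *num deliberately not written */
    return NULL;
}

int main(void)
{
    unsigned int n;     /* uninitialised, exactly as in name2bdf() */
    char **bdfs = list_dir(&n);

    /* Testing only n would read an indeterminate value and could walk a
     * NULL array; the one-line fix is to test the pointer first. */
    if (!bdfs || !n)
        return 0;

    for (unsigned int i = 0; i < n; i++)
        puts(bdfs[i]);
    return 0;
}
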
diff --git a/0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch b/0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch
deleted file mode 100644
index d8517f8..0000000
--- a/0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch
+++ /dev/null
@@ -1,167 +0,0 @@
-From 3a280cbae7022b83af91c27a8e2211ba3b1234f5 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Fri, 1 Jul 2022 15:59:40 +0100
-Subject: [PATCH 44/51] x86/spec-ctrl: Rework spec_ctrl_flags context switching
-
-We are shortly going to need to context switch new bits in both the vcpu and
-S3 paths. Introduce SCF_IST_MASK and SCF_DOM_MASK, and rework d->arch.verw
-into d->arch.spec_ctrl_flags to accommodate.
-
-No functional change.
-
-This is part of XSA-407.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 5796912f7279d9348a3166655588d30eae9f72cc)
----
- xen/arch/x86/acpi/power.c | 8 ++++----
- xen/arch/x86/domain.c | 8 ++++----
- xen/arch/x86/spec_ctrl.c | 9 ++++++---
- xen/include/asm-x86/domain.h | 3 +--
- xen/include/asm-x86/spec_ctrl.h | 30 ++++++++++++++++++++++++++++-
- xen/include/asm-x86/spec_ctrl_asm.h | 3 ---
- 6 files changed, 44 insertions(+), 17 deletions(-)
-
-diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
-index 5eaa77f66a28..dd397f713067 100644
---- a/xen/arch/x86/acpi/power.c
-+++ b/xen/arch/x86/acpi/power.c
-@@ -248,8 +248,8 @@ static int enter_state(u32 state)
- error = 0;
-
- ci = get_cpu_info();
-- /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */
-- ci->spec_ctrl_flags &= ~SCF_ist_wrmsr;
-+ /* Avoid NMI/#MC using unsafe MSRs until we've reloaded microcode. */
-+ ci->spec_ctrl_flags &= ~SCF_IST_MASK;
-
- ACPI_FLUSH_CPU_CACHE();
-
-@@ -292,8 +292,8 @@ static int enter_state(u32 state)
- if ( !recheck_cpu_features(0) )
- panic("Missing previously available feature(s)\n");
-
-- /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */
-- ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr);
-+ /* Re-enabled default NMI/#MC use of MSRs now microcode is loaded. */
-+ ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_IST_MASK);
-
- if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) )
- {
-diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
-index 1fe6644a71ae..82a0b73cf6ef 100644
---- a/xen/arch/x86/domain.c
-+++ b/xen/arch/x86/domain.c
-@@ -2092,10 +2092,10 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
- }
- }
-
-- /* Update the top-of-stack block with the VERW disposition. */
-- info->spec_ctrl_flags &= ~SCF_verw;
-- if ( nextd->arch.verw )
-- info->spec_ctrl_flags |= SCF_verw;
-+ /* Update the top-of-stack block with the new spec_ctrl settings. */
-+ info->spec_ctrl_flags =
-+ (info->spec_ctrl_flags & ~SCF_DOM_MASK) |
-+ (nextd->arch.spec_ctrl_flags & SCF_DOM_MASK);
- }
-
- sched_context_switched(prev, next);
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 9507e5da60a9..7e646680f1c7 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -1010,9 +1010,12 @@ void spec_ctrl_init_domain(struct domain *d)
- {
- bool pv = is_pv_domain(d);
-
-- d->arch.verw =
-- (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
-- (opt_fb_clear_mmio && is_iommu_enabled(d));
-+ bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
-+ (opt_fb_clear_mmio && is_iommu_enabled(d)));
-+
-+ d->arch.spec_ctrl_flags =
-+ (verw ? SCF_verw : 0) |
-+ 0;
- }
-
- void __init init_speculation_mitigations(void)
-diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
-index 2398a1d99da9..e4c099262cb7 100644
---- a/xen/include/asm-x86/domain.h
-+++ b/xen/include/asm-x86/domain.h
-@@ -319,8 +319,7 @@ struct arch_domain
- uint32_t pci_cf8;
- uint8_t cmos_idx;
-
-- /* Use VERW on return-to-guest for its flushing side effect. */
-- bool verw;
-+ uint8_t spec_ctrl_flags; /* See SCF_DOM_MASK */
-
- union {
- struct pv_domain pv;
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index 7e83e0179fb9..3cd72e40305f 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -20,12 +20,40 @@
- #ifndef __X86_SPEC_CTRL_H__
- #define __X86_SPEC_CTRL_H__
-
--/* Encoding of cpuinfo.spec_ctrl_flags */
-+/*
-+ * Encoding of:
-+ * cpuinfo.spec_ctrl_flags
-+ * default_spec_ctrl_flags
-+ * domain.spec_ctrl_flags
-+ *
-+ * Live settings are in the top-of-stack block, because they need to be
-+ * accessible when XPTI is active. Some settings are fixed from boot, some
-+ * context switched per domain, and some inhibited in the S3 path.
-+ */
- #define SCF_use_shadow (1 << 0)
- #define SCF_ist_wrmsr (1 << 1)
- #define SCF_ist_rsb (1 << 2)
- #define SCF_verw (1 << 3)
-
-+/*
-+ * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some
-+ * functionality requires updated microcode to work.
-+ *
-+ * On boot, this is easy; we load microcode before figuring out which
-+ * speculative protections to apply. However, on the S3 resume path, we must
-+ * be able to disable the configured mitigations until microcode is reloaded.
-+ *
-+ * These are the controls to inhibit on the S3 resume path until microcode has
-+ * been reloaded.
-+ */
-+#define SCF_IST_MASK (SCF_ist_wrmsr)
-+
-+/*
-+ * Some speculative protections are per-domain. These settings are merged
-+ * into the top-of-stack block in the context switch path.
-+ */
-+#define SCF_DOM_MASK (SCF_verw)
-+
- #ifndef __ASSEMBLY__
-
- #include <asm/alternative.h>
-diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
-index 5a590bac44aa..66b00d511fc6 100644
---- a/xen/include/asm-x86/spec_ctrl_asm.h
-+++ b/xen/include/asm-x86/spec_ctrl_asm.h
-@@ -248,9 +248,6 @@
-
- /*
- * Use in IST interrupt/exception context. May interrupt Xen or PV context.
-- * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume
-- * path to avoid using MSR_SPEC_CTRL before the microcode introducing it has
-- * been reloaded.
- */
- .macro SPEC_CTRL_ENTRY_FROM_INTR_IST
- /*
---
-2.35.1
-
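
The merge this patch performs in context_switch() is a plain mask-and-or over the flag byte. A small demonstration using the SCF_* definitions from the hunk above (the harness is illustrative; in Xen the byte lives in the top-of-stack cpuinfo block):

#include <stdint.h>
#include <stdio.h>

/* Bit and mask definitions copied from the hunk above. */
#define SCF_use_shadow (1 << 0)
#define SCF_ist_wrmsr  (1 << 1)
#define SCF_ist_rsb    (1 << 2)
#define SCF_verw       (1 << 3)
#define SCF_IST_MASK   (SCF_ist_wrmsr)
#define SCF_DOM_MASK   (SCF_verw)

int main(void)
{
    uint8_t cpu_flags = SCF_ist_wrmsr | SCF_ist_rsb; /* top-of-stack block */
    uint8_t dom_flags = SCF_verw;                    /* incoming domain */

    /* The context-switch merge: clear the per-domain subset, then copy in
     * the next domain's bits.  All other flags are preserved. */
    cpu_flags = (cpu_flags & ~SCF_DOM_MASK) | (dom_flags & SCF_DOM_MASK);
    printf("after ctxt switch: %#x\n", cpu_flags);

    /* The S3 path masks the IST bits off until microcode is reloaded. */
    cpu_flags &= ~SCF_IST_MASK;
    printf("during S3 resume:  %#x\n", cpu_flags);
    return 0;
}
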
diff --git a/0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch b/0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch
deleted file mode 100644
index 5b841a6..0000000
--- a/0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch
+++ /dev/null
@@ -1,110 +0,0 @@
-From 31aa2a20bfefc3a8a200da54a56471bf99f9630e Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 28 Jun 2022 14:36:56 +0100
-Subject: [PATCH 45/51] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr
-
-We are about to introduce SCF_ist_ibpb, at which point SCF_ist_wrmsr becomes
-ambiguous.
-
-No functional change.
-
-This is part of XSA-407.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 76d6a36f645dfdbad8830559d4d52caf36efc75e)
----
- xen/arch/x86/spec_ctrl.c | 6 +++---
- xen/include/asm-x86/spec_ctrl.h | 4 ++--
- xen/include/asm-x86/spec_ctrl_asm.h | 8 ++++----
- 3 files changed, 9 insertions(+), 9 deletions(-)
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 7e646680f1c7..89f95c083e1b 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -1115,7 +1115,7 @@ void __init init_speculation_mitigations(void)
- {
- if ( opt_msr_sc_pv )
- {
-- default_spec_ctrl_flags |= SCF_ist_wrmsr;
-+ default_spec_ctrl_flags |= SCF_ist_sc_msr;
- setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV);
- }
-
-@@ -1126,7 +1126,7 @@ void __init init_speculation_mitigations(void)
- * Xen's value is not restored atomically. An early NMI hitting
- * the VMExit path needs to restore Xen's value for safety.
- */
-- default_spec_ctrl_flags |= SCF_ist_wrmsr;
-+ default_spec_ctrl_flags |= SCF_ist_sc_msr;
- setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM);
- }
- }
-@@ -1139,7 +1139,7 @@ void __init init_speculation_mitigations(void)
- * on real hardware matches the availability of MSR_SPEC_CTRL in the
- * first place.
- *
-- * No need for SCF_ist_wrmsr because Xen's value is restored
-+ * No need for SCF_ist_sc_msr because Xen's value is restored
- * atomically WRT NMIs in the VMExit path.
- *
- * TODO: Adjust cpu_has_svm_spec_ctrl to be usable earlier on boot.
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index 3cd72e40305f..f8f0ac47e759 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -31,7 +31,7 @@
- * context switched per domain, and some inhibited in the S3 path.
- */
- #define SCF_use_shadow (1 << 0)
--#define SCF_ist_wrmsr (1 << 1)
-+#define SCF_ist_sc_msr (1 << 1)
- #define SCF_ist_rsb (1 << 2)
- #define SCF_verw (1 << 3)
-
-@@ -46,7 +46,7 @@
- * These are the controls to inhibit on the S3 resume path until microcode has
- * been reloaded.
- */
--#define SCF_IST_MASK (SCF_ist_wrmsr)
-+#define SCF_IST_MASK (SCF_ist_sc_msr)
-
- /*
- * Some speculative protections are per-domain. These settings are merged
-diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
-index 66b00d511fc6..0ff1b118f882 100644
---- a/xen/include/asm-x86/spec_ctrl_asm.h
-+++ b/xen/include/asm-x86/spec_ctrl_asm.h
-@@ -266,8 +266,8 @@
-
- .L\@_skip_rsb:
-
-- test $SCF_ist_wrmsr, %al
-- jz .L\@_skip_wrmsr
-+ test $SCF_ist_sc_msr, %al
-+ jz .L\@_skip_msr_spec_ctrl
-
- xor %edx, %edx
- testb $3, UREGS_cs(%rsp)
-@@ -290,7 +290,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise):
- * to speculate around the WRMSR. As a result, we need a dispatch
- * serialising instruction in the else clause.
- */
--.L\@_skip_wrmsr:
-+.L\@_skip_msr_spec_ctrl:
- lfence
- UNLIKELY_END(\@_serialise)
- .endm
-@@ -301,7 +301,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise):
- * Requires %rbx=stack_end
- * Clobbers %rax, %rcx, %rdx
- */
-- testb $SCF_ist_wrmsr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx)
-+ testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx)
- jz .L\@_skip
-
- DO_SPEC_CTRL_EXIT_TO_XEN
---
-2.35.1
-
diff --git a/0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch b/0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch
deleted file mode 100644
index a950639..0000000
--- a/0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch
+++ /dev/null
@@ -1,97 +0,0 @@
-From e7671561c84322860875745e57b228a7a310f2bf Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Mon, 4 Jul 2022 21:32:17 +0100
-Subject: [PATCH 46/51] x86/spec-ctrl: Rename opt_ibpb to opt_ibpb_ctxt_switch
-
-We are about to introduce the use of IBPB at different points in Xen, making
-opt_ibpb ambiguous. Rename it to opt_ibpb_ctxt_switch.
-
-No functional change.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit a8e5ef079d6f5c88c472e3e620db5a8d1402a50d)
----
- xen/arch/x86/domain.c | 2 +-
- xen/arch/x86/spec_ctrl.c | 10 +++++-----
- xen/include/asm-x86/spec_ctrl.h | 2 +-
- 3 files changed, 7 insertions(+), 7 deletions(-)
-
-diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
-index 82a0b73cf6ef..0d39981550ca 100644
---- a/xen/arch/x86/domain.c
-+++ b/xen/arch/x86/domain.c
-@@ -2064,7 +2064,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
-
- ctxt_switch_levelling(next);
-
-- if ( opt_ibpb && !is_idle_domain(nextd) )
-+ if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) )
- {
- static DEFINE_PER_CPU(unsigned int, last);
- unsigned int *last_id = &this_cpu(last);
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 89f95c083e1b..f4ae36eae2d0 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -54,7 +54,7 @@ int8_t __initdata opt_stibp = -1;
- bool __read_mostly opt_ssbd;
- int8_t __initdata opt_psfd = -1;
-
--bool __read_mostly opt_ibpb = true;
-+bool __read_mostly opt_ibpb_ctxt_switch = true;
- int8_t __read_mostly opt_eager_fpu = -1;
- int8_t __read_mostly opt_l1d_flush = -1;
- static bool __initdata opt_branch_harden = true;
-@@ -117,7 +117,7 @@ static int __init parse_spec_ctrl(const char *s)
-
- opt_thunk = THUNK_JMP;
- opt_ibrs = 0;
-- opt_ibpb = false;
-+ opt_ibpb_ctxt_switch = false;
- opt_ssbd = false;
- opt_l1d_flush = 0;
- opt_branch_harden = false;
-@@ -238,7 +238,7 @@ static int __init parse_spec_ctrl(const char *s)
-
- /* Misc settings. */
- else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 )
-- opt_ibpb = val;
-+ opt_ibpb_ctxt_switch = val;
- else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 )
- opt_eager_fpu = val;
- else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
-@@ -458,7 +458,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (opt_tsx & 1) ? " TSX+" : " TSX-",
- !cpu_has_srbds_ctrl ? "" :
- opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-",
-- opt_ibpb ? " IBPB" : "",
-+ opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "",
- opt_l1d_flush ? " L1D_FLUSH" : "",
- opt_md_clear_pv || opt_md_clear_hvm ||
- opt_fb_clear_mmio ? " VERW" : "",
-@@ -1240,7 +1240,7 @@ void __init init_speculation_mitigations(void)
-
- /* Check we have hardware IBPB support before using it... */
- if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
-- opt_ibpb = false;
-+ opt_ibpb_ctxt_switch = false;
-
- /* Check whether Eager FPU should be enabled by default. */
- if ( opt_eager_fpu == -1 )
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index f8f0ac47e759..fb4365575620 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -63,7 +63,7 @@
- void init_speculation_mitigations(void);
- void spec_ctrl_init_domain(struct domain *d);
-
--extern bool opt_ibpb;
-+extern bool opt_ibpb_ctxt_switch;
- extern bool opt_ssbd;
- extern int8_t opt_eager_fpu;
- extern int8_t opt_l1d_flush;
---
-2.35.1
-
diff --git a/0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch b/0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch
deleted file mode 100644
index 3ce9fd9..0000000
--- a/0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch
+++ /dev/null
@@ -1,106 +0,0 @@
-From 2a9e690a0ad5d54dca4166e089089a07bbe7fc85 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Fri, 1 Jul 2022 15:59:40 +0100
-Subject: [PATCH 47/51] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST
-
-We are shortly going to add a conditional IBPB in this path.
-
-Therefore, we cannot hold spec_ctrl_flags in %eax, and rely on only clobbering
-it after we're done with its contents. %rbx is available for use, and the
-more normal register to hold preserved information in.
-
-With %rax freed up, use it instead of %rdx for the RSB tmp register, and for
-the adjustment to spec_ctrl_flags.
-
-This leaves no use of %rdx, except as 0 for the upper half of WRMSR. In
-practice, %rdx is 0 from SAVE_ALL on all paths and isn't likely to change in
-the foreseeable future, so update the macro entry requirements to state this
-dependency. This marginal optimisation can be revisited if circumstances
-change.
-
-No practical change.
-
-This is part of XSA-407.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit e9b8d31981f184c6539f91ec54bd9cae29cdae36)
----
- xen/arch/x86/x86_64/entry.S | 4 ++--
- xen/include/asm-x86/spec_ctrl_asm.h | 21 ++++++++++-----------
- 2 files changed, 12 insertions(+), 13 deletions(-)
-
-diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
-index 2a86938f1f32..a1810bf4d311 100644
---- a/xen/arch/x86/x86_64/entry.S
-+++ b/xen/arch/x86/x86_64/entry.S
-@@ -932,7 +932,7 @@ ENTRY(double_fault)
-
- GET_STACK_END(14)
-
-- SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx
-@@ -968,7 +968,7 @@ handle_ist_exception:
-
- GET_STACK_END(14)
-
-- SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
-diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
-index 0ff1b118f882..15e24cde00d1 100644
---- a/xen/include/asm-x86/spec_ctrl_asm.h
-+++ b/xen/include/asm-x86/spec_ctrl_asm.h
-@@ -251,34 +251,33 @@
- */
- .macro SPEC_CTRL_ENTRY_FROM_INTR_IST
- /*
-- * Requires %rsp=regs, %r14=stack_end
-- * Clobbers %rax, %rcx, %rdx
-+ * Requires %rsp=regs, %r14=stack_end, %rdx=0
-+ * Clobbers %rax, %rbx, %rcx, %rdx
- *
- * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY
- * maybexen=1, but with conditionals rather than alternatives.
- */
-- movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %eax
-+ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx
-
-- test $SCF_ist_rsb, %al
-+ test $SCF_ist_rsb, %bl
- jz .L\@_skip_rsb
-
-- DO_OVERWRITE_RSB tmp=rdx /* Clobbers %rcx/%rdx */
-+ DO_OVERWRITE_RSB /* Clobbers %rax/%rcx */
-
- .L\@_skip_rsb:
-
-- test $SCF_ist_sc_msr, %al
-+ test $SCF_ist_sc_msr, %bl
- jz .L\@_skip_msr_spec_ctrl
-
-- xor %edx, %edx
-+ xor %eax, %eax
- testb $3, UREGS_cs(%rsp)
-- setnz %dl
-- not %edx
-- and %dl, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14)
-+ setnz %al
-+ not %eax
-+ and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14)
-
- /* Load Xen's intended value. */
- mov $MSR_SPEC_CTRL, %ecx
- movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax
-- xor %edx, %edx
- wrmsr
-
- /* Opencoded UNLIKELY_START() with no condition. */
---
-2.35.1
-
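
The CPL test retained in the reworked IST path is branchless and easy to misread. A C rendering of the sequence from the hunk (the selector values in the harness are arbitrary examples):

#include <stdint.h>
#include <stdio.h>

#define SCF_use_shadow (1 << 0)

/* C rendering of the branchless sequence in the hunk above:
 *   xor %eax,%eax; testb $3, UREGS_cs(%rsp); setnz %al; not %eax;
 *   and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14)
 * CPL != 0 (guest interrupted) yields mask 0xfe, clearing SCF_use_shadow
 * before Xen's MSR_SPEC_CTRL value is loaded; CPL == 0 yields 0xff and
 * leaves the flags untouched.  Illustrative only; the real logic is asm. */
static uint8_t ist_adjust(uint8_t flags, uint16_t cs)
{
    uint8_t from_guest = (cs & 3) != 0;   /* setnz */
    return flags & (uint8_t)~from_guest;  /* not + and */
}

int main(void)
{
    printf("interrupted Xen:   %#x\n", ist_adjust(SCF_use_shadow | 0x2, 0xe008));
    printf("interrupted guest: %#x\n", ist_adjust(SCF_use_shadow | 0x2, 0xe033));
    return 0;
}
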
diff --git a/0048-x86-spec-ctrl-Support-IBPB-on-entry.patch b/0048-x86-spec-ctrl-Support-IBPB-on-entry.patch
deleted file mode 100644
index d5ad043..0000000
--- a/0048-x86-spec-ctrl-Support-IBPB-on-entry.patch
+++ /dev/null
@@ -1,300 +0,0 @@
-From 76c5fcee9027fb8823dd501086f0ff3ee3c4231c Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 24 Feb 2022 13:44:33 +0000
-Subject: [PATCH 48/51] x86/spec-ctrl: Support IBPB-on-entry
-
-We are going to need this to mitigate Branch Type Confusion on AMD/Hygon CPUs,
-but as we've talked about using it in other cases too, arrange to support it
-generally. However, this is also very expensive in some cases, so we're going
-to want per-domain controls.
-
-Introduce SCF_ist_ibpb and SCF_entry_ibpb controls, adding them to the IST and
-DOM masks as appropriate. Also introduce X86_FEATURE_IBPB_ENTRY_{PV,HVM}
-to patch the code blocks.
-
-For SVM, the STGI is serialising enough to protect against Spectre-v1 attacks,
-so no "else lfence" is necessary. VT-x will use use the MSR host load list,
-so doesn't need any code in the VMExit path.
-
-For the IST path, we can't safely check CPL==0 to skip a flush, as we might
-have hit an entry path before its IBPB. As IST hitting Xen is rare, flush
-irrespective of CPL. A later path, SCF_ist_sc_msr, provides Spectre-v1
-safety.
-
-For the PV paths, we know we're interrupting CPL>0, while for the INTR paths,
-we can safely check CPL==0. Only flush when interrupting guest context.
-
-An "else lfence" is needed for safety, but we want to be able to skip it on
-unaffected CPUs, so the block wants to be an alternative, which means the
-lfence has to be inline rather than UNLIKELY() (the replacement block doesn't
-have displacements fixed up for anything other than the first instruction).
-
-As with SPEC_CTRL_ENTRY_FROM_INTR_IST, %rdx is 0 on entry so rely on this to
-shrink the logic marginally. Update the comments to specify this new
-dependency.
-
-This is part of XSA-407.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 53a570b285694947776d5190f591a0d5b9b18de7)
----
- xen/arch/x86/hvm/svm/entry.S | 18 ++++++++++-
- xen/arch/x86/hvm/vmx/vmcs.c | 4 +++
- xen/arch/x86/x86_64/compat/entry.S | 2 +-
- xen/arch/x86/x86_64/entry.S | 12 +++----
- xen/include/asm-x86/cpufeatures.h | 2 ++
- xen/include/asm-x86/spec_ctrl.h | 6 ++--
- xen/include/asm-x86/spec_ctrl_asm.h | 49 +++++++++++++++++++++++++++--
- 7 files changed, 81 insertions(+), 12 deletions(-)
-
-diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S
-index 4ae55a2ef605..0ff4008060fa 100644
---- a/xen/arch/x86/hvm/svm/entry.S
-+++ b/xen/arch/x86/hvm/svm/entry.S
-@@ -97,7 +97,19 @@ __UNLIKELY_END(nsvm_hap)
-
- GET_CURRENT(bx)
-
-- /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo Clob: acd */
-+ /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo, %rdx=0 Clob: acd */
-+
-+ .macro svm_vmexit_cond_ibpb
-+ testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp)
-+ jz .L_skip_ibpb
-+
-+ mov $MSR_PRED_CMD, %ecx
-+ mov $PRED_CMD_IBPB, %eax
-+ wrmsr
-+.L_skip_ibpb:
-+ .endm
-+ ALTERNATIVE "", svm_vmexit_cond_ibpb, X86_FEATURE_IBPB_ENTRY_HVM
-+
- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM
-
- .macro svm_vmexit_spec_ctrl
-@@ -114,6 +126,10 @@ __UNLIKELY_END(nsvm_hap)
- ALTERNATIVE "", svm_vmexit_spec_ctrl, X86_FEATURE_SC_MSR_HVM
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
-+ /*
-+ * STGI is executed unconditionally, and is sufficiently serialising
-+ * to safely resolve any Spectre-v1 concerns in the above logic.
-+ */
- stgi
- GLOBAL(svm_stgi_label)
- mov %rsp,%rdi
-diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
-index f9f9bc18cdbc..dd817cee4e69 100644
---- a/xen/arch/x86/hvm/vmx/vmcs.c
-+++ b/xen/arch/x86/hvm/vmx/vmcs.c
-@@ -1345,6 +1345,10 @@ static int construct_vmcs(struct vcpu *v)
- rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D,
- VMX_MSR_GUEST_LOADONLY);
-
-+ if ( !rc && (d->arch.spec_ctrl_flags & SCF_entry_ibpb) )
-+ rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB,
-+ VMX_MSR_HOST);
-+
- out:
- vmx_vmcs_exit(v);
-
-diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
-index 5fd6dbbd4513..b86d38d1c50d 100644
---- a/xen/arch/x86/x86_64/compat/entry.S
-+++ b/xen/arch/x86/x86_64/compat/entry.S
-@@ -18,7 +18,7 @@ ENTRY(entry_int82)
- movl $HYPERCALL_VECTOR, 4(%rsp)
- SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */
-
-- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- CR4_PV32_RESTORE
-diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
-index a1810bf4d311..fba8ae498f74 100644
---- a/xen/arch/x86/x86_64/entry.S
-+++ b/xen/arch/x86/x86_64/entry.S
-@@ -260,7 +260,7 @@ ENTRY(lstar_enter)
- movl $TRAP_syscall, 4(%rsp)
- SAVE_ALL
-
-- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- GET_STACK_END(bx)
-@@ -298,7 +298,7 @@ ENTRY(cstar_enter)
- movl $TRAP_syscall, 4(%rsp)
- SAVE_ALL
-
-- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- GET_STACK_END(bx)
-@@ -338,7 +338,7 @@ GLOBAL(sysenter_eflags_saved)
- movl $TRAP_syscall, 4(%rsp)
- SAVE_ALL
-
-- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- GET_STACK_END(bx)
-@@ -392,7 +392,7 @@ ENTRY(int80_direct_trap)
- movl $0x80, 4(%rsp)
- SAVE_ALL
-
-- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- GET_STACK_END(bx)
-@@ -674,7 +674,7 @@ ENTRY(common_interrupt)
-
- GET_STACK_END(14)
-
-- SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
-@@ -708,7 +708,7 @@ GLOBAL(handle_exception)
-
- GET_STACK_END(14)
-
-- SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */
-+ SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */
- /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */
-
- mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
-diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
-index 493d338a085e..672c9ee22ba2 100644
---- a/xen/include/asm-x86/cpufeatures.h
-+++ b/xen/include/asm-x86/cpufeatures.h
-@@ -39,6 +39,8 @@ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
- XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */
- XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
- XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
-+XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen for PV */
-+XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */
-
- /* Bug words follow the synthetic words. */
- #define X86_NR_BUG 1
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index fb4365575620..3fc599a817c4 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -34,6 +34,8 @@
- #define SCF_ist_sc_msr (1 << 1)
- #define SCF_ist_rsb (1 << 2)
- #define SCF_verw (1 << 3)
-+#define SCF_ist_ibpb (1 << 4)
-+#define SCF_entry_ibpb (1 << 5)
-
- /*
- * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some
-@@ -46,13 +48,13 @@
- * These are the controls to inhibit on the S3 resume path until microcode has
- * been reloaded.
- */
--#define SCF_IST_MASK (SCF_ist_sc_msr)
-+#define SCF_IST_MASK (SCF_ist_sc_msr | SCF_ist_ibpb)
-
- /*
- * Some speculative protections are per-domain. These settings are merged
- * into the top-of-stack block in the context switch path.
- */
--#define SCF_DOM_MASK (SCF_verw)
-+#define SCF_DOM_MASK (SCF_verw | SCF_entry_ibpb)
-
- #ifndef __ASSEMBLY__
-
-diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
-index 15e24cde00d1..9eb4ad9ab71d 100644
---- a/xen/include/asm-x86/spec_ctrl_asm.h
-+++ b/xen/include/asm-x86/spec_ctrl_asm.h
-@@ -88,6 +88,35 @@
- * - SPEC_CTRL_EXIT_TO_{SVM,VMX}
- */
-
-+.macro DO_SPEC_CTRL_COND_IBPB maybexen:req
-+/*
-+ * Requires %rsp=regs (also cpuinfo if !maybexen)
-+ * Requires %r14=stack_end (if maybexen), %rdx=0
-+ * Clobbers %rax, %rcx, %rdx
-+ *
-+ * Conditionally issue IBPB if SCF_entry_ibpb is active. In the maybexen
-+ * case, we can safely look at UREGS_cs to skip taking the hit when
-+ * interrupting Xen.
-+ */
-+ .if \maybexen
-+ testb $SCF_entry_ibpb, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14)
-+ jz .L\@_skip
-+ testb $3, UREGS_cs(%rsp)
-+ .else
-+ testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp)
-+ .endif
-+ jz .L\@_skip
-+
-+ mov $MSR_PRED_CMD, %ecx
-+ mov $PRED_CMD_IBPB, %eax
-+ wrmsr
-+ jmp .L\@_done
-+
-+.L\@_skip:
-+ lfence
-+.L\@_done:
-+.endm
-+
- .macro DO_OVERWRITE_RSB tmp=rax
- /*
- * Requires nothing
-@@ -225,12 +254,16 @@
-
- /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */
- #define SPEC_CTRL_ENTRY_FROM_PV \
-+ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \
-+ X86_FEATURE_IBPB_ENTRY_PV; \
- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \
- ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \
- X86_FEATURE_SC_MSR_PV
-
- /* Use in interrupt/exception context. May interrupt Xen or PV context. */
- #define SPEC_CTRL_ENTRY_FROM_INTR \
-+ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \
-+ X86_FEATURE_IBPB_ENTRY_PV; \
- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \
- ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \
- X86_FEATURE_SC_MSR_PV
-@@ -254,11 +287,23 @@
- * Requires %rsp=regs, %r14=stack_end, %rdx=0
- * Clobbers %rax, %rbx, %rcx, %rdx
- *
-- * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY
-- * maybexen=1, but with conditionals rather than alternatives.
-+ * This is logical merge of:
-+ * DO_SPEC_CTRL_COND_IBPB maybexen=0
-+ * DO_OVERWRITE_RSB
-+ * DO_SPEC_CTRL_ENTRY maybexen=1
-+ * but with conditionals rather than alternatives.
- */
- movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx
-
-+ test $SCF_ist_ibpb, %bl
-+ jz .L\@_skip_ibpb
-+
-+ mov $MSR_PRED_CMD, %ecx
-+ mov $PRED_CMD_IBPB, %eax
-+ wrmsr
-+
-+.L\@_skip_ibpb:
-+
- test $SCF_ist_rsb, %bl
- jz .L\@_skip_rsb
-
---
-2.35.1
-
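
The entry-path logic above differs between the PV, INTR and IST flavours. A compact C sketch of the policy only: the printf is a stand-in for the privileged MSR_PRED_CMD write, and the function shape is hypothetical:

#include <stdbool.h>
#include <stdio.h>

#define SCF_entry_ibpb (1 << 5)

/* Policy sketch of DO_SPEC_CTRL_COND_IBPB from the hunk above.  The real
 * code is assembly; the IST flavour additionally omits the CPL test and
 * flushes unconditionally, since it may run before an earlier entry path's
 * own IBPB has happened. */
static void cond_ibpb(unsigned int flags, bool maybexen, unsigned int cpl)
{
    if (!(flags & SCF_entry_ibpb))
        return;                 /* inactive: the real path runs lfence */

    /* Interrupt/exception entries may have interrupted Xen itself; the
     * barrier is skipped for CPL == 0.  PV entries know CPL > 0. */
    if (maybexen && cpl == 0)
        return;

    puts("wrmsr(MSR_PRED_CMD, PRED_CMD_IBPB)");    /* flush the BTB */
}

int main(void)
{
    cond_ibpb(SCF_entry_ibpb, true, 0);    /* interrupted Xen: no flush */
    cond_ibpb(SCF_entry_ibpb, true, 3);    /* interrupted guest: flush */
    cond_ibpb(SCF_entry_ibpb, false, 3);   /* PV syscall path: flush */
    return 0;
}
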
diff --git a/0049-x86-cpuid-Enumeration-for-BTC_NO.patch b/0049-x86-cpuid-Enumeration-for-BTC_NO.patch
deleted file mode 100644
index 0e5d119..0000000
--- a/0049-x86-cpuid-Enumeration-for-BTC_NO.patch
+++ /dev/null
@@ -1,106 +0,0 @@
-From 0826c7596d35c887b3b7858137c7ac374d9ef17a Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Mon, 16 May 2022 15:48:24 +0100
-Subject: [PATCH 49/51] x86/cpuid: Enumeration for BTC_NO
-
-BTC_NO indicates that hardware is not susceptible to Branch Type Confusion.
-
-Zen3 CPUs don't suffer BTC.
-
-This is part of XSA-407.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit 76cb04ad64f3ab9ae785988c40655a71dde9c319)
----
- tools/libs/light/libxl_cpuid.c | 1 +
- tools/misc/xen-cpuid.c | 2 +-
- xen/arch/x86/cpu/amd.c | 10 ++++++++++
- xen/arch/x86/spec_ctrl.c | 5 +++--
- xen/include/public/arch-x86/cpufeatureset.h | 1 +
- 5 files changed, 16 insertions(+), 3 deletions(-)
-
-diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c
-index d462f9e421ed..bf6fdee360a9 100644
---- a/tools/libs/light/libxl_cpuid.c
-+++ b/tools/libs/light/libxl_cpuid.c
-@@ -288,6 +288,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str)
- {"virt-ssbd", 0x80000008, NA, CPUID_REG_EBX, 25, 1},
- {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1},
- {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1},
-+ {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1},
-
- {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8},
- {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4},
-diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
-index bc7dcf55757a..fe22f5f5b68b 100644
---- a/tools/misc/xen-cpuid.c
-+++ b/tools/misc/xen-cpuid.c
-@@ -158,7 +158,7 @@ static const char *const str_e8b[32] =
- /* [22] */ [23] = "ppin",
- [24] = "amd-ssbd", [25] = "virt-ssbd",
- [26] = "ssb-no",
-- [28] = "psfd",
-+ [28] = "psfd", [29] = "btc-no",
- };
-
- static const char *const str_7d0[32] =
-diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
-index b3b9a0df5fed..b158e3acb5c7 100644
---- a/xen/arch/x86/cpu/amd.c
-+++ b/xen/arch/x86/cpu/amd.c
-@@ -847,6 +847,16 @@ static void init_amd(struct cpuinfo_x86 *c)
- warning_add(text);
- }
- break;
-+
-+ case 0x19:
-+ /*
-+ * Zen3 (Fam19h model < 0x10) parts are not susceptible to
-+ * Branch Type Confusion, but predate the allocation of the
-+ * BTC_NO bit. Fill it back in if we're not virtualised.
-+ */
-+ if (!cpu_has_hypervisor && !cpu_has(c, X86_FEATURE_BTC_NO))
-+ __set_bit(X86_FEATURE_BTC_NO, c->x86_capability);
-+ break;
- }
-
- display_cacheinfo(c);
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index f4ae36eae2d0..0f101c057f3e 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -388,7 +388,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- * Hardware read-only information, stating immunity to certain issues, or
- * suggestions of which mitigation to use.
- */
-- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
-+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
- (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "",
- (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "",
- (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
-@@ -403,7 +403,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "",
- (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "",
- (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "",
-- (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "");
-+ (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "",
-+ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "");
-
- /* Hardware features which need driving to mitigate issues. */
- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
-diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
-index 743b857dcd5c..e7b8167800a2 100644
---- a/xen/include/public/arch-x86/cpufeatureset.h
-+++ b/xen/include/public/arch-x86/cpufeatureset.h
-@@ -266,6 +266,7 @@ XEN_CPUFEATURE(AMD_SSBD, 8*32+24) /*S MSR_SPEC_CTRL.SSBD available */
- XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */
- XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */
- XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */
-+XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */
-
- /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */
- XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */
---
-2.35.1
-
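
The BTC_NO backfill in init_amd() is a small, self-contained idiom: synthesise a feature bit on hardware known to be unaffected but too old to enumerate it. A sketch with an illustrative struct in place of cpuinfo_x86 (not the real layout):

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the BTC_NO backfill above: Zen3 (family 0x19) predates the
 * allocation of the CPUID bit, so Xen fills it in when running on bare
 * metal.  Under a hypervisor the bit is left to the host to report. */
struct cpu {
    unsigned int family;
    bool hypervisor;
    bool btc_no;
};

static void backfill_btc_no(struct cpu *c)
{
    if (c->family == 0x19 && !c->hypervisor && !c->btc_no)
        c->btc_no = true;   /* unaffected hardware, bit simply not enumerated */
}

int main(void)
{
    struct cpu zen3 = { .family = 0x19 };

    backfill_btc_no(&zen3);
    printf("BTC_NO: %d\n", zen3.btc_no);
    return 0;
}
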
diff --git a/0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch b/0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch
deleted file mode 100644
index c83844d..0000000
--- a/0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch
+++ /dev/null
@@ -1,106 +0,0 @@
-From 5457a6870eb1369b868f7b8e833966ed43a773ad Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Tue, 15 Mar 2022 18:30:25 +0000
-Subject: [PATCH 50/51] x86/spec-ctrl: Enable Zen2 chickenbit
-
-... as instructed in the Branch Type Confusion whitepaper.
-
-This is part of XSA-407.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-(cherry picked from commit 9deaf2d932f08c16c6b96a1c426e4b1142c0cdbe)
----
- xen/arch/x86/cpu/amd.c | 28 ++++++++++++++++++++++++++++
- xen/arch/x86/cpu/cpu.h | 1 +
- xen/arch/x86/cpu/hygon.c | 6 ++++++
- xen/include/asm-x86/msr-index.h | 1 +
- 4 files changed, 36 insertions(+)
-
-diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
-index b158e3acb5c7..37ac84ddd74d 100644
---- a/xen/arch/x86/cpu/amd.c
-+++ b/xen/arch/x86/cpu/amd.c
-@@ -731,6 +731,31 @@ void amd_init_ssbd(const struct cpuinfo_x86 *c)
- printk_once(XENLOG_ERR "No SSBD controls available\n");
- }
-
-+/*
-+ * On Zen2 we offer this chicken (bit) on the altar of Speculation.
-+ *
-+ * Refer to the AMD Branch Type Confusion whitepaper:
-+ * https://XXX
-+ *
-+ * Setting this unnamed bit supposedly causes prediction information on
-+ * non-branch instructions to be ignored. It is to be set unilaterally in
-+ * newer microcode.
-+ *
-+ * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a
-+ * simple model number comparison, so use STIBP as a heuristic to separate the
-+ * two uarches in Fam17h(AMD)/18h(Hygon).
-+ */
-+void amd_init_spectral_chicken(void)
-+{
-+ uint64_t val, chickenbit = 1 << 1;
-+
-+ if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP))
-+ return;
-+
-+ if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit))
-+ wrmsr_safe(MSR_AMD64_DE_CFG2, val | chickenbit);
-+}
-+
- void __init detect_zen2_null_seg_behaviour(void)
- {
- uint64_t base;
-@@ -796,6 +821,9 @@ static void init_amd(struct cpuinfo_x86 *c)
-
- amd_init_ssbd(c);
-
-+ if (c->x86 == 0x17)
-+ amd_init_spectral_chicken();
-+
- /* Probe for NSCB on Zen2 CPUs when not virtualised */
- if (!cpu_has_hypervisor && !cpu_has_nscb && c == &boot_cpu_data &&
- c->x86 == 0x17)
-diff --git a/xen/arch/x86/cpu/cpu.h b/xen/arch/x86/cpu/cpu.h
-index b593bd85f04f..145bc5156a86 100644
---- a/xen/arch/x86/cpu/cpu.h
-+++ b/xen/arch/x86/cpu/cpu.h
-@@ -22,4 +22,5 @@ void early_init_amd(struct cpuinfo_x86 *c);
- void amd_log_freq(const struct cpuinfo_x86 *c);
- void amd_init_lfence(struct cpuinfo_x86 *c);
- void amd_init_ssbd(const struct cpuinfo_x86 *c);
-+void amd_init_spectral_chicken(void);
- void detect_zen2_null_seg_behaviour(void);
-diff --git a/xen/arch/x86/cpu/hygon.c b/xen/arch/x86/cpu/hygon.c
-index cdc94130dd2e..6f8d491297e8 100644
---- a/xen/arch/x86/cpu/hygon.c
-+++ b/xen/arch/x86/cpu/hygon.c
-@@ -40,6 +40,12 @@ static void init_hygon(struct cpuinfo_x86 *c)
- c->x86 == 0x18)
- detect_zen2_null_seg_behaviour();
-
-+ /*
-+ * TODO: Check heuristic safety with Hygon first
-+ if (c->x86 == 0x18)
-+ amd_init_spectral_chicken();
-+ */
-+
- /*
- * Hygon CPUs before Zen2 don't clear segment bases/limits when
- * loading a NULL selector.
-diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
-index 72bc32ba04ff..d3735e499e0f 100644
---- a/xen/include/asm-x86/msr-index.h
-+++ b/xen/include/asm-x86/msr-index.h
-@@ -361,6 +361,7 @@
- #define MSR_AMD64_DE_CFG 0xc0011029
- #define AMD64_DE_CFG_LFENCE_SERIALISE (_AC(1, ULL) << 1)
- #define MSR_AMD64_EX_CFG 0xc001102c
-+#define MSR_AMD64_DE_CFG2 0xc00110e3
-
- #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027
- #define MSR_AMD64_DR1_ADDRESS_MASK 0xc0011019
---
-2.35.1
-
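
The chickenbit write above follows Xen's usual safe read-modify-write shape. A user-space model where rdmsr_safe()/wrmsr_safe() are stubs over a fake register (the privileged instructions obviously cannot run here; only the set-if-clear pattern is the point):

#include <stdint.h>
#include <stdio.h>

#define MSR_AMD64_DE_CFG2 0xc00110e3u

static uint64_t fake_msr;   /* stands in for the real model-specific register */

static int rdmsr_safe(uint32_t msr, uint64_t *val)
{
    (void)msr;
    *val = fake_msr;
    return 0;
}

static int wrmsr_safe(uint32_t msr, uint64_t val)
{
    (void)msr;
    fake_msr = val;
    return 0;
}

int main(void)
{
    uint64_t val, chickenbit = 1 << 1;

    /* Set the bit only if microcode has not already done so, mirroring
     * amd_init_spectral_chicken() in the hunk above. */
    if (rdmsr_safe(MSR_AMD64_DE_CFG2, &val) == 0 && !(val & chickenbit))
        wrmsr_safe(MSR_AMD64_DE_CFG2, val | chickenbit);

    printf("DE_CFG2 = %#llx\n", (unsigned long long)fake_msr);
    return 0;
}
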
diff --git a/0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch b/0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch
deleted file mode 100644
index e313ede..0000000
--- a/0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch
+++ /dev/null
@@ -1,305 +0,0 @@
-From 0a5387a01165b46c8c85e7f7e2ddbe60a7f5db44 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Mon, 27 Jun 2022 19:29:40 +0100
-Subject: [PATCH 51/51] x86/spec-ctrl: Mitigate Branch Type Confusion when
- possible
-
-Branch Type Confusion affects AMD/Hygon CPUs on Zen2 and earlier. To
-mitigate, we require SMT safety (STIBP on Zen2, no-SMT on Zen1), and to issue
-an IBPB on each entry to Xen, to flush the BTB.
-
-Due to performance concerns, dom0 (which is trusted in most configurations) is
-excluded from protections by default.
-
-Therefore:
- * Use STIBP by default on Zen2 too, which now means we want it on by default
- on all hardware supporting STIBP.
- * Break the current IBPB logic out into a new function, extending it with
- IBPB-at-entry logic.
- * Change the existing IBPB-at-ctxt-switch boolean to be tristate, and disable
- it by default when IBPB-at-entry is providing sufficient safety.
-
-If all PV guests on the system are trusted, then it is recommended to boot
-with `spec-ctrl=ibpb-entry=no-pv`, as this will provide an additional marginal
-perf improvement.
-
-This is part of XSA-407 / CVE-2022-23825.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-(cherry picked from commit d8cb7e0f069e0f106d24941355b59b45a731eabe)
----
- docs/misc/xen-command-line.pandoc | 14 ++--
- xen/arch/x86/spec_ctrl.c | 113 ++++++++++++++++++++++++++----
- xen/include/asm-x86/spec_ctrl.h | 2 +-
- 3 files changed, 112 insertions(+), 17 deletions(-)
-
-diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
-index 1bbdb55129cc..bd6826d0ae05 100644
---- a/docs/misc/xen-command-line.pandoc
-+++ b/docs/misc/xen-command-line.pandoc
-@@ -2234,7 +2234,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
-
- ### spec-ctrl (x86)
- > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
--> {msr-sc,rsb,md-clear}=<bool>|{pv,hvm}=<bool>,
-+> {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
- > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
- > eager-fpu,l1d-flush,branch-harden,srb-lock,
- > unpriv-mmio}=<bool> ]`
-@@ -2259,9 +2259,10 @@ in place for guests to use.
-
- Use of a positive boolean value for either of these options is invalid.
-
--The `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` options offer fine
--grained control over the primitives by Xen. These impact Xen's ability to
--protect itself, and/or Xen's ability to virtualise support for guests to use.
-+The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options
-+offer fine grained control over the primitives by Xen. These impact Xen's
-+ability to protect itself, and/or Xen's ability to virtualise support for
-+guests to use.
-
- * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests
- respectively.
-@@ -2280,6 +2281,11 @@ protect itself, and/or Xen's ability to virtualise support for guests to use.
- compatibility with development versions of this fix, `mds=` is also accepted
- on Xen 4.12 and earlier as an alias. Consult vendor documentation in
- preference to here.*
-+* `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction
-+ Barrier) is used on entry to Xen. This is used by default on hardware
-+ vulnerable to Branch Type Confusion, but for performance reasons, dom0 is
-+ unprotected by default. If it is necessary to protect dom0 too, boot with
-+ `spec-ctrl=ibpb-entry`.
-
- If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to
- select which of the thunks gets patched into the `__x86_indirect_thunk_%reg`
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 0f101c057f3e..1d9796c34d71 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -39,6 +39,10 @@ static bool __initdata opt_rsb_hvm = true;
- static int8_t __read_mostly opt_md_clear_pv = -1;
- static int8_t __read_mostly opt_md_clear_hvm = -1;
-
-+static int8_t __read_mostly opt_ibpb_entry_pv = -1;
-+static int8_t __read_mostly opt_ibpb_entry_hvm = -1;
-+static bool __read_mostly opt_ibpb_entry_dom0;
-+
- /* Cmdline controls for Xen's speculative settings. */
- static enum ind_thunk {
- THUNK_DEFAULT, /* Decide which thunk to use at boot time. */
-@@ -54,7 +58,7 @@ int8_t __initdata opt_stibp = -1;
- bool __read_mostly opt_ssbd;
- int8_t __initdata opt_psfd = -1;
-
--bool __read_mostly opt_ibpb_ctxt_switch = true;
-+int8_t __read_mostly opt_ibpb_ctxt_switch = -1;
- int8_t __read_mostly opt_eager_fpu = -1;
- int8_t __read_mostly opt_l1d_flush = -1;
- static bool __initdata opt_branch_harden = true;
-@@ -114,6 +118,9 @@ static int __init parse_spec_ctrl(const char *s)
- opt_rsb_hvm = false;
- opt_md_clear_pv = 0;
- opt_md_clear_hvm = 0;
-+ opt_ibpb_entry_pv = 0;
-+ opt_ibpb_entry_hvm = 0;
-+ opt_ibpb_entry_dom0 = false;
-
- opt_thunk = THUNK_JMP;
- opt_ibrs = 0;
-@@ -140,12 +147,14 @@ static int __init parse_spec_ctrl(const char *s)
- opt_msr_sc_pv = val;
- opt_rsb_pv = val;
- opt_md_clear_pv = val;
-+ opt_ibpb_entry_pv = val;
- }
- else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
- {
- opt_msr_sc_hvm = val;
- opt_rsb_hvm = val;
- opt_md_clear_hvm = val;
-+ opt_ibpb_entry_hvm = val;
- }
- else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 )
- {
-@@ -210,6 +219,28 @@ static int __init parse_spec_ctrl(const char *s)
- break;
- }
- }
-+ else if ( (val = parse_boolean("ibpb-entry", s, ss)) != -1 )
-+ {
-+ switch ( val )
-+ {
-+ case 0:
-+ case 1:
-+ opt_ibpb_entry_pv = opt_ibpb_entry_hvm =
-+ opt_ibpb_entry_dom0 = val;
-+ break;
-+
-+ case -2:
-+ s += strlen("ibpb-entry=");
-+ if ( (val = parse_boolean("pv", s, ss)) >= 0 )
-+ opt_ibpb_entry_pv = val;
-+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
-+ opt_ibpb_entry_hvm = val;
-+ else
-+ default:
-+ rc = -EINVAL;
-+ break;
-+ }
-+ }
-
- /* Xen's speculative sidechannel mitigation settings. */
- else if ( !strncmp(s, "bti-thunk=", 10) )
-@@ -477,27 +508,31 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- * mitigation support for guests.
- */
- #ifdef CONFIG_HVM
-- printk(" Support for HVM VMs:%s%s%s%s%s\n",
-+ printk(" Support for HVM VMs:%s%s%s%s%s%s\n",
- (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ||
- boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ||
- boot_cpu_has(X86_FEATURE_MD_CLEAR) ||
-+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ||
- opt_eager_fpu) ? "" : " None",
- boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "",
- boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "",
- opt_eager_fpu ? " EAGER_FPU" : "",
-- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "");
-+ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "",
-+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : "");
-
- #endif
- #ifdef CONFIG_PV
-- printk(" Support for PV VMs:%s%s%s%s%s\n",
-+ printk(" Support for PV VMs:%s%s%s%s%s%s\n",
- (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
- boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
- boot_cpu_has(X86_FEATURE_MD_CLEAR) ||
-+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
- opt_eager_fpu) ? "" : " None",
- boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "",
- boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "",
- opt_eager_fpu ? " EAGER_FPU" : "",
-- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "");
-+ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "",
-+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : "");
-
- printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
- opt_xpti_hwdom ? "enabled" : "disabled",
-@@ -759,6 +794,55 @@ static bool __init should_use_eager_fpu(void)
- }
- }
-
-+static void __init ibpb_calculations(void)
-+{
-+ /* Check we have hardware IBPB support before using it... */
-+ if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
-+ {
-+ opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0;
-+ opt_ibpb_entry_dom0 = false;
-+ return;
-+ }
-+
-+ /*
-+ * IBPB-on-entry mitigations for Branch Type Confusion.
-+ *
-+ * IBPB && !BTC_NO selects all AMD/Hygon hardware, not known to be safe,
-+ * that we can provide some form of mitigation on.
-+ */
-+ if ( opt_ibpb_entry_pv == -1 )
-+ opt_ibpb_entry_pv = (IS_ENABLED(CONFIG_PV) &&
-+ boot_cpu_has(X86_FEATURE_IBPB) &&
-+ !boot_cpu_has(X86_FEATURE_BTC_NO));
-+ if ( opt_ibpb_entry_hvm == -1 )
-+ opt_ibpb_entry_hvm = (IS_ENABLED(CONFIG_HVM) &&
-+ boot_cpu_has(X86_FEATURE_IBPB) &&
-+ !boot_cpu_has(X86_FEATURE_BTC_NO));
-+
-+ if ( opt_ibpb_entry_pv )
-+ {
-+ setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_PV);
-+
-+ /*
-+ * We only need to flush in IST context if we're protecting against PV
-+ * guests. HVM IBPB-on-entry protections are both atomic with
-+ * NMI/#MC, so can't interrupt Xen ahead of having already flushed the
-+ * BTB.
-+ */
-+ default_spec_ctrl_flags |= SCF_ist_ibpb;
-+ }
-+ if ( opt_ibpb_entry_hvm )
-+ setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_HVM);
-+
-+ /*
-+ * If we're using IBPB-on-entry to protect against PV and HVM guests
-+ * (ignoring dom0 if trusted), then there's no need to also issue IBPB on
-+ * context switch.
-+ */
-+ if ( opt_ibpb_ctxt_switch == -1 )
-+ opt_ibpb_ctxt_switch = !(opt_ibpb_entry_hvm && opt_ibpb_entry_pv);
-+}
-+
- /* Calculate whether this CPU is vulnerable to L1TF. */
- static __init void l1tf_calculations(uint64_t caps)
- {
-@@ -1014,8 +1098,12 @@ void spec_ctrl_init_domain(struct domain *d)
- bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
- (opt_fb_clear_mmio && is_iommu_enabled(d)));
-
-+ bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) &&
-+ (d->domain_id != 0 || opt_ibpb_entry_dom0));
-+
- d->arch.spec_ctrl_flags =
- (verw ? SCF_verw : 0) |
-+ (ibpb ? SCF_entry_ibpb : 0) |
- 0;
- }
-
-@@ -1162,12 +1250,15 @@ void __init init_speculation_mitigations(void)
- }
-
- /*
-- * Use STIBP by default if the hardware hint is set. Otherwise, leave it
-- * off as it a severe performance pentalty on pre-eIBRS Intel hardware
-- * where it was retrofitted in microcode.
-+ * Use STIBP by default on all AMD systems. Zen3 and later enumerate
-+ * STIBP_ALWAYS, but STIBP is needed on Zen2 as part of the mitigations
-+ * for Branch Type Confusion.
-+ *
-+ * Leave STIBP off by default on Intel. Pre-eIBRS systems suffer a
-+ * substantial perf hit when it was implemented in microcode.
- */
- if ( opt_stibp == -1 )
-- opt_stibp = !!boot_cpu_has(X86_FEATURE_STIBP_ALWAYS);
-+ opt_stibp = !!boot_cpu_has(X86_FEATURE_AMD_STIBP);
-
- if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) ||
- boot_cpu_has(X86_FEATURE_AMD_STIBP)) )
-@@ -1239,9 +1330,7 @@ void __init init_speculation_mitigations(void)
- if ( opt_rsb_hvm )
- setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM);
-
-- /* Check we have hardware IBPB support before using it... */
-- if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) )
-- opt_ibpb_ctxt_switch = false;
-+ ibpb_calculations();
-
- /* Check whether Eager FPU should be enabled by default. */
- if ( opt_eager_fpu == -1 )
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index 3fc599a817c4..9403b81dc7af 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -65,7 +65,7 @@
- void init_speculation_mitigations(void);
- void spec_ctrl_init_domain(struct domain *d);
-
--extern bool opt_ibpb_ctxt_switch;
-+extern int8_t opt_ibpb_ctxt_switch;
- extern bool opt_ssbd;
- extern int8_t opt_eager_fpu;
- extern int8_t opt_l1d_flush;
---
-2.35.1
-
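
Most of ibpb_calculations() above is tristate default resolution: -1 means "pick a default from hardware", and context-switch IBPB is turned off once both entry paths cover it. A condensed sketch with the CPUID-derived feature bits as plain inputs and CONFIG_PV/HVM assumed enabled:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Condensed model of the default resolution in the hunk above; only the
 * tristate pattern is being illustrated, not the full calculation. */
static int8_t opt_ibpb_entry_pv = -1, opt_ibpb_entry_hvm = -1;
static int8_t opt_ibpb_ctxt_switch = -1;

static void ibpb_calculations(bool has_ibpb, bool btc_no)
{
    /* IBPB && !BTC_NO: hardware not known safe that we can mitigate on. */
    if (opt_ibpb_entry_pv == -1)
        opt_ibpb_entry_pv = has_ibpb && !btc_no;
    if (opt_ibpb_entry_hvm == -1)
        opt_ibpb_entry_hvm = has_ibpb && !btc_no;

    /* IBPB at context switch is redundant once both entry paths flush. */
    if (opt_ibpb_ctxt_switch == -1)
        opt_ibpb_ctxt_switch = !(opt_ibpb_entry_pv && opt_ibpb_entry_hvm);
}

int main(void)
{
    ibpb_calculations(true, false);   /* Zen2-like: IBPB present, no BTC_NO */
    printf("ibpb-entry pv=%d hvm=%d, ctxt-switch=%d\n",
           opt_ibpb_entry_pv, opt_ibpb_entry_hvm, opt_ibpb_ctxt_switch);
    return 0;
}
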
diff --git a/info.txt b/info.txt
index e830829..d2c53b1 100644
--- a/info.txt
+++ b/info.txt
@@ -1,6 +1,6 @@
-Xen upstream patchset #1 for 4.16.2-pre
+Xen upstream patchset #0 for 4.16.3-pre
Containing patches from
-RELEASE-4.16.1 (13fee86475f3831d7a1ecf6d7e0acbc2ac779f7e)
+RELEASE-4.16.2 (1871bd1c9eb934f0ffd039f3d68e42fd0097f322)
to
-staging-4.16 (0a5387a01165b46c8c85e7f7e2ddbe60a7f5db44)
+staging-4.16 (1bce7fb1f702da4f7a749c6f1457ecb20bf74fca)