1012 files changed, 58238 insertions, 18518 deletions
diff --git a/tools/Makefile b/tools/Makefile
index 37e9f6804832..276f5d0d53a4 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -11,7 +11,6 @@ help:
 	@echo ''
 	@echo '  acpi                   - ACPI tools'
 	@echo '  bpf                    - misc BPF tools'
-	@echo '  cgroup                 - cgroup tools'
 	@echo '  counter                - counter tools'
 	@echo '  cpupower               - a tool for all things x86 CPU power'
 	@echo '  debugging              - tools for debugging'
@@ -69,7 +68,7 @@ acpi: FORCE
 cpupower: FORCE
 	$(call descend,power/$@)
 
-cgroup counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE
+counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE
 	$(call descend,$@)
 
 bpf/%: FORCE
@@ -116,7 +115,7 @@ freefall: FORCE
 kvm_stat: FORCE
 	$(call descend,kvm/$@)
 
-all: acpi cgroup counter cpupower gpio hv firewire \
+all: acpi counter cpupower gpio hv firewire \
 		perf selftests bootconfig spi turbostat usb \
 		virtio mm bpf x86_energy_perf_policy \
 		tmon freefall iio objtool kvm_stat wmi \
@@ -128,7 +127,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install:
+counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install:
 	$(call descend,$(@:_install=),install)
 
 selftests_install:
@@ -155,7 +154,7 @@ freefall_install:
 kvm_stat_install:
 	$(call descend,kvm/$(@:_install=),install)
 
-install: acpi_install cgroup_install counter_install cpupower_install gpio_install \
+install: acpi_install counter_install cpupower_install gpio_install \
 		hv_install firewire_install iio_install \
 		perf_install selftests_install turbostat_install usb_install \
 		virtio_install mm_install bpf_install x86_energy_perf_policy_install \
@@ -169,7 +168,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean:
+counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean:
 	$(call descend,$(@:_clean=),clean)
 
 libapi_clean:
@@ -209,7 +208,7 @@ freefall_clean:
 build_clean:
 	$(call descend,build,clean)
 
-clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_clean \
+clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \
 		perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \
 		mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean \
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h
index 7c7493cb571f..52f076afeb96 100644
--- a/tools/arch/arm64/include/asm/cputype.h
+++ b/tools/arch/arm64/include/asm/cputype.h
@@ -61,6 +61,7 @@
 #define ARM_CPU_IMP_HISI		0x48
 #define ARM_CPU_IMP_APPLE		0x61
 #define ARM_CPU_IMP_AMPERE		0xC0
+#define ARM_CPU_IMP_MICROSOFT		0x6D
 
 #define ARM_CPU_PART_AEM_V8		0xD0F
 #define ARM_CPU_PART_FOUNDATION		0xD00
@@ -135,6 +136,8 @@
 
 #define AMPERE_CPU_PART_AMPERE1		0xAC3
 
+#define MICROSOFT_CPU_PART_AZURE_COBALT_100	0xD49 /* Based on r0p0 of ARM Neoverse N2 */
+
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
 #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
@@ -193,6 +196,7 @@
 #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX)
 #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX)
 #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1)
+#define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100)
 
 /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
 #define MIDR_FUJITSU_ERRATUM_010001		MIDR_FUJITSU_A64FX
diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h
index ccc13e991376..cd8420e8c3ad 100644
--- a/tools/arch/arm64/include/asm/sysreg.h
+++ b/tools/arch/arm64/include/asm/sysreg.h
@@ -701,18 +701,18 @@
  * Permission Indirection Extension (PIE) permission encodings.
  * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension).
  */
-#define PIE_NONE_O	0x0
-#define PIE_R_O		0x1
-#define PIE_X_O		0x2
-#define PIE_RX_O	0x3
-#define PIE_RW_O	0x5
-#define PIE_RWnX_O	0x6
-#define PIE_RWX_O	0x7
-#define PIE_R		0x8
-#define PIE_GCS		0x9
-#define PIE_RX		0xa
-#define PIE_RW		0xc
-#define PIE_RWX		0xe
+#define PIE_NONE_O	UL(0x0)
+#define PIE_R_O		UL(0x1)
+#define PIE_X_O		UL(0x2)
+#define PIE_RX_O	UL(0x3)
+#define PIE_RW_O	UL(0x5)
+#define PIE_RWnX_O	UL(0x6)
+#define PIE_RWX_O	UL(0x7)
+#define PIE_R		UL(0x8)
+#define PIE_GCS		UL(0x9)
+#define PIE_RX		UL(0xa)
+#define PIE_RW		UL(0xc)
+#define PIE_RWX		UL(0xe)
 
 #define PIRx_ELx_PERM(idx, perm)	((perm) << ((idx) * 4))
 
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index 89d2fc872d9f..964df31da975 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -37,9 +37,7 @@
 #include <asm/ptrace.h>
 #include <asm/sve_context.h>
 
-#define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_IRQ_LINE
-#define __KVM_HAVE_READONLY_MEM
 #define __KVM_HAVE_VCPU_EVENTS
 
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -76,11 +74,11 @@ struct kvm_regs {
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT	0
-#define KVM_ARM_DEVICE_TYPE_MASK	GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \
-						KVM_ARM_DEVICE_TYPE_SHIFT)
+#define KVM_ARM_DEVICE_TYPE_MASK	__GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \
+						  KVM_ARM_DEVICE_TYPE_SHIFT)
 #define KVM_ARM_DEVICE_ID_SHIFT		16
-#define KVM_ARM_DEVICE_ID_MASK		GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \
-						KVM_ARM_DEVICE_ID_SHIFT)
+#define KVM_ARM_DEVICE_ID_MASK		__GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \
+						  KVM_ARM_DEVICE_ID_SHIFT)
 
 /* Supported device IDs */
 #define KVM_ARM_DEVICE_VGIC_V2		0
@@ -162,6 +160,11 @@ struct kvm_sync_regs {
 	__u64 device_irq_level;
 };
 
+/* Bits for run->s.regs.device_irq_level */
+#define KVM_ARM_DEV_EL1_VTIMER		(1 << 0)
+#define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
+#define KVM_ARM_DEV_PMU			(1 << 2)
+
 /*
  * PMU filter structure. Describe a range of events with a particular
  * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h
new file mode 100644
index 000000000000..c25b5853181d
--- /dev/null
+++ b/tools/arch/loongarch/include/asm/inst.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_INST_H
+#define _ASM_INST_H
+
+#include <linux/bitops.h>
+
+#define LOONGARCH_INSN_NOP		0x03400000
+
+enum reg0i15_op {
+	break_op	= 0x54,
+};
+
+enum reg0i26_op {
+	b_op		= 0x14,
+	bl_op		= 0x15,
+};
+
+enum reg1i21_op {
+	beqz_op		= 0x10,
+	bnez_op		= 0x11,
+	bceqz_op	= 0x12, /* bits[9:8] = 0x00 */
+	bcnez_op	= 0x12, /* bits[9:8] = 0x01 */
+};
+
+enum reg2_op {
+	ertn_op		= 0x1920e,
+};
+
+enum reg2i12_op {
+	addid_op	= 0x0b,
+	andi_op		= 0x0d,
+	ldd_op		= 0xa3,
+	std_op		= 0xa7,
+};
+
+enum reg2i14_op {
+	ldptrd_op	= 0x26,
+	stptrd_op	= 0x27,
+};
+
+enum reg2i16_op {
+	jirl_op		= 0x13,
+	beq_op		= 0x16,
+	bne_op		= 0x17,
+	blt_op		= 0x18,
+	bge_op		= 0x19,
+	bltu_op		= 0x1a,
+	bgeu_op		= 0x1b,
+};
+
+struct reg0i15_format {
+	unsigned int immediate : 15;
+	unsigned int opcode : 17;
+};
+
+struct reg0i26_format {
+	unsigned int immediate_h : 10;
+	unsigned int immediate_l : 16;
+	unsigned int opcode : 6;
+};
+
+struct reg1i21_format {
+	unsigned int immediate_h  : 5;
+	unsigned int rj : 5;
+	unsigned int immediate_l : 16;
+	unsigned int opcode : 6;
+};
+
+struct reg2_format {
+	unsigned int rd : 5;
+	unsigned int rj : 5;
+	unsigned int opcode : 22;
+};
+
+struct reg2i12_format {
+	unsigned int rd : 5;
+	unsigned int rj : 5;
+	unsigned int immediate : 12;
+	unsigned int opcode : 10;
+};
+
+struct reg2i14_format {
+	unsigned int rd : 5;
+	unsigned int rj : 5;
+	unsigned int immediate : 14;
+	unsigned int opcode : 8;
+};
+
+struct reg2i16_format {
+	unsigned int rd : 5;
+	unsigned int rj : 5;
+	unsigned int immediate : 16;
+	unsigned int opcode : 6;
+};
+
+union loongarch_instruction {
+	unsigned int word;
+	struct reg0i15_format	reg0i15_format;
+	struct reg0i26_format	reg0i26_format;
+	struct reg1i21_format	reg1i21_format;
+	struct reg2_format	reg2_format;
+	struct reg2i12_format	reg2i12_format;
+	struct reg2i14_format	reg2i14_format;
+	struct reg2i16_format	reg2i16_format;
+};
+
+#define LOONGARCH_INSN_SIZE	sizeof(union loongarch_instruction)
+
+enum loongarch_gpr {
+	LOONGARCH_GPR_ZERO = 0,
+	LOONGARCH_GPR_RA = 1,
+	LOONGARCH_GPR_TP = 2,
+	LOONGARCH_GPR_SP = 3,
+	LOONGARCH_GPR_A0 = 4,	/* Reused as V0 for return value */
+	LOONGARCH_GPR_A1,	/* Reused as V1 for return value */
+	LOONGARCH_GPR_A2,
+	LOONGARCH_GPR_A3,
+	LOONGARCH_GPR_A4,
+	LOONGARCH_GPR_A5,
+	LOONGARCH_GPR_A6,
+	LOONGARCH_GPR_A7,
+	LOONGARCH_GPR_T0 = 12,
+	LOONGARCH_GPR_T1,
+	LOONGARCH_GPR_T2,
+	LOONGARCH_GPR_T3,
+	LOONGARCH_GPR_T4,
+	LOONGARCH_GPR_T5,
+	LOONGARCH_GPR_T6,
+	LOONGARCH_GPR_T7,
+	LOONGARCH_GPR_T8,
+	LOONGARCH_GPR_FP = 22,
+	LOONGARCH_GPR_S0 = 23,
+	LOONGARCH_GPR_S1,
+	LOONGARCH_GPR_S2,
+	LOONGARCH_GPR_S3,
+	LOONGARCH_GPR_S4,
+	LOONGARCH_GPR_S5,
+	LOONGARCH_GPR_S6,
+	LOONGARCH_GPR_S7,
+	LOONGARCH_GPR_S8,
+	LOONGARCH_GPR_MAX
+};
+
+#define DEF_EMIT_REG2I16_FORMAT(NAME, OP)				\
+static inline void emit_##NAME(union loongarch_instruction *insn,	\
+			       enum loongarch_gpr rj,			\
+			       enum loongarch_gpr rd,			\
+			       int offset)				\
+{									\
+	insn->reg2i16_format.opcode = OP;				\
+	insn->reg2i16_format.immediate = offset;			\
+	insn->reg2i16_format.rj = rj;					\
+	insn->reg2i16_format.rd = rd;					\
+}
+
+DEF_EMIT_REG2I16_FORMAT(jirl, jirl_op)
+
+#endif /* _ASM_INST_H */
diff --git a/tools/arch/loongarch/include/asm/orc_types.h b/tools/arch/loongarch/include/asm/orc_types.h
new file mode 100644
index 000000000000..caf1f71a1057
--- /dev/null
+++ b/tools/arch/loongarch/include/asm/orc_types.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and FP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous FP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED		0
+#define ORC_REG_PREV_SP			1
+#define ORC_REG_SP			2
+#define ORC_REG_FP			3
+#define ORC_REG_MAX			4
+
+#define ORC_TYPE_UNDEFINED		0
+#define ORC_TYPE_END_OF_STACK		1
+#define ORC_TYPE_CALL			2
+#define ORC_TYPE_REGS			3
+#define ORC_TYPE_REGS_PARTIAL		4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard.  It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder.  It tells the
+ * unwinder how to find the previous SP and FP (and sometimes entry regs) on
+ * the stack for a given code address.  Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+	s16		sp_offset;
+	s16		fp_offset;
+	s16		ra_offset;
+	unsigned int	sp_reg:4;
+	unsigned int	fp_reg:4;
+	unsigned int	ra_reg:4;
+	unsigned int	type:3;
+	unsigned int	signal:1;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 9f18fa090f1f..1691297a766a 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -28,7 +28,6 @@
 #define __KVM_HAVE_PPC_SMT
 #define __KVM_HAVE_IRQCHIP
 #define __KVM_HAVE_IRQ_LINE
-#define __KVM_HAVE_GUEST_DEBUG
 
 /* Not always available, but if it is, this is the correct offset.  */
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -733,4 +732,48 @@ struct kvm_ppc_xive_eq {
 #define KVM_XIVE_TIMA_PAGE_OFFSET	0
 #define KVM_XIVE_ESB_PAGE_OFFSET	4
 
+/* for KVM_PPC_GET_PVINFO */
+
+#define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
+
+struct kvm_ppc_pvinfo {
+	/* out */
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
+/* for KVM_PPC_GET_SMMU_INFO */
+#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
+
+struct kvm_ppc_one_page_size {
+	__u32 page_shift;	/* Page shift (or 0) */
+	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
+};
+
+struct kvm_ppc_one_seg_page_size {
+	__u32 page_shift;	/* Base page shift of segment (or 0) */
+	__u32 slb_enc;		/* SLB encoding for BookS */
+	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
+#define KVM_PPC_1T_SEGMENTS		0x00000002
+#define KVM_PPC_NO_HASH			0x00000004
+
+struct kvm_ppc_smmu_info {
+	__u64 flags;
+	__u32 slb_size;
+	__u16 data_keys;	/* # storage keys supported for data */
+	__u16 instr_keys;	/* # storage keys supported for instructions */
+	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h
new file mode 100644
index 000000000000..0dfc09254f99
--- /dev/null
+++ b/tools/arch/riscv/include/asm/csr.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015 Regents of the University of California
+ */
+
+#ifndef _ASM_RISCV_CSR_H
+#define _ASM_RISCV_CSR_H
+
+#include <linux/bits.h>
+
+/* Status register flags */
+#define SR_SIE		_AC(0x00000002, UL) /* Supervisor Interrupt Enable */
+#define SR_MIE		_AC(0x00000008, UL) /* Machine Interrupt Enable */
+#define SR_SPIE		_AC(0x00000020, UL) /* Previous Supervisor IE */
+#define SR_MPIE		_AC(0x00000080, UL) /* Previous Machine IE */
+#define SR_SPP		_AC(0x00000100, UL) /* Previously Supervisor */
+#define SR_MPP		_AC(0x00001800, UL) /* Previously Machine */
+#define SR_SUM		_AC(0x00040000, UL) /* Supervisor User Memory Access */
+
+#define SR_FS		_AC(0x00006000, UL) /* Floating-point Status */
+#define SR_FS_OFF	_AC(0x00000000, UL)
+#define SR_FS_INITIAL	_AC(0x00002000, UL)
+#define SR_FS_CLEAN	_AC(0x00004000, UL)
+#define SR_FS_DIRTY	_AC(0x00006000, UL)
+
+#define SR_VS		_AC(0x00000600, UL) /* Vector Status */
+#define SR_VS_OFF	_AC(0x00000000, UL)
+#define SR_VS_INITIAL	_AC(0x00000200, UL)
+#define SR_VS_CLEAN	_AC(0x00000400, UL)
+#define SR_VS_DIRTY	_AC(0x00000600, UL)
+
+#define SR_XS		_AC(0x00018000, UL) /* Extension Status */
+#define SR_XS_OFF	_AC(0x00000000, UL)
+#define SR_XS_INITIAL	_AC(0x00008000, UL)
+#define SR_XS_CLEAN	_AC(0x00010000, UL)
+#define SR_XS_DIRTY	_AC(0x00018000, UL)
+
+#define SR_FS_VS	(SR_FS | SR_VS) /* Vector and Floating-Point Unit */
+
+#ifndef CONFIG_64BIT
+#define SR_SD		_AC(0x80000000, UL) /* FS/VS/XS dirty */
+#else
+#define SR_SD		_AC(0x8000000000000000, UL) /* FS/VS/XS dirty */
+#endif
+
+#ifdef CONFIG_64BIT
+#define SR_UXL		_AC(0x300000000, UL) /* XLEN mask for U-mode */
+#define SR_UXL_32	_AC(0x100000000, UL) /* XLEN = 32 for U-mode */
+#define SR_UXL_64	_AC(0x200000000, UL) /* XLEN = 64 for U-mode */
+#endif
+
+/* SATP flags */
+#ifndef CONFIG_64BIT
+#define SATP_PPN	_AC(0x003FFFFF, UL)
+#define SATP_MODE_32	_AC(0x80000000, UL)
+#define SATP_MODE_SHIFT	31
+#define SATP_ASID_BITS	9
+#define SATP_ASID_SHIFT	22
+#define SATP_ASID_MASK	_AC(0x1FF, UL)
+#else
+#define SATP_PPN	_AC(0x00000FFFFFFFFFFF, UL)
+#define SATP_MODE_39	_AC(0x8000000000000000, UL)
+#define SATP_MODE_48	_AC(0x9000000000000000, UL)
+#define SATP_MODE_57	_AC(0xa000000000000000, UL)
+#define SATP_MODE_SHIFT	60
+#define SATP_ASID_BITS	16
+#define SATP_ASID_SHIFT	44
+#define SATP_ASID_MASK	_AC(0xFFFF, UL)
+#endif
+
+/* Exception cause high bit - is an interrupt if set */
+#define CAUSE_IRQ_FLAG		(_AC(1, UL) << (__riscv_xlen - 1))
+
+/* Interrupt causes (minus the high bit) */
+#define IRQ_S_SOFT		1
+#define IRQ_VS_SOFT		2
+#define IRQ_M_SOFT		3
+#define IRQ_S_TIMER		5
+#define IRQ_VS_TIMER		6
+#define IRQ_M_TIMER		7
+#define IRQ_S_EXT		9
+#define IRQ_VS_EXT		10
+#define IRQ_M_EXT		11
+#define IRQ_S_GEXT		12
+#define IRQ_PMU_OVF		13
+#define IRQ_LOCAL_MAX		(IRQ_PMU_OVF + 1)
+#define IRQ_LOCAL_MASK		GENMASK((IRQ_LOCAL_MAX - 1), 0)
+
+/* Exception causes */
+#define EXC_INST_MISALIGNED	0
+#define EXC_INST_ACCESS		1
+#define EXC_INST_ILLEGAL	2
+#define EXC_BREAKPOINT		3
+#define EXC_LOAD_MISALIGNED	4
+#define EXC_LOAD_ACCESS		5
+#define EXC_STORE_MISALIGNED	6
+#define EXC_STORE_ACCESS	7
+#define EXC_SYSCALL		8
+#define EXC_HYPERVISOR_SYSCALL	9
+#define EXC_SUPERVISOR_SYSCALL	10
+#define EXC_INST_PAGE_FAULT	12
+#define EXC_LOAD_PAGE_FAULT	13
+#define EXC_STORE_PAGE_FAULT	15
+#define EXC_INST_GUEST_PAGE_FAULT	20
+#define EXC_LOAD_GUEST_PAGE_FAULT	21
+#define EXC_VIRTUAL_INST_FAULT		22
+#define EXC_STORE_GUEST_PAGE_FAULT	23
+
+/* PMP configuration */
+#define PMP_R			0x01
+#define PMP_W			0x02
+#define PMP_X			0x04
+#define PMP_A			0x18
+#define PMP_A_TOR		0x08
+#define PMP_A_NA4		0x10
+#define PMP_A_NAPOT		0x18
+#define PMP_L			0x80
+
+/* HSTATUS flags */
+#ifdef CONFIG_64BIT
+#define HSTATUS_VSXL		_AC(0x300000000, UL)
+#define HSTATUS_VSXL_SHIFT	32
+#endif
+#define HSTATUS_VTSR		_AC(0x00400000, UL)
+#define HSTATUS_VTW		_AC(0x00200000, UL)
+#define HSTATUS_VTVM		_AC(0x00100000, UL)
+#define HSTATUS_VGEIN		_AC(0x0003f000, UL)
+#define HSTATUS_VGEIN_SHIFT	12
+#define HSTATUS_HU		_AC(0x00000200, UL)
+#define HSTATUS_SPVP		_AC(0x00000100, UL)
+#define HSTATUS_SPV		_AC(0x00000080, UL)
+#define HSTATUS_GVA		_AC(0x00000040, UL)
+#define HSTATUS_VSBE		_AC(0x00000020, UL)
+
+/* HGATP flags */
+#define HGATP_MODE_OFF		_AC(0, UL)
+#define HGATP_MODE_SV32X4	_AC(1, UL)
+#define HGATP_MODE_SV39X4	_AC(8, UL)
+#define HGATP_MODE_SV48X4	_AC(9, UL)
+#define HGATP_MODE_SV57X4	_AC(10, UL)
+
+#define HGATP32_MODE_SHIFT	31
+#define HGATP32_VMID_SHIFT	22
+#define HGATP32_VMID		GENMASK(28, 22)
+#define HGATP32_PPN		GENMASK(21, 0)
+
+#define HGATP64_MODE_SHIFT	60
+#define HGATP64_VMID_SHIFT	44
+#define HGATP64_VMID		GENMASK(57, 44)
+#define HGATP64_PPN		GENMASK(43, 0)
+
+#define HGATP_PAGE_SHIFT	12
+
+#ifdef CONFIG_64BIT
+#define HGATP_PPN		HGATP64_PPN
+#define HGATP_VMID_SHIFT	HGATP64_VMID_SHIFT
+#define HGATP_VMID		HGATP64_VMID
+#define HGATP_MODE_SHIFT	HGATP64_MODE_SHIFT
+#else
+#define HGATP_PPN		HGATP32_PPN
+#define HGATP_VMID_SHIFT	HGATP32_VMID_SHIFT
+#define HGATP_VMID		HGATP32_VMID
+#define HGATP_MODE_SHIFT	HGATP32_MODE_SHIFT
+#endif
+
+/* VSIP & HVIP relation */
+#define VSIP_TO_HVIP_SHIFT	(IRQ_VS_SOFT - IRQ_S_SOFT)
+#define VSIP_VALID_MASK		((_AC(1, UL) << IRQ_S_SOFT) | \
+				 (_AC(1, UL) << IRQ_S_TIMER) | \
+				 (_AC(1, UL) << IRQ_S_EXT))
+
+/* AIA CSR bits */
+#define TOPI_IID_SHIFT		16
+#define TOPI_IID_MASK		GENMASK(11, 0)
+#define TOPI_IPRIO_MASK		GENMASK(7, 0)
+#define TOPI_IPRIO_BITS		8
+
+#define TOPEI_ID_SHIFT		16
+#define TOPEI_ID_MASK		GENMASK(10, 0)
+#define TOPEI_PRIO_MASK		GENMASK(10, 0)
+
+#define ISELECT_IPRIO0		0x30
+#define ISELECT_IPRIO15		0x3f
+#define ISELECT_MASK		GENMASK(8, 0)
+
+#define HVICTL_VTI		BIT(30)
+#define HVICTL_IID		GENMASK(27, 16)
+#define HVICTL_IID_SHIFT	16
+#define HVICTL_DPR		BIT(9)
+#define HVICTL_IPRIOM		BIT(8)
+#define HVICTL_IPRIO		GENMASK(7, 0)
+
+/* xENVCFG flags */
+#define ENVCFG_STCE			(_AC(1, ULL) << 63)
+#define ENVCFG_PBMTE			(_AC(1, ULL) << 62)
+#define ENVCFG_CBZE			(_AC(1, UL) << 7)
+#define ENVCFG_CBCFE			(_AC(1, UL) << 6)
+#define ENVCFG_CBIE_SHIFT		4
+#define ENVCFG_CBIE			(_AC(0x3, UL) << ENVCFG_CBIE_SHIFT)
+#define ENVCFG_CBIE_ILL			_AC(0x0, UL)
+#define ENVCFG_CBIE_FLUSH		_AC(0x1, UL)
+#define ENVCFG_CBIE_INV			_AC(0x3, UL)
+#define ENVCFG_FIOM			_AC(0x1, UL)
+
+/* Smstateen bits */
+#define SMSTATEEN0_AIA_IMSIC_SHIFT	58
+#define SMSTATEEN0_AIA_IMSIC		(_ULL(1) << SMSTATEEN0_AIA_IMSIC_SHIFT)
+#define SMSTATEEN0_AIA_SHIFT		59
+#define SMSTATEEN0_AIA			(_ULL(1) << SMSTATEEN0_AIA_SHIFT)
+#define SMSTATEEN0_AIA_ISEL_SHIFT	60
+#define SMSTATEEN0_AIA_ISEL		(_ULL(1) << SMSTATEEN0_AIA_ISEL_SHIFT)
+#define SMSTATEEN0_HSENVCFG_SHIFT	62
+#define SMSTATEEN0_HSENVCFG		(_ULL(1) << SMSTATEEN0_HSENVCFG_SHIFT)
+#define SMSTATEEN0_SSTATEEN0_SHIFT	63
+#define SMSTATEEN0_SSTATEEN0		(_ULL(1) << SMSTATEEN0_SSTATEEN0_SHIFT)
+
+/* symbolic CSR names: */
+#define CSR_CYCLE		0xc00
+#define CSR_TIME		0xc01
+#define CSR_INSTRET		0xc02
+#define CSR_HPMCOUNTER3		0xc03
+#define CSR_HPMCOUNTER4		0xc04
+#define CSR_HPMCOUNTER5		0xc05
+#define CSR_HPMCOUNTER6		0xc06
+#define CSR_HPMCOUNTER7		0xc07
+#define CSR_HPMCOUNTER8		0xc08
+#define CSR_HPMCOUNTER9		0xc09
+#define CSR_HPMCOUNTER10	0xc0a
+#define CSR_HPMCOUNTER11	0xc0b
+#define CSR_HPMCOUNTER12	0xc0c
+#define CSR_HPMCOUNTER13	0xc0d
+#define CSR_HPMCOUNTER14	0xc0e
+#define CSR_HPMCOUNTER15	0xc0f
+#define CSR_HPMCOUNTER16	0xc10
+#define CSR_HPMCOUNTER17	0xc11
+#define CSR_HPMCOUNTER18	0xc12
+#define CSR_HPMCOUNTER19	0xc13
+#define CSR_HPMCOUNTER20	0xc14
+#define CSR_HPMCOUNTER21	0xc15
+#define CSR_HPMCOUNTER22	0xc16
+#define CSR_HPMCOUNTER23	0xc17
+#define CSR_HPMCOUNTER24	0xc18
+#define CSR_HPMCOUNTER25	0xc19
+#define CSR_HPMCOUNTER26	0xc1a
+#define CSR_HPMCOUNTER27	0xc1b
+#define CSR_HPMCOUNTER28	0xc1c
+#define CSR_HPMCOUNTER29	0xc1d
+#define CSR_HPMCOUNTER30	0xc1e
+#define CSR_HPMCOUNTER31	0xc1f
+#define CSR_CYCLEH		0xc80
+#define CSR_TIMEH		0xc81
+#define CSR_INSTRETH		0xc82
+#define CSR_HPMCOUNTER3H	0xc83
+#define CSR_HPMCOUNTER4H	0xc84
+#define CSR_HPMCOUNTER5H	0xc85
+#define CSR_HPMCOUNTER6H	0xc86
+#define CSR_HPMCOUNTER7H	0xc87
+#define CSR_HPMCOUNTER8H	0xc88
+#define CSR_HPMCOUNTER9H	0xc89
+#define CSR_HPMCOUNTER10H	0xc8a
+#define CSR_HPMCOUNTER11H	0xc8b
+#define CSR_HPMCOUNTER12H	0xc8c
+#define CSR_HPMCOUNTER13H	0xc8d
+#define CSR_HPMCOUNTER14H	0xc8e
+#define CSR_HPMCOUNTER15H	0xc8f
+#define CSR_HPMCOUNTER16H	0xc90
+#define CSR_HPMCOUNTER17H	0xc91
+#define CSR_HPMCOUNTER18H	0xc92
+#define CSR_HPMCOUNTER19H	0xc93
+#define CSR_HPMCOUNTER20H	0xc94
+#define CSR_HPMCOUNTER21H	0xc95
+#define CSR_HPMCOUNTER22H	0xc96
+#define CSR_HPMCOUNTER23H	0xc97
+#define CSR_HPMCOUNTER24H	0xc98
+#define CSR_HPMCOUNTER25H	0xc99
+#define CSR_HPMCOUNTER26H	0xc9a
+#define CSR_HPMCOUNTER27H	0xc9b
+#define CSR_HPMCOUNTER28H	0xc9c
+#define CSR_HPMCOUNTER29H	0xc9d
+#define CSR_HPMCOUNTER30H	0xc9e
+#define CSR_HPMCOUNTER31H	0xc9f
+
+#define CSR_SSCOUNTOVF		0xda0
+
+#define CSR_SSTATUS		0x100
+#define CSR_SIE			0x104
+#define CSR_STVEC		0x105
+#define CSR_SCOUNTEREN		0x106
+#define CSR_SENVCFG		0x10a
+#define CSR_SSTATEEN0		0x10c
+#define CSR_SSCRATCH		0x140
+#define CSR_SEPC		0x141
+#define CSR_SCAUSE		0x142
+#define CSR_STVAL		0x143
+#define CSR_SIP			0x144
+#define CSR_SATP		0x180
+
+#define CSR_STIMECMP		0x14D
+#define CSR_STIMECMPH		0x15D
+
+/* Supervisor-Level Window to Indirectly Accessed Registers (AIA) */
+#define CSR_SISELECT		0x150
+#define CSR_SIREG		0x151
+
+/* Supervisor-Level Interrupts (AIA) */
+#define CSR_STOPEI		0x15c
+#define CSR_STOPI		0xdb0
+
+/* Supervisor-Level High-Half CSRs (AIA) */
+#define CSR_SIEH		0x114
+#define CSR_SIPH		0x154
+
+#define CSR_VSSTATUS		0x200
+#define CSR_VSIE		0x204
+#define CSR_VSTVEC		0x205
+#define CSR_VSSCRATCH		0x240
+#define CSR_VSEPC		0x241
+#define CSR_VSCAUSE		0x242
+#define CSR_VSTVAL		0x243
+#define CSR_VSIP		0x244
+#define CSR_VSATP		0x280
+#define CSR_VSTIMECMP		0x24D
+#define CSR_VSTIMECMPH		0x25D
+
+#define CSR_HSTATUS		0x600
+#define CSR_HEDELEG		0x602
+#define CSR_HIDELEG		0x603
+#define CSR_HIE			0x604
+#define CSR_HTIMEDELTA		0x605
+#define CSR_HCOUNTEREN		0x606
+#define CSR_HGEIE		0x607
+#define CSR_HENVCFG		0x60a
+#define CSR_HTIMEDELTAH		0x615
+#define CSR_HENVCFGH		0x61a
+#define CSR_HTVAL		0x643
+#define CSR_HIP			0x644
+#define CSR_HVIP		0x645
+#define CSR_HTINST		0x64a
+#define CSR_HGATP		0x680
+#define CSR_HGEIP		0xe12
+
+/* Virtual Interrupts and Interrupt Priorities (H-extension with AIA) */
+#define CSR_HVIEN		0x608
+#define CSR_HVICTL		0x609
+#define CSR_HVIPRIO1		0x646
+#define CSR_HVIPRIO2		0x647
+
+/* VS-Level Window to Indirectly Accessed Registers (H-extension with AIA) */
+#define CSR_VSISELECT		0x250
+#define CSR_VSIREG		0x251
+
+/* VS-Level Interrupts (H-extension with AIA) */
+#define CSR_VSTOPEI		0x25c
+#define CSR_VSTOPI		0xeb0
+
+/* Hypervisor and VS-Level High-Half CSRs (H-extension with AIA) */
+#define CSR_HIDELEGH		0x613
+#define CSR_HVIENH		0x618
+#define CSR_HVIPH		0x655
+#define CSR_HVIPRIO1H		0x656
+#define CSR_HVIPRIO2H		0x657
+#define CSR_VSIEH		0x214
+#define CSR_VSIPH		0x254
+
+/* Hypervisor stateen CSRs */
+#define CSR_HSTATEEN0		0x60c
+#define CSR_HSTATEEN0H		0x61c
+
+#define CSR_MSTATUS		0x300
+#define CSR_MISA		0x301
+#define CSR_MIDELEG		0x303
+#define CSR_MIE			0x304
+#define CSR_MTVEC		0x305
+#define CSR_MENVCFG		0x30a
+#define CSR_MENVCFGH		0x31a
+#define CSR_MSCRATCH		0x340
+#define CSR_MEPC		0x341
+#define CSR_MCAUSE		0x342
+#define CSR_MTVAL		0x343
+#define CSR_MIP			0x344
+#define CSR_PMPCFG0		0x3a0
+#define CSR_PMPADDR0		0x3b0
+#define CSR_MVENDORID		0xf11
+#define CSR_MARCHID		0xf12
+#define CSR_MIMPID		0xf13
+#define CSR_MHARTID		0xf14
+
+/* Machine-Level Window to Indirectly Accessed Registers (AIA) */
+#define CSR_MISELECT		0x350
+#define CSR_MIREG		0x351
+
+/* Machine-Level Interrupts (AIA) */
+#define CSR_MTOPEI		0x35c
+#define CSR_MTOPI		0xfb0
+
+/* Virtual Interrupts for Supervisor Level (AIA) */
+#define CSR_MVIEN		0x308
+#define CSR_MVIP		0x309
+
+/* Machine-Level High-Half CSRs (AIA) */
+#define CSR_MIDELEGH		0x313
+#define CSR_MIEH		0x314
+#define CSR_MVIENH		0x318
+#define CSR_MVIPH		0x319
+#define CSR_MIPH		0x354
+
+#define CSR_VSTART		0x8
+#define CSR_VCSR		0xf
+#define CSR_VL			0xc20
+#define CSR_VTYPE		0xc21
+#define CSR_VLENB		0xc22
+
+#ifdef CONFIG_RISCV_M_MODE
+# define CSR_STATUS	CSR_MSTATUS
+# define CSR_IE		CSR_MIE
+# define CSR_TVEC	CSR_MTVEC
+# define CSR_SCRATCH	CSR_MSCRATCH
+# define CSR_EPC	CSR_MEPC
+# define CSR_CAUSE	CSR_MCAUSE
+# define CSR_TVAL	CSR_MTVAL
+# define CSR_IP		CSR_MIP
+
+# define CSR_IEH		CSR_MIEH
+# define CSR_ISELECT	CSR_MISELECT
+# define CSR_IREG	CSR_MIREG
+# define CSR_IPH		CSR_MIPH
+# define CSR_TOPEI	CSR_MTOPEI
+# define CSR_TOPI	CSR_MTOPI
+
+# define SR_IE		SR_MIE
+# define SR_PIE		SR_MPIE
+# define SR_PP		SR_MPP
+
+# define RV_IRQ_SOFT		IRQ_M_SOFT
+# define RV_IRQ_TIMER	IRQ_M_TIMER
+# define RV_IRQ_EXT		IRQ_M_EXT
+#else /* CONFIG_RISCV_M_MODE */
+# define CSR_STATUS	CSR_SSTATUS
+# define CSR_IE		CSR_SIE
+# define CSR_TVEC	CSR_STVEC
+# define CSR_SCRATCH	CSR_SSCRATCH
+# define CSR_EPC	CSR_SEPC
+# define CSR_CAUSE	CSR_SCAUSE
+# define CSR_TVAL	CSR_STVAL
+# define CSR_IP		CSR_SIP
+
+# define CSR_IEH		CSR_SIEH
+# define CSR_ISELECT	CSR_SISELECT
+# define CSR_IREG	CSR_SIREG
+# define CSR_IPH		CSR_SIPH
+# define CSR_TOPEI	CSR_STOPEI
+# define CSR_TOPI	CSR_STOPI
+
+# define SR_IE		SR_SIE
+# define SR_PIE		SR_SPIE
+# define SR_PP		SR_SPP
+
+# define RV_IRQ_SOFT		IRQ_S_SOFT
+# define RV_IRQ_TIMER	IRQ_S_TIMER
+# define RV_IRQ_EXT		IRQ_S_EXT
+# define RV_IRQ_PMU	IRQ_PMU_OVF
+# define SIP_LCOFIP     (_AC(0x1, UL) << IRQ_PMU_OVF)
+
+#endif /* !CONFIG_RISCV_M_MODE */
+
+/* IE/IP (Supervisor/Machine Interrupt Enable/Pending) flags */
+#define IE_SIE		(_AC(0x1, UL) << RV_IRQ_SOFT)
+#define IE_TIE		(_AC(0x1, UL) << RV_IRQ_TIMER)
+#define IE_EIE		(_AC(0x1, UL) << RV_IRQ_EXT)
+
+#ifdef __ASSEMBLY__
+#define __ASM_STR(x)    x
+#else
+#define __ASM_STR(x)    #x
+#endif
+
+#ifndef __ASSEMBLY__
+
+#define csr_swap(csr, val)					\
+({								\
+	unsigned long __v = (unsigned long)(val);		\
+	__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1"\
+			      : "=r" (__v) : "rK" (__v)		\
+			      : "memory");			\
+	__v;							\
+})
+
+#define csr_read(csr)						\
+({								\
+	register unsigned long __v;				\
+	__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr)	\
+			      : "=r" (__v) :			\
+			      : "memory");			\
+	__v;							\
+})
+
+#define csr_write(csr, val)					\
+({								\
+	unsigned long __v = (unsigned long)(val);		\
+	__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0"	\
+			      : : "rK" (__v)			\
+			      : "memory");			\
+})
+
+#define csr_read_set(csr, val)					\
+({								\
+	unsigned long __v = (unsigned long)(val);		\
+	__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1"\
+			      : "=r" (__v) : "rK" (__v)		\
+			      : "memory");			\
+	__v;							\
+})
+
+#define csr_set(csr, val)					\
+({								\
+	unsigned long __v = (unsigned long)(val);		\
+	__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0"	\
+			      : : "rK" (__v)			\
+			      : "memory");			\
+})
+
+#define csr_read_clear(csr, val)				\
+({								\
+	unsigned long __v = (unsigned long)(val);		\
+	__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1"\
+			      : "=r" (__v) : "rK" (__v)		\
+			      : "memory");			\
+	__v;							\
+})
+
+#define csr_clear(csr, val)					\
+({								\
+	unsigned long __v = (unsigned long)(val);		\
+	__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0"	\
+			      : : "rK" (__v)			\
+			      : "memory");			\
+})
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_RISCV_CSR_H */
diff --git a/tools/arch/riscv/include/asm/vdso/processor.h b/tools/arch/riscv/include/asm/vdso/processor.h
new file mode 100644
index 000000000000..662aca039848
--- /dev/null
+++ b/tools/arch/riscv/include/asm/vdso/processor.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_VDSO_PROCESSOR_H
+#define __ASM_VDSO_PROCESSOR_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm-generic/barrier.h>
+
+static inline void cpu_relax(void)
+{
+#ifdef __riscv_muldiv
+	int dummy;
+	/* In lieu of a halt instruction, induce a long-latency stall. */
+	__asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy));
+#endif
+
+#ifdef CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE
+	/*
+	 * Reduce instruction retirement.
+	 * This assumes the PC changes.
+	 */
+	__asm__ __volatile__ ("pause");
+#else
+	/* Encoding of the pause instruction */
+	__asm__ __volatile__ (".4byte 0x100000F");
+#endif
+	barrier();
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h
index abe926d43cbe..05eaf6db3ad4 100644
--- a/tools/arch/s390/include/uapi/asm/kvm.h
+++ b/tools/arch/s390/include/uapi/asm/kvm.h
@@ -12,7 +12,320 @@
 #include <linux/types.h>
 
 #define __KVM_S390
-#define __KVM_HAVE_GUEST_DEBUG
+
+struct kvm_s390_skeys {
+	__u64 start_gfn;
+	__u64 count;
+	__u64 skeydata_addr;
+	__u32 flags;
+	__u32 reserved[9];
+};
+
+#define KVM_S390_CMMA_PEEK (1 << 0)
+
+/**
+ * kvm_s390_cmma_log - Used for CMMA migration.
+ *
+ * Used both for input and output.
+ *
+ * @start_gfn: Guest page number to start from.
+ * @count: Size of the result buffer.
+ * @flags: Control operation mode via KVM_S390_CMMA_* flags
+ * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
+ *             pages are still remaining.
+ * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
+ *        in the PGSTE.
+ * @values: Pointer to the values buffer.
+ *
+ * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
+ */
+struct kvm_s390_cmma_log {
+	__u64 start_gfn;
+	__u32 count;
+	__u32 flags;
+	union {
+		__u64 remaining;
+		__u64 mask;
+	};
+	__u64 values;
+};
+
+#define KVM_S390_RESET_POR       1
+#define KVM_S390_RESET_CLEAR     2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT  8
+#define KVM_S390_RESET_IPL       16
+
+/* for KVM_S390_MEM_OP */
+struct kvm_s390_mem_op {
+	/* in */
+	__u64 gaddr;		/* the guest address */
+	__u64 flags;		/* flags */
+	__u32 size;		/* amount of bytes */
+	__u32 op;		/* type of operation */
+	__u64 buf;		/* buffer in userspace */
+	union {
+		struct {
+			__u8 ar;	/* the access register number */
+			__u8 key;	/* access key, ignored if flag unset */
+			__u8 pad1[6];	/* ignored */
+			__u64 old_addr;	/* ignored if cmpxchg flag unset */
+		};
+		__u32 sida_offset; /* offset into the sida */
+		__u8 reserved[32]; /* ignored */
+	};
+};
+/* types for kvm_s390_mem_op->op */
+#define KVM_S390_MEMOP_LOGICAL_READ	0
+#define KVM_S390_MEMOP_LOGICAL_WRITE	1
+#define KVM_S390_MEMOP_SIDA_READ	2
+#define KVM_S390_MEMOP_SIDA_WRITE	3
+#define KVM_S390_MEMOP_ABSOLUTE_READ	4
+#define KVM_S390_MEMOP_ABSOLUTE_WRITE	5
+#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG	6
+
+/* flags for kvm_s390_mem_op->flags */
+#define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
+#define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
+#define KVM_S390_MEMOP_F_SKEY_PROTECTION	(1ULL << 2)
+
+/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */
+#define KVM_S390_MEMOP_EXTENSION_CAP_BASE	(1 << 0)
+#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG	(1 << 1)
+
+struct kvm_s390_psw {
+	__u64 mask;
+	__u64 addr;
+};
+
+/* valid values for type in kvm_s390_interrupt */
+#define KVM_S390_SIGP_STOP		0xfffe0000u
+#define KVM_S390_PROGRAM_INT		0xfffe0001u
+#define KVM_S390_SIGP_SET_PREFIX	0xfffe0002u
+#define KVM_S390_RESTART		0xfffe0003u
+#define KVM_S390_INT_PFAULT_INIT	0xfffe0004u
+#define KVM_S390_INT_PFAULT_DONE	0xfffe0005u
+#define KVM_S390_MCHK			0xfffe1000u
+#define KVM_S390_INT_CLOCK_COMP		0xffff1004u
+#define KVM_S390_INT_CPU_TIMER		0xffff1005u
+#define KVM_S390_INT_VIRTIO		0xffff2603u
+#define KVM_S390_INT_SERVICE		0xffff2401u
+#define KVM_S390_INT_EMERGENCY		0xffff1201u
+#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
+/* Anything below 0xfffe0000u is taken by INT_IO */
+#define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
+	(((schid)) |			       \
+	 ((ssid) << 16) |		       \
+	 ((cssid) << 18) |		       \
+	 ((ai) << 26))
+#define KVM_S390_INT_IO_MIN		0x00000000u
+#define KVM_S390_INT_IO_MAX		0xfffdffffu
+#define KVM_S390_INT_IO_AI_MASK		0x04000000u
+
+
+struct kvm_s390_interrupt {
+	__u32 type;
+	__u32 parm;
+	__u64 parm64;
+};
+
+struct kvm_s390_io_info {
+	__u16 subchannel_id;
+	__u16 subchannel_nr;
+	__u32 io_int_parm;
+	__u32 io_int_word;
+};
+
+struct kvm_s390_ext_info {
+	__u32 ext_params;
+	__u32 pad;
+	__u64 ext_params2;
+};
+
+struct kvm_s390_pgm_info {
+	__u64 trans_exc_code;
+	__u64 mon_code;
+	__u64 per_address;
+	__u32 data_exc_code;
+	__u16 code;
+	__u16 mon_class_nr;
+	__u8 per_code;
+	__u8 per_atmid;
+	__u8 exc_access_id;
+	__u8 per_access_id;
+	__u8 op_access_id;
+#define KVM_S390_PGM_FLAGS_ILC_VALID	0x01
+#define KVM_S390_PGM_FLAGS_ILC_0	0x02
+#define KVM_S390_PGM_FLAGS_ILC_1	0x04
+#define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
+#define KVM_S390_PGM_FLAGS_NO_REWIND	0x08
+	__u8 flags;
+	__u8 pad[2];
+};
+
+struct kvm_s390_prefix_info {
+	__u32 address;
+};
+
+struct kvm_s390_extcall_info {
+	__u16 code;
+};
+
+struct kvm_s390_emerg_info {
+	__u16 code;
+};
+
+#define KVM_S390_STOP_FLAG_STORE_STATUS	0x01
+struct kvm_s390_stop_info {
+	__u32 flags;
+};
+
+struct kvm_s390_mchk_info {
+	__u64 cr14;
+	__u64 mcic;
+	__u64 failing_storage_address;
+	__u32 ext_damage_code;
+	__u32 pad;
+	__u8 fixed_logout[16];
+};
+
+struct kvm_s390_irq {
+	__u64 type;
+	union {
+		struct kvm_s390_io_info io;
+		struct kvm_s390_ext_info ext;
+		struct kvm_s390_pgm_info pgm;
+		struct kvm_s390_emerg_info emerg;
+		struct kvm_s390_extcall_info extcall;
+		struct kvm_s390_prefix_info prefix;
+		struct kvm_s390_stop_info stop;
+		struct kvm_s390_mchk_info mchk;
+		char reserved[64];
+	} u;
+};
+
+struct kvm_s390_irq_state {
+	__u64 buf;
+	__u32 flags;        /* will stay unused for compatibility reasons */
+	__u32 len;
+	__u32 reserved[4];  /* will stay unused for compatibility reasons */
+};
+
+struct kvm_s390_ucas_mapping {
+	__u64 user_addr;
+	__u64 vcpu_addr;
+	__u64 length;
+};
+
+struct kvm_s390_pv_sec_parm {
+	__u64 origin;
+	__u64 length;
+};
+
+struct kvm_s390_pv_unp {
+	__u64 addr;
+	__u64 size;
+	__u64 tweak;
+};
+
+enum pv_cmd_dmp_id {
+	KVM_PV_DUMP_INIT,
+	KVM_PV_DUMP_CONFIG_STOR_STATE,
+	KVM_PV_DUMP_COMPLETE,
+	KVM_PV_DUMP_CPU,
+};
+
+struct kvm_s390_pv_dmp {
+	__u64 subcmd;
+	__u64 buff_addr;
+	__u64 buff_len;
+	__u64 gaddr;		/* For dump storage state */
+	__u64 reserved[4];
+};
+
+enum pv_cmd_info_id {
+	KVM_PV_INFO_VM,
+	KVM_PV_INFO_DUMP,
+};
+
+struct kvm_s390_pv_info_dump {
+	__u64 dump_cpu_buffer_len;
+	__u64 dump_config_mem_buffer_per_1m;
+	__u64 dump_config_finalize_len;
+};
+
+struct kvm_s390_pv_info_vm {
+	__u64 inst_calls_list[4];
+	__u64 max_cpus;
+	__u64 max_guests;
+	__u64 max_guest_addr;
+	__u64 feature_indication;
+};
+
+struct kvm_s390_pv_info_header {
+	__u32 id;
+	__u32 len_max;
+	__u32 len_written;
+	__u32 reserved;
+};
+
+struct kvm_s390_pv_info {
+	struct kvm_s390_pv_info_header header;
+	union {
+		struct kvm_s390_pv_info_dump dump;
+		struct kvm_s390_pv_info_vm vm;
+	};
+};
+
+enum pv_cmd_id {
+	KVM_PV_ENABLE,
+	KVM_PV_DISABLE,
+	KVM_PV_SET_SEC_PARMS,
+	KVM_PV_UNPACK,
+	KVM_PV_VERIFY,
+	KVM_PV_PREP_RESET,
+	KVM_PV_UNSHARE_ALL,
+	KVM_PV_INFO,
+	KVM_PV_DUMP,
+	KVM_PV_ASYNC_CLEANUP_PREPARE,
+	KVM_PV_ASYNC_CLEANUP_PERFORM,
+};
+
+struct kvm_pv_cmd {
+	__u32 cmd;	/* Command to be executed */
+	__u16 rc;	/* Ultravisor return code */
+	__u16 rrc;	/* Ultravisor return reason code */
+	__u64 data;	/* Data or address */
+	__u32 flags;    /* flags for future extensions. Must be 0 for now */
+	__u32 reserved[3];
+};
+
+struct kvm_s390_zpci_op {
+	/* in */
+	__u32 fh;               /* target device */
+	__u8  op;               /* operation to perform */
+	__u8  pad[3];
+	union {
+		/* for KVM_S390_ZPCIOP_REG_AEN */
+		struct {
+			__u64 ibv;      /* Guest addr of interrupt bit vector */
+			__u64 sb;       /* Guest addr of summary bit */
+			__u32 flags;
+			__u32 noi;      /* Number of interrupts */
+			__u8 isc;       /* Guest interrupt subclass */
+			__u8 sbo;       /* Offset of guest summary bit vector */
+			__u16 pad;
+		} reg_aen;
+		__u64 reserved[8];
+	} u;
+};
+
+/* types for kvm_s390_zpci_op->op */
+#define KVM_S390_ZPCIOP_REG_AEN                0
+#define KVM_S390_ZPCIOP_DEREG_AEN      1
+
+/* flags for kvm_s390_zpci_op->u.reg_aen.flags */
+#define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
 
 /* Device control API: s390-specific devices */
 #define KVM_DEV_FLIC_GET_ALL_IRQS	1
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 25160d26764b..a38f8f9ba657 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS			21	   /* N 32-bit words worth of info */
+#define NCAPINTS			22	   /* N 32-bit words worth of info */
 #define NBUGINTS			2	   /* N 32-bit bug flags */
 
 /*
@@ -81,10 +81,8 @@
 #define X86_FEATURE_K6_MTRR		( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
 #define X86_FEATURE_CYRIX_ARR		( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
 #define X86_FEATURE_CENTAUR_MCR		( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-
-/* CPU types for specific tunings: */
 #define X86_FEATURE_K8			( 3*32+ 4) /* "" Opteron, Athlon64 */
-/* FREE, was #define X86_FEATURE_K7			( 3*32+ 5) "" Athlon */
+#define X86_FEATURE_ZEN5		( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */
 #define X86_FEATURE_P3			( 3*32+ 6) /* "" P3 */
 #define X86_FEATURE_P4			( 3*32+ 7) /* "" P4 */
 #define X86_FEATURE_CONSTANT_TSC	( 3*32+ 8) /* TSC ticks at a constant rate */
@@ -97,7 +95,7 @@
 #define X86_FEATURE_SYSENTER32		( 3*32+15) /* "" sysenter in IA32 userspace */
 #define X86_FEATURE_REP_GOOD		( 3*32+16) /* REP microcode works well */
 #define X86_FEATURE_AMD_LBR_V2		( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-/* FREE, was #define X86_FEATURE_LFENCE_RDTSC		( 3*32+18) "" LFENCE synchronizes RDTSC */
+#define X86_FEATURE_CLEAR_CPU_BUF	( 3*32+18) /* "" Clear CPU buffers using VERW */
 #define X86_FEATURE_ACC_POWER		( 3*32+19) /* AMD Accumulated Power Mechanism */
 #define X86_FEATURE_NOPL		( 3*32+20) /* The NOPL (0F 1F) instructions */
 #define X86_FEATURE_ALWAYS		( 3*32+21) /* "" Always-present feature */
@@ -462,6 +460,14 @@
 #define X86_FEATURE_SRSO_NO		(20*32+29) /* "" CPU is not affected by SRSO */
 
 /*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0x80000022, etc.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_AMD_LBR_PMC_FREEZE	(21*32+ 0) /* AMD LBR and PMC Freeze */
+
+/*
  * BUG word(s)
  */
 #define X86_BUG(x)			(NCAPINTS*32 + (x))
@@ -508,4 +514,5 @@
 /* BUG word 2 */
 #define X86_BUG_SRSO			X86_BUG(1*32 + 0) /* AMD SRSO bug */
 #define X86_BUG_DIV0			X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
+#define X86_BUG_RFDS			X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index 1f23960d2b06..c492bdc97b05 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -123,6 +123,12 @@
 # define DISABLE_FRED	(1 << (X86_FEATURE_FRED & 31))
 #endif
 
+#ifdef CONFIG_KVM_AMD_SEV
+#define DISABLE_SEV_SNP		0
+#else
+#define DISABLE_SEV_SNP		(1 << (X86_FEATURE_SEV_SNP & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -147,8 +153,9 @@
 			 DISABLE_ENQCMD)
 #define DISABLED_MASK17	0
 #define DISABLED_MASK18	(DISABLE_IBT)
-#define DISABLED_MASK19	0
+#define DISABLED_MASK19	(DISABLE_SEV_SNP)
 #define DISABLED_MASK20	0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define DISABLED_MASK21	0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
 
 #endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h
index 3a19904c2db6..d18bfb238f66 100644
--- a/tools/arch/x86/include/asm/irq_vectors.h
+++ b/tools/arch/x86/include/asm/irq_vectors.h
@@ -84,11 +84,9 @@
 #define HYPERVISOR_CALLBACK_VECTOR	0xf3
 
 /* Vector for KVM to deliver posted interrupt IPI */
-#ifdef CONFIG_HAVE_KVM
 #define POSTED_INTR_VECTOR		0xf2
 #define POSTED_INTR_WAKEUP_VECTOR	0xf1
 #define POSTED_INTR_NESTED_VECTOR	0xf0
-#endif
 
 #define MANAGED_IRQ_SHUTDOWN_VECTOR	0xef
 
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 1f9dc9bd13eb..05956bd8bacf 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -176,6 +176,14 @@
 						 * CPU is not vulnerable to Gather
 						 * Data Sampling (GDS).
 						 */
+#define ARCH_CAP_RFDS_NO		BIT(27)	/*
+						 * Not susceptible to Register
+						 * File Data Sampling.
+						 */
+#define ARCH_CAP_RFDS_CLEAR		BIT(28)	/*
+						 * VERW clears CPU Register
+						 * File.
+						 */
 
 #define ARCH_CAP_XAPIC_DISABLE		BIT(21)	/*
 						 * IA32_XAPIC_DISABLE_STATUS MSR
@@ -605,34 +613,47 @@
 #define MSR_AMD64_SEV_ES_GHCB		0xc0010130
 #define MSR_AMD64_SEV			0xc0010131
 #define MSR_AMD64_SEV_ENABLED_BIT	0
-#define MSR_AMD64_SEV_ES_ENABLED_BIT	1
-#define MSR_AMD64_SEV_SNP_ENABLED_BIT	2
 #define MSR_AMD64_SEV_ENABLED		BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+#define MSR_AMD64_SEV_ES_ENABLED_BIT	1
 #define MSR_AMD64_SEV_ES_ENABLED	BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+#define MSR_AMD64_SEV_SNP_ENABLED_BIT	2
 #define MSR_AMD64_SEV_SNP_ENABLED	BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
-
-/* SNP feature bits enabled by the hypervisor */
-#define MSR_AMD64_SNP_VTOM			BIT_ULL(3)
-#define MSR_AMD64_SNP_REFLECT_VC		BIT_ULL(4)
-#define MSR_AMD64_SNP_RESTRICTED_INJ		BIT_ULL(5)
-#define MSR_AMD64_SNP_ALT_INJ			BIT_ULL(6)
-#define MSR_AMD64_SNP_DEBUG_SWAP		BIT_ULL(7)
-#define MSR_AMD64_SNP_PREVENT_HOST_IBS		BIT_ULL(8)
-#define MSR_AMD64_SNP_BTB_ISOLATION		BIT_ULL(9)
-#define MSR_AMD64_SNP_VMPL_SSS			BIT_ULL(10)
-#define MSR_AMD64_SNP_SECURE_TSC		BIT_ULL(11)
-#define MSR_AMD64_SNP_VMGEXIT_PARAM		BIT_ULL(12)
-#define MSR_AMD64_SNP_IBS_VIRT			BIT_ULL(14)
-#define MSR_AMD64_SNP_VMSA_REG_PROTECTION	BIT_ULL(16)
-#define MSR_AMD64_SNP_SMT_PROTECTION		BIT_ULL(17)
-
-/* SNP feature bits reserved for future use. */
-#define MSR_AMD64_SNP_RESERVED_BIT13		BIT_ULL(13)
-#define MSR_AMD64_SNP_RESERVED_BIT15		BIT_ULL(15)
-#define MSR_AMD64_SNP_RESERVED_MASK		GENMASK_ULL(63, 18)
+#define MSR_AMD64_SNP_VTOM_BIT		3
+#define MSR_AMD64_SNP_VTOM		BIT_ULL(MSR_AMD64_SNP_VTOM_BIT)
+#define MSR_AMD64_SNP_REFLECT_VC_BIT	4
+#define MSR_AMD64_SNP_REFLECT_VC	BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT)
+#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5
+#define MSR_AMD64_SNP_RESTRICTED_INJ	BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT)
+#define MSR_AMD64_SNP_ALT_INJ_BIT	6
+#define MSR_AMD64_SNP_ALT_INJ		BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT)
+#define MSR_AMD64_SNP_DEBUG_SWAP_BIT	7
+#define MSR_AMD64_SNP_DEBUG_SWAP	BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT)
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS	BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT)
+#define MSR_AMD64_SNP_BTB_ISOLATION_BIT	9
+#define MSR_AMD64_SNP_BTB_ISOLATION	BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT)
+#define MSR_AMD64_SNP_VMPL_SSS_BIT	10
+#define MSR_AMD64_SNP_VMPL_SSS		BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT)
+#define MSR_AMD64_SNP_SECURE_TSC_BIT	11
+#define MSR_AMD64_SNP_SECURE_TSC	BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT)
+#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT	12
+#define MSR_AMD64_SNP_VMGEXIT_PARAM	BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT)
+#define MSR_AMD64_SNP_RESERVED_BIT13	BIT_ULL(13)
+#define MSR_AMD64_SNP_IBS_VIRT_BIT	14
+#define MSR_AMD64_SNP_IBS_VIRT		BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT)
+#define MSR_AMD64_SNP_RESERVED_BIT15	BIT_ULL(15)
+#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT	16
+#define MSR_AMD64_SNP_VMSA_REG_PROT	BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT)
+#define MSR_AMD64_SNP_SMT_PROT_BIT	17
+#define MSR_AMD64_SNP_SMT_PROT		BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
+#define MSR_AMD64_SNP_RESV_BIT		18
+#define MSR_AMD64_SNP_RESERVED_MASK	GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
 
 #define MSR_AMD64_VIRT_SPEC_CTRL	0xc001011f
 
+#define MSR_AMD64_RMP_BASE		0xc0010132
+#define MSR_AMD64_RMP_END		0xc0010133
+
 /* AMD Collaborative Processor Performance Control MSRs */
 #define MSR_AMD_CPPC_CAP1		0xc00102b0
 #define MSR_AMD_CPPC_ENABLE		0xc00102b1
@@ -719,8 +740,15 @@
 #define MSR_K8_TOP_MEM1			0xc001001a
 #define MSR_K8_TOP_MEM2			0xc001001d
 #define MSR_AMD64_SYSCFG		0xc0010010
-#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT	23
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23
 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT	BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_AMD64_SYSCFG_SNP_EN_BIT	24
+#define MSR_AMD64_SYSCFG_SNP_EN		BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT)
+#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25
+#define MSR_AMD64_SYSCFG_SNP_VMPL_EN	BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT)
+#define MSR_AMD64_SYSCFG_MFDM_BIT	19
+#define MSR_AMD64_SYSCFG_MFDM		BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT)
+
 #define MSR_K8_INT_PENDING_MSG		0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK		0x18000000
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index 7ba1726b71c7..e9187ddd3d1f 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -99,6 +99,7 @@
 #define REQUIRED_MASK18	0
 #define REQUIRED_MASK19	0
 #define REQUIRED_MASK20	0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define REQUIRED_MASK21	0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index a448d0964fc0..ef11aa4cab42 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -7,6 +7,8 @@
  *
  */
 
+#include <linux/const.h>
+#include <linux/bits.h>
 #include <linux/types.h>
 #include <linux/ioctl.h>
 #include <linux/stddef.h>
@@ -40,7 +42,6 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
-#define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_MSIX
 #define __KVM_HAVE_MCE
 #define __KVM_HAVE_PIT_STATE2
@@ -49,7 +50,6 @@
 #define __KVM_HAVE_DEBUGREGS
 #define __KVM_HAVE_XSAVE
 #define __KVM_HAVE_XCRS
-#define __KVM_HAVE_READONLY_MEM
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -526,9 +526,301 @@ struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
-#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0)
 #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
 
+/* for KVM_CAP_MCE */
+struct kvm_x86_mce {
+	__u64 status;
+	__u64 addr;
+	__u64 misc;
+	__u64 mcg_status;
+	__u8 bank;
+	__u8 pad1[7];
+	__u64 pad2[3];
+};
+
+/* for KVM_CAP_XEN_HVM */
+#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
+#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND		(1 << 5)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG	(1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA	(1 << 8)
+
+struct kvm_xen_hvm_config {
+	__u32 flags;
+	__u32 msr;
+	__u64 blob_addr_32;
+	__u64 blob_addr_64;
+	__u8 blob_size_32;
+	__u8 blob_size_64;
+	__u8 pad2[30];
+};
+
+struct kvm_xen_hvm_attr {
+	__u16 type;
+	__u16 pad[3];
+	union {
+		__u8 long_mode;
+		__u8 vector;
+		__u8 runstate_update_flag;
+		union {
+			__u64 gfn;
+#define KVM_XEN_INVALID_GFN ((__u64)-1)
+			__u64 hva;
+		} shared_info;
+		struct {
+			__u32 send_port;
+			__u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+			__u32 flags;
+#define KVM_XEN_EVTCHN_DEASSIGN		(1 << 0)
+#define KVM_XEN_EVTCHN_UPDATE		(1 << 1)
+#define KVM_XEN_EVTCHN_RESET		(1 << 2)
+			/*
+			 * Events sent by the guest are either looped back to
+			 * the guest itself (potentially on a different port#)
+			 * or signalled via an eventfd.
+			 */
+			union {
+				struct {
+					__u32 port;
+					__u32 vcpu;
+					__u32 priority;
+				} port;
+				struct {
+					__u32 port; /* Zero for eventfd */
+					__s32 fd;
+				} eventfd;
+				__u32 padding[4];
+			} deliver;
+		} evtchn;
+		__u32 xen_version;
+		__u64 pad[8];
+	} u;
+};
+
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_ATTR_TYPE_LONG_MODE		0x0
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO		0x1
+#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR		0x2
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_ATTR_TYPE_EVTCHN		0x3
+#define KVM_XEN_ATTR_TYPE_XEN_VERSION		0x4
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
+#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG	0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA	0x6
+
+struct kvm_xen_vcpu_attr {
+	__u16 type;
+	__u16 pad[3];
+	union {
+		__u64 gpa;
+#define KVM_XEN_INVALID_GPA ((__u64)-1)
+		__u64 hva;
+		__u64 pad[8];
+		struct {
+			__u64 state;
+			__u64 state_entry_time;
+			__u64 time_running;
+			__u64 time_runnable;
+			__u64 time_blocked;
+			__u64 time_offline;
+		} runstate;
+		__u32 vcpu_id;
+		struct {
+			__u32 port;
+			__u32 priority;
+			__u64 expires_ns;
+		} timer;
+		__u8 vector;
+	} u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO	0x0
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO	0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR	0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT	0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA	0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST	0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID		0x6
+#define KVM_XEN_VCPU_ATTR_TYPE_TIMER		0x7
+#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR	0x8
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA	0x9
+
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+	/* Guest initialization commands */
+	KVM_SEV_INIT = 0,
+	KVM_SEV_ES_INIT,
+	/* Guest launch commands */
+	KVM_SEV_LAUNCH_START,
+	KVM_SEV_LAUNCH_UPDATE_DATA,
+	KVM_SEV_LAUNCH_UPDATE_VMSA,
+	KVM_SEV_LAUNCH_SECRET,
+	KVM_SEV_LAUNCH_MEASURE,
+	KVM_SEV_LAUNCH_FINISH,
+	/* Guest migration commands (outgoing) */
+	KVM_SEV_SEND_START,
+	KVM_SEV_SEND_UPDATE_DATA,
+	KVM_SEV_SEND_UPDATE_VMSA,
+	KVM_SEV_SEND_FINISH,
+	/* Guest migration commands (incoming) */
+	KVM_SEV_RECEIVE_START,
+	KVM_SEV_RECEIVE_UPDATE_DATA,
+	KVM_SEV_RECEIVE_UPDATE_VMSA,
+	KVM_SEV_RECEIVE_FINISH,
+	/* Guest status and debug commands */
+	KVM_SEV_GUEST_STATUS,
+	KVM_SEV_DBG_DECRYPT,
+	KVM_SEV_DBG_ENCRYPT,
+	/* Guest certificates commands */
+	KVM_SEV_CERT_EXPORT,
+	/* Attestation report */
+	KVM_SEV_GET_ATTESTATION_REPORT,
+	/* Guest Migration Extension */
+	KVM_SEV_SEND_CANCEL,
+
+	KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+	__u32 id;
+	__u32 pad0;
+	__u64 data;
+	__u32 error;
+	__u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 dh_uaddr;
+	__u32 dh_len;
+	__u32 pad0;
+	__u64 session_uaddr;
+	__u32 session_len;
+	__u32 pad1;
+};
+
+struct kvm_sev_launch_update_data {
+	__u64 uaddr;
+	__u32 len;
+	__u32 pad0;
+};
+
+
+struct kvm_sev_launch_secret {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u32 pad0;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u32 pad1;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+	__u32 pad2;
+};
+
+struct kvm_sev_launch_measure {
+	__u64 uaddr;
+	__u32 len;
+	__u32 pad0;
+};
+
+struct kvm_sev_guest_status {
+	__u32 handle;
+	__u32 policy;
+	__u32 state;
+};
+
+struct kvm_sev_dbg {
+	__u64 src_uaddr;
+	__u64 dst_uaddr;
+	__u32 len;
+	__u32 pad0;
+};
+
+struct kvm_sev_attestation_report {
+	__u8 mnonce[16];
+	__u64 uaddr;
+	__u32 len;
+	__u32 pad0;
+};
+
+struct kvm_sev_send_start {
+	__u32 policy;
+	__u32 pad0;
+	__u64 pdh_cert_uaddr;
+	__u32 pdh_cert_len;
+	__u32 pad1;
+	__u64 plat_certs_uaddr;
+	__u32 plat_certs_len;
+	__u32 pad2;
+	__u64 amd_certs_uaddr;
+	__u32 amd_certs_len;
+	__u32 pad3;
+	__u64 session_uaddr;
+	__u32 session_len;
+	__u32 pad4;
+};
+
+struct kvm_sev_send_update_data {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u32 pad0;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u32 pad1;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+	__u32 pad2;
+};
+
+struct kvm_sev_receive_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 pdh_uaddr;
+	__u32 pdh_len;
+	__u32 pad0;
+	__u64 session_uaddr;
+	__u32 session_len;
+	__u32 pad1;
+};
+
+struct kvm_sev_receive_update_data {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u32 pad0;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u32 pad1;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+	__u32 pad2;
+};
+
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
+struct kvm_hyperv_eventfd {
+	__u32 conn_id;
+	__s32 fd;
+	__u32 flags;
+	__u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
+
 /*
  * Masked event layout.
  * Bits   Description
@@ -549,10 +841,10 @@ struct kvm_pmu_event_filter {
 	((__u64)(!!(exclude)) << 55))
 
 #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
-	(GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
-#define KVM_PMU_MASKED_ENTRY_UMASK_MASK		(GENMASK_ULL(63, 56))
-#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH	(GENMASK_ULL(15, 8))
-#define KVM_PMU_MASKED_ENTRY_EXCLUDE		(BIT_ULL(55))
+	(__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK		(__GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH	(__GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE		(_BITULL(55))
 #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT	(56)
 
 /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
@@ -560,7 +852,7 @@ struct kvm_pmu_event_filter {
 #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
 
 /* x86-specific KVM_EXIT_HYPERCALL flags. */
-#define KVM_EXIT_HYPERCALL_LONG_MODE	BIT(0)
+#define KVM_EXIT_HYPERCALL_LONG_MODE	_BITULL(0)
 
 #define KVM_X86_DEFAULT_VM	0
 #define KVM_X86_SW_PROTECTED_VM	1
diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
index ac8487dcff1d..4315652678b9 100644
--- a/tools/bpf/bpftool/Documentation/Makefile
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -31,9 +31,9 @@ see_also = $(subst " ",, \
 	"\n" \
 	"SEE ALSO\n" \
 	"========\n" \
-	"\t**bpf**\ (2),\n" \
-	"\t**bpf-helpers**\\ (7)" \
-	$(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \
+	"**bpf**\ (2),\n" \
+	"**bpf-helpers**\\ (7)" \
+	$(foreach page,$(call list_pages,$(1)),",\n**$(page)**\\ (8)") \
 	"\n")
 
 $(OUTPUT)%.8: %.rst
diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 342716f74ec4..eaba24320fb2 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -14,82 +14,76 @@ tool for inspection of BTF data
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **btf** *COMMAND*
+**bpftool** [*OPTIONS*] **btf** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } }
+*OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } }
 
-	*COMMANDS* := { **dump** | **help** }
+*COMMANDS* := { **dump** | **help** }
 
 BTF COMMANDS
 =============
 
-|	**bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*]
-|	**bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*]
-|	**bpftool** **btf help**
+| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*]
+| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*]
+| **bpftool** **btf help**
 |
-|	*BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* }
-|	*FORMAT* := { **raw** | **c** }
-|	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
-|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* }
+| *FORMAT* := { **raw** | **c** }
+| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
 
 DESCRIPTION
 ===========
-	**bpftool btf { show | list }** [**id** *BTF_ID*]
-		  Show information about loaded BTF objects. If a BTF ID is
-		  specified, show information only about given BTF object,
-		  otherwise list all BTF objects currently loaded on the
-		  system.
+bpftool btf { show | list } [id *BTF_ID*]
+    Show information about loaded BTF objects. If a BTF ID is specified, show
+    information only about given BTF object, otherwise list all BTF objects
+    currently loaded on the system.
 
-		  Since Linux 5.8 bpftool is able to discover information about
-		  processes that hold open file descriptors (FDs) against BTF
-		  objects. On such kernels bpftool will automatically emit this
-		  information as well.
+    Since Linux 5.8 bpftool is able to discover information about processes
+    that hold open file descriptors (FDs) against BTF objects. On such kernels
+    bpftool will automatically emit this information as well.
 
-	**bpftool btf dump** *BTF_SRC*
-		  Dump BTF entries from a given *BTF_SRC*.
+bpftool btf dump *BTF_SRC*
+    Dump BTF entries from a given *BTF_SRC*.
 
-		  When **id** is specified, BTF object with that ID will be
-		  loaded and all its BTF types emitted.
+    When **id** is specified, BTF object with that ID will be loaded and all
+    its BTF types emitted.
 
-		  When **map** is provided, it's expected that map has
-		  associated BTF object with BTF types describing key and
-		  value. It's possible to select whether to dump only BTF
-		  type(s) associated with key (**key**), value (**value**),
-		  both key and value (**kv**), or all BTF types present in
-		  associated BTF object (**all**). If not specified, **kv**
-		  is assumed.
+    When **map** is provided, it's expected that map has associated BTF object
+    with BTF types describing key and value. It's possible to select whether to
+    dump only BTF type(s) associated with key (**key**), value (**value**),
+    both key and value (**kv**), or all BTF types present in associated BTF
+    object (**all**). If not specified, **kv** is assumed.
 
-		  When **prog** is provided, it's expected that program has
-		  associated BTF object with BTF types.
+    When **prog** is provided, it's expected that program has associated BTF
+    object with BTF types.
 
-		  When specifying *FILE*, an ELF file is expected, containing
-		  .BTF section with well-defined BTF binary format data,
-		  typically produced by clang or pahole.
+    When specifying *FILE*, an ELF file is expected, containing .BTF section
+    with well-defined BTF binary format data, typically produced by clang or
+    pahole.
 
-		  **format** option can be used to override default (raw)
-		  output format. Raw (**raw**) or C-syntax (**c**) output
-		  formats are supported.
+    **format** option can be used to override default (raw) output format. Raw
+    (**raw**) or C-syntax (**c**) output formats are supported.
 
-	**bpftool btf help**
-		  Print short help message.
+bpftool btf help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
-
-	-B, --base-btf *FILE*
-		  Pass a base BTF object. Base BTF objects are typically used
-		  with BTF objects for kernel modules. To avoid duplicating
-		  all kernel symbols required by modules, BTF objects for
-		  modules are "split", they are built incrementally on top of
-		  the kernel (vmlinux) BTF object. So the base BTF reference
-		  should usually point to the kernel BTF.
-
-		  When the main BTF object to process (for example, the
-		  module BTF to dump) is passed as a *FILE*, bpftool attempts
-		  to autodetect the path for the base object, and passing
-		  this option is optional. When the main BTF object is passed
-		  through other handles, this option becomes necessary.
+.. include:: common_options.rst
+
+-B, --base-btf *FILE*
+    Pass a base BTF object. Base BTF objects are typically used with BTF
+    objects for kernel modules. To avoid duplicating all kernel symbols
+    required by modules, BTF objects for modules are "split", they are
+    built incrementally on top of the kernel (vmlinux) BTF object. So the
+    base BTF reference should usually point to the kernel BTF.
+
+    When the main BTF object to process (for example, the module BTF to
+    dump) is passed as a *FILE*, bpftool attempts to autodetect the path
+    for the base object, and passing this option is optional. When the main
+    BTF object is passed through other handles, this option becomes
+    necessary.
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 2ce900f66d6e..e8185596a759 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -14,134 +14,125 @@ tool for inspection and simple manipulation of eBPF progs
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **cgroup** *COMMAND*
+**bpftool** [*OPTIONS*] **cgroup** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } }
+*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } }
 
-	*COMMANDS* :=
-	{ **show** | **list** | **tree** | **attach** | **detach** | **help** }
+*COMMANDS* :=
+{ **show** | **list** | **tree** | **attach** | **detach** | **help** }
 
 CGROUP COMMANDS
 ===============
 
-|	**bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**]
-|	**bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**]
-|	**bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
-|	**bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
-|	**bpftool** **cgroup help**
+| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**]
+| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**]
+| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
+| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
+| **bpftool** **cgroup help**
 |
-|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
-|	*ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** |
-|		**cgroup_inet_sock_create** | **cgroup_sock_ops** |
-|		**cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** |
-|		**cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** |
-|		**cgroup_inet4_connect** | **cgroup_inet6_connect** |
-|		**cgroup_unix_connect** | **cgroup_inet4_getpeername** |
-|		**cgroup_inet6_getpeername** | **cgroup_unix_getpeername** |
-|		**cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** |
-|		**cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** |
-|		**cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** |
-|		**cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** |
-|		**cgroup_unix_recvmsg** | **cgroup_sysctl** |
-|		**cgroup_getsockopt** | **cgroup_setsockopt** |
-|		**cgroup_inet_sock_release** }
-|	*ATTACH_FLAGS* := { **multi** | **override** }
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
+| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** |
+|     **cgroup_inet_sock_create** | **cgroup_sock_ops** |
+|     **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** |
+|     **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** |
+|     **cgroup_inet4_connect** | **cgroup_inet6_connect** |
+|     **cgroup_unix_connect** | **cgroup_inet4_getpeername** |
+|     **cgroup_inet6_getpeername** | **cgroup_unix_getpeername** |
+|     **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** |
+|     **cgroup_unix_getsockname** | **cgroup_udp4_sendmsg** |
+|     **cgroup_udp6_sendmsg** | **cgroup_unix_sendmsg** |
+|     **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** |
+|     **cgroup_unix_recvmsg** | **cgroup_sysctl** |
+|     **cgroup_getsockopt** | **cgroup_setsockopt** |
+|     **cgroup_inet_sock_release** }
+| *ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
 ===========
-	**bpftool cgroup { show | list }** *CGROUP* [**effective**]
-		  List all programs attached to the cgroup *CGROUP*.
-
-		  Output will start with program ID followed by attach type,
-		  attach flags and program name.
-
-		  If **effective** is specified retrieve effective programs that
-		  will execute for events within a cgroup. This includes
-		  inherited along with attached ones.
-
-	**bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**]
-		  Iterate over all cgroups in *CGROUP_ROOT* and list all
-		  attached programs. If *CGROUP_ROOT* is not specified,
-		  bpftool uses cgroup v2 mountpoint.
-
-		  The output is similar to the output of cgroup show/list
-		  commands: it starts with absolute cgroup path, followed by
-		  program ID, attach type, attach flags and program name.
-
-		  If **effective** is specified retrieve effective programs that
-		  will execute for events within a cgroup. This includes
-		  inherited along with attached ones.
-
-	**bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
-		  Attach program *PROG* to the cgroup *CGROUP* with attach type
-		  *ATTACH_TYPE* and optional *ATTACH_FLAGS*.
-
-		  *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs
-		  some bpf program, the program in this cgroup yields to sub-cgroup
-		  program; **multi** if a sub-cgroup installs some bpf program,
-		  that cgroup program gets run in addition to the program in this
-		  cgroup.
-
-		  Only one program is allowed to be attached to a cgroup with
-		  no attach flags or the **override** flag. Attaching another
-		  program will release old program and attach the new one.
-
-		  Multiple programs are allowed to be attached to a cgroup with
-		  **multi**. They are executed in FIFO order (those that were
-		  attached first, run first).
-
-		  Non-default *ATTACH_FLAGS* are supported by kernel version 4.14
-		  and later.
-
-		  *ATTACH_TYPE* can be on of:
-		  **ingress** ingress path of the inet socket (since 4.10);
-		  **egress** egress path of the inet socket (since 4.10);
-		  **sock_create** opening of an inet socket (since 4.10);
-		  **sock_ops** various socket operations (since 4.12);
-		  **device** device access (since 4.15);
-		  **bind4** call to bind(2) for an inet4 socket (since 4.17);
-		  **bind6** call to bind(2) for an inet6 socket (since 4.17);
-		  **post_bind4** return from bind(2) for an inet4 socket (since 4.17);
-		  **post_bind6** return from bind(2) for an inet6 socket (since 4.17);
-		  **connect4** call to connect(2) for an inet4 socket (since 4.17);
-		  **connect6** call to connect(2) for an inet6 socket (since 4.17);
-		  **connect_unix** call to connect(2) for a unix socket (since 6.7);
-		  **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an
-		  unconnected udp4 socket (since 4.18);
-		  **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an
-		  unconnected udp6 socket (since 4.18);
-		  **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for
-		  an unconnected unix socket (since 6.7);
-		  **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for
-		  an unconnected udp4 socket (since 5.2);
-		  **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for
-		  an unconnected udp6 socket (since 5.2);
-		  **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for
-		  an unconnected unix socket (since 6.7);
-		  **sysctl** sysctl access (since 5.2);
-		  **getsockopt** call to getsockopt (since 5.3);
-		  **setsockopt** call to setsockopt (since 5.3);
-		  **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8);
-		  **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8);
-		  **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7);
-		  **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8);
-		  **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8).
-		  **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7);
-		  **sock_release** closing an userspace inet socket (since 5.9).
-
-	**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
-		  Detach *PROG* from the cgroup *CGROUP* and attach type
-		  *ATTACH_TYPE*.
-
-	**bpftool prog help**
-		  Print short help message.
+bpftool cgroup { show | list } *CGROUP* [effective]
+    List all programs attached to the cgroup *CGROUP*.
+
+    Output will start with program ID followed by attach type, attach flags and
+    program name.
+
+    If **effective** is specified retrieve effective programs that will execute
+    for events within a cgroup. This includes inherited along with attached
+    ones.
+
+bpftool cgroup tree [*CGROUP_ROOT*] [effective]
+    Iterate over all cgroups in *CGROUP_ROOT* and list all attached programs.
+    If *CGROUP_ROOT* is not specified, bpftool uses cgroup v2 mountpoint.
+
+    The output is similar to the output of cgroup show/list commands: it starts
+    with absolute cgroup path, followed by program ID, attach type, attach
+    flags and program name.
+
+    If **effective** is specified retrieve effective programs that will execute
+    for events within a cgroup. This includes inherited along with attached
+    ones.
+
+bpftool cgroup attach *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
+    Attach program *PROG* to the cgroup *CGROUP* with attach type *ATTACH_TYPE*
+    and optional *ATTACH_FLAGS*.
+
+    *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs some
+    bpf program, the program in this cgroup yields to sub-cgroup program;
+    **multi** if a sub-cgroup installs some bpf program, that cgroup program
+    gets run in addition to the program in this cgroup.
+
+    Only one program is allowed to be attached to a cgroup with no attach flags
+    or the **override** flag. Attaching another program will release old
+    program and attach the new one.
+
+    Multiple programs are allowed to be attached to a cgroup with **multi**.
+    They are executed in FIFO order (those that were attached first, run
+    first).
+
+    Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 and later.
+
+    *ATTACH_TYPE* can be one of:
+
+    - **ingress** ingress path of the inet socket (since 4.10)
+    - **egress** egress path of the inet socket (since 4.10)
+    - **sock_create** opening of an inet socket (since 4.10)
+    - **sock_ops** various socket operations (since 4.12)
+    - **device** device access (since 4.15)
+    - **bind4** call to bind(2) for an inet4 socket (since 4.17)
+    - **bind6** call to bind(2) for an inet6 socket (since 4.17)
+    - **post_bind4** return from bind(2) for an inet4 socket (since 4.17)
+    - **post_bind6** return from bind(2) for an inet6 socket (since 4.17)
+    - **connect4** call to connect(2) for an inet4 socket (since 4.17)
+    - **connect6** call to connect(2) for an inet6 socket (since 4.17)
+    - **connect_unix** call to connect(2) for a unix socket (since 6.7)
+    - **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp4 socket (since 4.18)
+    - **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected udp6 socket (since 4.18)
+    - **sendmsg_unix** call to sendto(2), sendmsg(2), sendmmsg(2) for an unconnected unix socket (since 6.7)
+    - **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp4 socket (since 5.2)
+    - **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected udp6 socket (since 5.2)
+    - **recvmsg_unix** call to recvfrom(2), recvmsg(2), recvmmsg(2) for an unconnected unix socket (since 6.7)
+    - **sysctl** sysctl access (since 5.2)
+    - **getsockopt** call to getsockopt (since 5.3)
+    - **setsockopt** call to setsockopt (since 5.3)
+    - **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8)
+    - **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8)
+    - **getpeername_unix** call to getpeername(2) for a unix socket (since 6.7)
+    - **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8)
+    - **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8)
+    - **getsockname_unix** call to getsockname(2) for a unix socket (since 6.7)
+    - **sock_release** closing a userspace inet socket (since 5.9)
+
+bpftool cgroup detach *CGROUP* *ATTACH_TYPE* *PROG*
+    Detach *PROG* from the cgroup *CGROUP* and attach type *ATTACH_TYPE*.
+
+bpftool prog help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
-	-f, --bpffs
-		  Show file names of pinned programs.
+-f, --bpffs
+    Show file names of pinned programs.
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index e44039f89be7..c7f837898bc7 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -14,77 +14,70 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **feature** *COMMAND*
+**bpftool** [*OPTIONS*] **feature** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| }
+*OPTIONS* := { |COMMON_OPTIONS| }
 
-	*COMMANDS* := { **probe** | **help** }
+*COMMANDS* := { **probe** | **help** }
 
 FEATURE COMMANDS
 ================
 
-|	**bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]]
-|	**bpftool** **feature list_builtins** *GROUP*
-|	**bpftool** **feature help**
+| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]]
+| **bpftool** **feature list_builtins** *GROUP*
+| **bpftool** **feature help**
 |
-|	*COMPONENT* := { **kernel** | **dev** *NAME* }
-|	*GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** }
+| *COMPONENT* := { **kernel** | **dev** *NAME* }
+| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** }
 
 DESCRIPTION
 ===========
-	**bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]]
-		  Probe the running kernel and dump a number of eBPF-related
-		  parameters, such as availability of the **bpf**\ () system call,
-		  JIT status, eBPF program types availability, eBPF helper
-		  functions availability, and more.
-
-		  By default, bpftool **does not run probes** for
-		  **bpf_probe_write_user**\ () and **bpf_trace_printk**\()
-		  helpers which print warnings to kernel logs. To enable them
-		  and run all probes, the **full** keyword should be used.
-
-		  If the **macros** keyword (but not the **-j** option) is
-		  passed, a subset of the output is dumped as a list of
-		  **#define** macros that are ready to be included in a C
-		  header file, for example. If, additionally, **prefix** is
-		  used to define a *PREFIX*, the provided string will be used
-		  as a prefix to the names of the macros: this can be used to
-		  avoid conflicts on macro names when including the output of
-		  this command as a header file.
-
-		  Keyword **kernel** can be omitted. If no probe target is
-		  specified, probing the kernel is the default behaviour.
-
-		  When the **unprivileged** keyword is used, bpftool will dump
-		  only the features available to a user who does not have the
-		  **CAP_SYS_ADMIN** capability set. The features available in
-		  that case usually represent a small subset of the parameters
-		  supported by the system. Unprivileged users MUST use the
-		  **unprivileged** keyword: This is to avoid misdetection if
-		  bpftool is inadvertently run as non-root, for example. This
-		  keyword is unavailable if bpftool was compiled without
-		  libcap.
-
-	**bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]]
-		  Probe network device for supported eBPF features and dump
-		  results to the console.
-
-		  The keywords **full**, **macros** and **prefix** have the
-		  same role as when probing the kernel.
-
-	**bpftool feature list_builtins** *GROUP*
-		  List items known to bpftool. These can be BPF program types
-		  (**prog_types**), BPF map types (**map_types**), attach types
-		  (**attach_types**), link types (**link_types**), or BPF helper
-		  functions (**helpers**). The command does not probe the system, but
-		  simply lists the elements that bpftool knows from compilation time,
-		  as provided from libbpf (for all object types) or from the BPF UAPI
-		  header (list of helpers). This can be used in scripts to iterate over
-		  BPF types or helpers.
-
-	**bpftool feature help**
-		  Print short help message.
+bpftool feature probe [kernel] [full] [macros [prefix *PREFIX*]]
+    Probe the running kernel and dump a number of eBPF-related parameters, such
+    as availability of the **bpf**\ () system call, JIT status, eBPF program
+    types availability, eBPF helper functions availability, and more.
+
+    By default, bpftool **does not run probes** for **bpf_probe_write_user**\
+    () and **bpf_trace_printk**\() helpers which print warnings to kernel logs.
+    To enable them and run all probes, the **full** keyword should be used.
+
+    If the **macros** keyword (but not the **-j** option) is passed, a subset
+    of the output is dumped as a list of **#define** macros that are ready to
+    be included in a C header file, for example. If, additionally, **prefix**
+    is used to define a *PREFIX*, the provided string will be used as a prefix
+    to the names of the macros: this can be used to avoid conflicts on macro
+    names when including the output of this command as a header file.
+
+    Keyword **kernel** can be omitted. If no probe target is specified, probing
+    the kernel is the default behaviour.
+
+    When the **unprivileged** keyword is used, bpftool will dump only the
+    features available to a user who does not have the **CAP_SYS_ADMIN**
+    capability set. The features available in that case usually represent a
+    small subset of the parameters supported by the system. Unprivileged users
+    MUST use the **unprivileged** keyword: This is to avoid misdetection if
+    bpftool is inadvertently run as non-root, for example. This keyword is
+    unavailable if bpftool was compiled without libcap.
+
+bpftool feature probe dev *NAME* [full] [macros [prefix *PREFIX*]]
+    Probe network device for supported eBPF features and dump results to the
+    console.
+
+    The keywords **full**, **macros** and **prefix** have the same role as when
+    probing the kernel.
+
+bpftool feature list_builtins *GROUP*
+    List items known to bpftool. These can be BPF program types
+    (**prog_types**), BPF map types (**map_types**), attach types
+    (**attach_types**), link types (**link_types**), or BPF helper functions
+    (**helpers**). The command does not probe the system, but simply lists the
+    elements that bpftool knows from compilation time, as provided from libbpf
+    (for all object types) or from the BPF UAPI header (list of helpers). This
+    can be used in scripts to iterate over BPF types or helpers.
+
+bpftool feature help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
index 5e60825818dd..c768e6d4ae09 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst
@@ -14,199 +14,177 @@ tool for BPF code-generation
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **gen** *COMMAND*
+**bpftool** [*OPTIONS*] **gen** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } }
+*OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } }
 
-	*COMMAND* := { **object** | **skeleton** | **help** }
+*COMMAND* := { **object** | **skeleton** | **help** }
 
 GEN COMMANDS
 =============
 
-|	**bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...]
-|	**bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*]
-|	**bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*]
-|	**bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...]
-|	**bpftool** **gen help**
+| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...]
+| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*]
+| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*]
+| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...]
+| **bpftool** **gen help**
 
 DESCRIPTION
 ===========
-	**bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...]
-		  Statically link (combine) together one or more *INPUT_FILE*'s
-		  into a single resulting *OUTPUT_FILE*. All the files involved
-		  are BPF ELF object files.
-
-		  The rules of BPF static linking are mostly the same as for
-		  user-space object files, but in addition to combining data
-		  and instruction sections, .BTF and .BTF.ext (if present in
-		  any of the input files) data are combined together. .BTF
-		  data is deduplicated, so all the common types across
-		  *INPUT_FILE*'s will only be represented once in the resulting
-		  BTF information.
-
-		  BPF static linking allows to partition BPF source code into
-		  individually compiled files that are then linked into
-		  a single resulting BPF object file, which can be used to
-		  generated BPF skeleton (with **gen skeleton** command) or
-		  passed directly into **libbpf** (using **bpf_object__open()**
-		  family of APIs).
-
-	**bpftool gen skeleton** *FILE*
-		  Generate BPF skeleton C header file for a given *FILE*.
-
-		  BPF skeleton is an alternative interface to existing libbpf
-		  APIs for working with BPF objects. Skeleton code is intended
-		  to significantly shorten and simplify code to load and work
-		  with BPF programs from userspace side. Generated code is
-		  tailored to specific input BPF object *FILE*, reflecting its
-		  structure by listing out available maps, program, variables,
-		  etc. Skeleton eliminates the need to lookup mentioned
-		  components by name. Instead, if skeleton instantiation
-		  succeeds, they are populated in skeleton structure as valid
-		  libbpf types (e.g., **struct bpf_map** pointer) and can be
-		  passed to existing generic libbpf APIs.
-
-		  In addition to simple and reliable access to maps and
-		  programs, skeleton provides a storage for BPF links (**struct
-		  bpf_link**) for each BPF program within BPF object. When
-		  requested, supported BPF programs will be automatically
-		  attached and resulting BPF links stored for further use by
-		  user in pre-allocated fields in skeleton struct. For BPF
-		  programs that can't be automatically attached by libbpf,
-		  user can attach them manually, but store resulting BPF link
-		  in per-program link field. All such set up links will be
-		  automatically destroyed on BPF skeleton destruction. This
-		  eliminates the need for users to manage links manually and
-		  rely on libbpf support to detach programs and free up
-		  resources.
-
-		  Another facility provided by BPF skeleton is an interface to
-		  global variables of all supported kinds: mutable, read-only,
-		  as well as extern ones. This interface allows to pre-setup
-		  initial values of variables before BPF object is loaded and
-		  verified by kernel. For non-read-only variables, the same
-		  interface can be used to fetch values of global variables on
-		  userspace side, even if they are modified by BPF code.
-
-		  During skeleton generation, contents of source BPF object
-		  *FILE* is embedded within generated code and is thus not
-		  necessary to keep around. This ensures skeleton and BPF
-		  object file are matching 1-to-1 and always stay in sync.
-		  Generated code is dual-licensed under LGPL-2.1 and
-		  BSD-2-Clause licenses.
-
-		  It is a design goal and guarantee that skeleton interfaces
-		  are interoperable with generic libbpf APIs. User should
-		  always be able to use skeleton API to create and load BPF
-		  object, and later use libbpf APIs to keep working with
-		  specific maps, programs, etc.
-
-		  As part of skeleton, few custom functions are generated.
-		  Each of them is prefixed with object name. Object name can
-		  either be derived from object file name, i.e., if BPF object
-		  file name is **example.o**, BPF object name will be
-		  **example**. Object name can be also specified explicitly
-		  through **name** *OBJECT_NAME* parameter. The following
-		  custom functions are provided (assuming **example** as
-		  the object name):
-
-		  - **example__open** and **example__open_opts**.
-		    These functions are used to instantiate skeleton. It
-		    corresponds to libbpf's **bpf_object__open**\ () API.
-		    **_opts** variants accepts extra **bpf_object_open_opts**
-		    options.
-
-		  - **example__load**.
-		    This function creates maps, loads and verifies BPF
-		    programs, initializes global data maps. It corresponds to
-		    libppf's **bpf_object__load**\ () API.
-
-		  - **example__open_and_load** combines **example__open** and
-		    **example__load** invocations in one commonly used
-		    operation.
-
-		  - **example__attach** and **example__detach**
-		    This pair of functions allow to attach and detach,
-		    correspondingly, already loaded BPF object. Only BPF
-		    programs of types supported by libbpf for auto-attachment
-		    will be auto-attached and their corresponding BPF links
-		    instantiated. For other BPF programs, user can manually
-		    create a BPF link and assign it to corresponding fields in
-		    skeleton struct. **example__detach** will detach both
-		    links created automatically, as well as those populated by
-		    user manually.
-
-		  - **example__destroy**
-		    Detach and unload BPF programs, free up all the resources
-		    used by skeleton and BPF object.
-
-		  If BPF object has global variables, corresponding structs
-		  with memory layout corresponding to global data data section
-		  layout will be created. Currently supported ones are: *.data*,
-		  *.bss*, *.rodata*, and *.kconfig* structs/data sections.
-		  These data sections/structs can be used to set up initial
-		  values of variables, if set before **example__load**.
-		  Afterwards, if target kernel supports memory-mapped BPF
-		  arrays, same structs can be used to fetch and update
-		  (non-read-only) data from userspace, with same simplicity
-		  as for BPF side.
-
-	**bpftool gen subskeleton** *FILE*
-		  Generate BPF subskeleton C header file for a given *FILE*.
-
-		  Subskeletons are similar to skeletons, except they do not own
-		  the corresponding maps, programs, or global variables. They
-		  require that the object file used to generate them is already
-		  loaded into a *bpf_object* by some other means.
-
-		  This functionality is useful when a library is included into a
-		  larger BPF program. A subskeleton for the library would have
-		  access to all objects and globals defined in it, without
-		  having to know about the larger program.
-
-		  Consequently, there are only two functions defined
-		  for subskeletons:
-
-		  - **example__open(bpf_object\*)**
-		    Instantiates a subskeleton from an already opened (but not
-		    necessarily loaded) **bpf_object**.
-
-		  - **example__destroy()**
-		    Frees the storage for the subskeleton but *does not* unload
-		    any BPF programs or maps.
-
-	**bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...]
-		  Generate a minimum BTF file as *OUTPUT*, derived from a given
-		  *INPUT* BTF file, containing all needed BTF types so one, or
-		  more, given eBPF objects CO-RE relocations may be satisfied.
-
-		  When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF,
-		  libbpf, when loading an eBPF object, has to rely on external
-		  BTF files to be able to calculate CO-RE relocations.
-
-		  Usually, an external BTF file is built from existing kernel
-		  DWARF data using pahole. It contains all the types used by
-		  its respective kernel image and, because of that, is big.
-
-		  The min_core_btf feature builds smaller BTF files, customized
-		  to one or multiple eBPF objects, so they can be distributed
-		  together with an eBPF CO-RE based application, turning the
-		  application portable to different kernel versions.
-
-		  Check examples bellow for more information how to use it.
-
-	**bpftool gen help**
-		  Print short help message.
+bpftool gen object *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...]
+    Statically link (combine) together one or more *INPUT_FILE*'s into a single
+    resulting *OUTPUT_FILE*. All the files involved are BPF ELF object files.
+
+    The rules of BPF static linking are mostly the same as for user-space
+    object files, but in addition to combining data and instruction sections,
+    .BTF and .BTF.ext (if present in any of the input files) data are combined
+    together. .BTF data is deduplicated, so all the common types across
+    *INPUT_FILE*'s will only be represented once in the resulting BTF
+    information.
+
+    BPF static linking allows to partition BPF source code into individually
+    compiled files that are then linked into a single resulting BPF object
+    file, which can be used to generated BPF skeleton (with **gen skeleton**
+    command) or passed directly into **libbpf** (using **bpf_object__open()**
+    family of APIs).
+
+bpftool gen skeleton *FILE*
+    Generate BPF skeleton C header file for a given *FILE*.
+
+    BPF skeleton is an alternative interface to existing libbpf APIs for
+    working with BPF objects. Skeleton code is intended to significantly
+    shorten and simplify code to load and work with BPF programs from userspace
+    side. Generated code is tailored to specific input BPF object *FILE*,
+    reflecting its structure by listing out available maps, program, variables,
+    etc. Skeleton eliminates the need to lookup mentioned components by name.
+    Instead, if skeleton instantiation succeeds, they are populated in skeleton
+    structure as valid libbpf types (e.g., **struct bpf_map** pointer) and can
+    be passed to existing generic libbpf APIs.
+
+    In addition to simple and reliable access to maps and programs, skeleton
+    provides a storage for BPF links (**struct bpf_link**) for each BPF program
+    within BPF object. When requested, supported BPF programs will be
+    automatically attached and resulting BPF links stored for further use by
+    user in pre-allocated fields in skeleton struct. For BPF programs that
+    can't be automatically attached by libbpf, user can attach them manually,
+    but store resulting BPF link in per-program link field. All such set up
+    links will be automatically destroyed on BPF skeleton destruction. This
+    eliminates the need for users to manage links manually and rely on libbpf
+    support to detach programs and free up resources.
+
+    Another facility provided by BPF skeleton is an interface to global
+    variables of all supported kinds: mutable, read-only, as well as extern
+    ones. This interface allows to pre-setup initial values of variables before
+    BPF object is loaded and verified by kernel. For non-read-only variables,
+    the same interface can be used to fetch values of global variables on
+    userspace side, even if they are modified by BPF code.
+
+    During skeleton generation, contents of source BPF object *FILE* is
+    embedded within generated code and is thus not necessary to keep around.
+    This ensures skeleton and BPF object file are matching 1-to-1 and always
+    stay in sync. Generated code is dual-licensed under LGPL-2.1 and
+    BSD-2-Clause licenses.
+
+    It is a design goal and guarantee that skeleton interfaces are
+    interoperable with generic libbpf APIs. User should always be able to use
+    skeleton API to create and load BPF object, and later use libbpf APIs to
+    keep working with specific maps, programs, etc.
+
+    As part of skeleton, few custom functions are generated. Each of them is
+    prefixed with object name. Object name can either be derived from object
+    file name, i.e., if BPF object file name is **example.o**, BPF object name
+    will be **example**. Object name can be also specified explicitly through
+    **name** *OBJECT_NAME* parameter. The following custom functions are
+    provided (assuming **example** as the object name):
+
+    - **example__open** and **example__open_opts**.
+      These functions are used to instantiate skeleton. It corresponds to
+      libbpf's **bpf_object__open**\ () API. **_opts** variants accepts extra
+      **bpf_object_open_opts** options.
+
+    - **example__load**.
+      This function creates maps, loads and verifies BPF programs, initializes
+      global data maps. It corresponds to libppf's **bpf_object__load**\ ()
+      API.
+
+    - **example__open_and_load** combines **example__open** and
+      **example__load** invocations in one commonly used operation.
+
+    - **example__attach** and **example__detach**.
+      This pair of functions allow to attach and detach, correspondingly,
+      already loaded BPF object. Only BPF programs of types supported by libbpf
+      for auto-attachment will be auto-attached and their corresponding BPF
+      links instantiated. For other BPF programs, user can manually create a
+      BPF link and assign it to corresponding fields in skeleton struct.
+      **example__detach** will detach both links created automatically, as well
+      as those populated by user manually.
+
+    - **example__destroy**.
+      Detach and unload BPF programs, free up all the resources used by
+      skeleton and BPF object.
+
+    If BPF object has global variables, corresponding structs with memory
+    layout corresponding to global data data section layout will be created.
+    Currently supported ones are: *.data*, *.bss*, *.rodata*, and *.kconfig*
+    structs/data sections. These data sections/structs can be used to set up
+    initial values of variables, if set before **example__load**. Afterwards,
+    if target kernel supports memory-mapped BPF arrays, same structs can be
+    used to fetch and update (non-read-only) data from userspace, with same
+    simplicity as for BPF side.
+
+bpftool gen subskeleton *FILE*
+    Generate BPF subskeleton C header file for a given *FILE*.
+
+    Subskeletons are similar to skeletons, except they do not own the
+    corresponding maps, programs, or global variables. They require that the
+    object file used to generate them is already loaded into a *bpf_object* by
+    some other means.
+
+    This functionality is useful when a library is included into a larger BPF
+    program. A subskeleton for the library would have access to all objects and
+    globals defined in it, without having to know about the larger program.
+
+    Consequently, there are only two functions defined for subskeletons:
+
+    - **example__open(bpf_object\*)**.
+      Instantiates a subskeleton from an already opened (but not necessarily
+      loaded) **bpf_object**.
+
+    - **example__destroy()**.
+      Frees the storage for the subskeleton but *does not* unload any BPF
+      programs or maps.
+
+bpftool gen min_core_btf *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...]
+    Generate a minimum BTF file as *OUTPUT*, derived from a given *INPUT* BTF
+    file, containing all needed BTF types so one, or more, given eBPF objects
+    CO-RE relocations may be satisfied.
+
+    When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF, libbpf, when
+    loading an eBPF object, has to rely on external BTF files to be able to
+    calculate CO-RE relocations.
+
+    Usually, an external BTF file is built from existing kernel DWARF data
+    using pahole. It contains all the types used by its respective kernel image
+    and, because of that, is big.
+
+    The min_core_btf feature builds smaller BTF files, customized to one or
+    multiple eBPF objects, so they can be distributed together with an eBPF
+    CO-RE based application, turning the application portable to different
+    kernel versions.
+
+    Check examples bellow for more information how to use it.
+
+bpftool gen help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
-	-L, --use-loader
-		  For skeletons, generate a "light" skeleton (also known as "loader"
-		  skeleton). A light skeleton contains a loader eBPF program. It does
-		  not use the majority of the libbpf infrastructure, and does not need
-		  libelf.
+-L, --use-loader
+    For skeletons, generate a "light" skeleton (also known as "loader"
+    skeleton). A light skeleton contains a loader eBPF program. It does not use
+    the majority of the libbpf infrastructure, and does not need libelf.
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
index 84839d488621..2e5d81c906dc 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst
@@ -14,50 +14,46 @@ tool to create BPF iterators
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **iter** *COMMAND*
+**bpftool** [*OPTIONS*] **iter** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| }
+*OPTIONS* := { |COMMON_OPTIONS| }
 
-	*COMMANDS* := { **pin** | **help** }
+*COMMANDS* := { **pin** | **help** }
 
 ITER COMMANDS
-===================
+=============
 
-|	**bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*]
-|	**bpftool** **iter help**
+| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*]
+| **bpftool** **iter help**
 |
-|	*OBJ* := /a/file/of/bpf_iter_target.o
-|	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
+| *OBJ* := /a/file/of/bpf_iter_target.o
+| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
 
 DESCRIPTION
 ===========
-	**bpftool iter pin** *OBJ* *PATH* [**map** *MAP*]
-		  A bpf iterator combines a kernel iterating of
-		  particular kernel data (e.g., tasks, bpf_maps, etc.)
-		  and a bpf program called for each kernel data object
-		  (e.g., one task, one bpf_map, etc.). User space can
-		  *read* kernel iterator output through *read()* syscall.
-
-		  The *pin* command creates a bpf iterator from *OBJ*,
-		  and pin it to *PATH*. The *PATH* should be located
-		  in *bpffs* mount. It must not contain a dot
-		  character ('.'), which is reserved for future extensions
-		  of *bpffs*.
-
-		  Map element bpf iterator requires an additional parameter
-		  *MAP* so bpf program can iterate over map elements for
-		  that map. User can have a bpf program in kernel to run
-		  with each map element, do checking, filtering, aggregation,
-		  etc. without copying data to user space.
-
-		  User can then *cat PATH* to see the bpf iterator output.
-
-	**bpftool iter help**
-		  Print short help message.
+bpftool iter pin *OBJ* *PATH* [map *MAP*]
+    A bpf iterator combines a kernel iterating of particular kernel data (e.g.,
+    tasks, bpf_maps, etc.) and a bpf program called for each kernel data object
+    (e.g., one task, one bpf_map, etc.). User space can *read* kernel iterator
+    output through *read()* syscall.
+
+    The *pin* command creates a bpf iterator from *OBJ*, and pin it to *PATH*.
+    The *PATH* should be located in *bpffs* mount. It must not contain a dot
+    character ('.'), which is reserved for future extensions of *bpffs*.
+
+    Map element bpf iterator requires an additional parameter *MAP* so bpf
+    program can iterate over map elements for that map. User can have a bpf
+    program in kernel to run with each map element, do checking, filtering,
+    aggregation, etc. without copying data to user space.
+
+    User can then *cat PATH* to see the bpf iterator output.
+
+bpftool iter help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst
index 52a4eee4af54..6f09d4405ed8 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-link.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst
@@ -14,67 +14,62 @@ tool for inspection and simple manipulation of eBPF links
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **link** *COMMAND*
+**bpftool** [*OPTIONS*] **link** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } }
+*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } }
 
-	*COMMANDS* := { **show** | **list** | **pin** | **help** }
+*COMMANDS* := { **show** | **list** | **pin** | **help** }
 
 LINK COMMANDS
 =============
 
-|	**bpftool** **link { show | list }** [*LINK*]
-|	**bpftool** **link pin** *LINK* *FILE*
-|	**bpftool** **link detach** *LINK*
-|	**bpftool** **link help**
+| **bpftool** **link { show | list }** [*LINK*]
+| **bpftool** **link pin** *LINK* *FILE*
+| **bpftool** **link detach** *LINK*
+| **bpftool** **link help**
 |
-|	*LINK* := { **id** *LINK_ID* | **pinned** *FILE* }
+| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* }
 
 
 DESCRIPTION
 ===========
-	**bpftool link { show | list }** [*LINK*]
-		  Show information about active links. If *LINK* is
-		  specified show information only about given link,
-		  otherwise list all links currently active on the system.
+bpftool link { show | list } [*LINK*]
+    Show information about active links. If *LINK* is specified show
+    information only about given link, otherwise list all links currently
+    active on the system.
 
-		  Output will start with link ID followed by link type and
-		  zero or more named attributes, some of which depend on type
-		  of link.
+    Output will start with link ID followed by link type and zero or more named
+    attributes, some of which depend on type of link.
 
-		  Since Linux 5.8 bpftool is able to discover information about
-		  processes that hold open file descriptors (FDs) against BPF
-		  links. On such kernels bpftool will automatically emit this
-		  information as well.
+    Since Linux 5.8 bpftool is able to discover information about processes
+    that hold open file descriptors (FDs) against BPF links. On such kernels
+    bpftool will automatically emit this information as well.
 
-	**bpftool link pin** *LINK* *FILE*
-		  Pin link *LINK* as *FILE*.
+bpftool link pin *LINK* *FILE*
+    Pin link *LINK* as *FILE*.
 
-		  Note: *FILE* must be located in *bpffs* mount. It must not
-		  contain a dot character ('.'), which is reserved for future
-		  extensions of *bpffs*.
+    Note: *FILE* must be located in *bpffs* mount. It must not contain a dot
+    character ('.'), which is reserved for future extensions of *bpffs*.
 
-	**bpftool link detach** *LINK*
-		  Force-detach link *LINK*. BPF link and its underlying BPF
-		  program will stay valid, but they will be detached from the
-		  respective BPF hook and BPF link will transition into
-		  a defunct state until last open file descriptor for that
-		  link is closed.
+bpftool link detach *LINK*
+    Force-detach link *LINK*. BPF link and its underlying BPF program will stay
+    valid, but they will be detached from the respective BPF hook and BPF link
+    will transition into a defunct state until last open file descriptor for
+    that link is closed.
 
-	**bpftool link help**
-		  Print short help message.
+bpftool link help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+    .. include:: common_options.rst
 
-	-f, --bpffs
-		  When showing BPF links, show file names of pinned
-		  links.
+    -f, --bpffs
+        When showing BPF links, show file names of pinned links.
 
-	-n, --nomount
-		  Do not automatically attempt to mount any virtual file system
-		  (such as tracefs or BPF virtual file system) when necessary.
+    -n, --nomount
+        Do not automatically attempt to mount any virtual file system (such as
+        tracefs or BPF virtual file system) when necessary.
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 9d6a314dfd7a..252e4c538edb 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -14,166 +14,160 @@ tool for inspection and simple manipulation of eBPF maps
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **map** *COMMAND*
+**bpftool** [*OPTIONS*] **map** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } }
+*OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } }
 
-	*COMMANDS* :=
-	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
-	**delete** | **pin** | **help** }
+*COMMANDS* :=
+{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
+**delete** | **pin** | **help** }
 
 MAP COMMANDS
 =============
 
-|	**bpftool** **map** { **show** | **list** }   [*MAP*]
-|	**bpftool** **map create**     *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \
-|		**entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \
-|		[**offload_dev** *NAME*]
-|	**bpftool** **map dump**       *MAP*
-|	**bpftool** **map update**     *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*]
-|	**bpftool** **map lookup**     *MAP* [**key** *DATA*]
-|	**bpftool** **map getnext**    *MAP* [**key** *DATA*]
-|	**bpftool** **map delete**     *MAP*  **key** *DATA*
-|	**bpftool** **map pin**        *MAP*  *FILE*
-|	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
-|	**bpftool** **map peek**       *MAP*
-|	**bpftool** **map push**       *MAP* **value** *VALUE*
-|	**bpftool** **map pop**        *MAP*
-|	**bpftool** **map enqueue**    *MAP* **value** *VALUE*
-|	**bpftool** **map dequeue**    *MAP*
-|	**bpftool** **map freeze**     *MAP*
-|	**bpftool** **map help**
+| **bpftool** **map** { **show** | **list** }   [*MAP*]
+| **bpftool** **map create**     *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \
+|     **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \
+|     [**offload_dev** *NAME*]
+| **bpftool** **map dump**       *MAP*
+| **bpftool** **map update**     *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*]
+| **bpftool** **map lookup**     *MAP* [**key** *DATA*]
+| **bpftool** **map getnext**    *MAP* [**key** *DATA*]
+| **bpftool** **map delete**     *MAP*  **key** *DATA*
+| **bpftool** **map pin**        *MAP*  *FILE*
+| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
+| **bpftool** **map peek**       *MAP*
+| **bpftool** **map push**       *MAP* **value** *VALUE*
+| **bpftool** **map pop**        *MAP*
+| **bpftool** **map enqueue**    *MAP* **value** *VALUE*
+| **bpftool** **map dequeue**    *MAP*
+| **bpftool** **map freeze**     *MAP*
+| **bpftool** **map help**
 |
-|	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* }
-|	*DATA* := { [**hex**] *BYTES* }
-|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
-|	*VALUE* := { *DATA* | *MAP* | *PROG* }
-|	*UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
-|	*TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash**
-|		| **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash**
-|		| **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
-|		| **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
-|		| **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
-|		| **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
-|		| **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** }
+| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* }
+| *DATA* := { [**hex**] *BYTES* }
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
+| *VALUE* := { *DATA* | *MAP* | *PROG* }
+| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
+| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash**
+|     | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash**
+|     | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
+|     | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
+|     | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
+|     | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
+|     | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** }
 
 DESCRIPTION
 ===========
-	**bpftool map { show | list }**   [*MAP*]
-		  Show information about loaded maps.  If *MAP* is specified
-		  show information only about given maps, otherwise list all
-		  maps currently loaded on the system.  In case of **name**,
-		  *MAP* may match several maps which will all be shown.
+bpftool map { show | list }   [*MAP*]
+    Show information about loaded maps.  If *MAP* is specified show information
+    only about given maps, otherwise list all maps currently loaded on the
+    system.  In case of **name**, *MAP* may match several maps which will all
+    be shown.
 
-		  Output will start with map ID followed by map type and
-		  zero or more named attributes (depending on kernel version).
+    Output will start with map ID followed by map type and zero or more named
+    attributes (depending on kernel version).
 
-		  Since Linux 5.8 bpftool is able to discover information about
-		  processes that hold open file descriptors (FDs) against BPF
-		  maps. On such kernels bpftool will automatically emit this
-		  information as well.
+    Since Linux 5.8 bpftool is able to discover information about processes
+    that hold open file descriptors (FDs) against BPF maps. On such kernels
+    bpftool will automatically emit this information as well.
 
-	**bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE*  **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*]
-		  Create a new map with given parameters and pin it to *bpffs*
-		  as *FILE*.
+bpftool map create *FILE* type *TYPE* key *KEY_SIZE* value *VALUE_SIZE*  entries *MAX_ENTRIES* name *NAME* [flags *FLAGS*] [inner_map *MAP*] [offload_dev *NAME*]
+    Create a new map with given parameters and pin it to *bpffs* as *FILE*.
 
-		  *FLAGS* should be an integer which is the combination of
-		  desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h
-		  UAPI header for existing flags).
+    *FLAGS* should be an integer which is the combination of desired flags,
+    e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing
+    flags).
 
-		  To create maps of type array-of-maps or hash-of-maps, the
-		  **inner_map** keyword must be used to pass an inner map. The
-		  kernel needs it to collect metadata related to the inner maps
-		  that the new map will work with.
+    To create maps of type array-of-maps or hash-of-maps, the **inner_map**
+    keyword must be used to pass an inner map. The kernel needs it to collect
+    metadata related to the inner maps that the new map will work with.
 
-		  Keyword **offload_dev** expects a network interface name,
-		  and is used to request hardware offload for the map.
+    Keyword **offload_dev** expects a network interface name, and is used to
+    request hardware offload for the map.
 
-	**bpftool map dump**    *MAP*
-		  Dump all entries in a given *MAP*.  In case of **name**,
-		  *MAP* may match several maps which will all be dumped.
+bpftool map dump    *MAP*
+    Dump all entries in a given *MAP*.  In case of **name**, *MAP* may match
+    several maps which will all be dumped.
 
-	**bpftool map update**  *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*]
-		  Update map entry for a given *KEY*.
+bpftool map update  *MAP* [key *DATA*] [value *VALUE*] [*UPDATE_FLAGS*]
+    Update map entry for a given *KEY*.
 
-		  *UPDATE_FLAGS* can be one of: **any** update existing entry
-		  or add if doesn't exit; **exist** update only if entry already
-		  exists; **noexist** update only if entry doesn't exist.
+    *UPDATE_FLAGS* can be one of: **any** update existing entry or add if
+    doesn't exit; **exist** update only if entry already exists; **noexist**
+    update only if entry doesn't exist.
 
-		  If the **hex** keyword is provided in front of the bytes
-		  sequence, the bytes are parsed as hexadecimal values, even if
-		  no "0x" prefix is added. If the keyword is not provided, then
-		  the bytes are parsed as decimal values, unless a "0x" prefix
-		  (for hexadecimal) or a "0" prefix (for octal) is provided.
+    If the **hex** keyword is provided in front of the bytes sequence, the
+    bytes are parsed as hexadecimal values, even if no "0x" prefix is added. If
+    the keyword is not provided, then the bytes are parsed as decimal values,
+    unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is
+    provided.
 
-	**bpftool map lookup**  *MAP* [**key** *DATA*]
-		  Lookup **key** in the map.
+bpftool map lookup  *MAP* [key *DATA*]
+    Lookup **key** in the map.
 
-	**bpftool map getnext** *MAP* [**key** *DATA*]
-		  Get next key.  If *key* is not specified, get first key.
+bpftool map getnext *MAP* [key *DATA*]
+    Get next key.  If *key* is not specified, get first key.
 
-	**bpftool map delete**  *MAP*  **key** *DATA*
-		  Remove entry from the map.
+bpftool map delete  *MAP*  key *DATA*
+    Remove entry from the map.
 
-	**bpftool map pin**     *MAP*  *FILE*
-		  Pin map *MAP* as *FILE*.
+bpftool map pin     *MAP*  *FILE*
+    Pin map *MAP* as *FILE*.
 
-		  Note: *FILE* must be located in *bpffs* mount. It must not
-		  contain a dot character ('.'), which is reserved for future
-		  extensions of *bpffs*.
+    Note: *FILE* must be located in *bpffs* mount. It must not contain a dot
+    character ('.'), which is reserved for future extensions of *bpffs*.
 
-	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
-		  Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map.
+bpftool map event_pipe *MAP* [cpu *N* index *M*]
+    Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map.
 
-		  Install perf rings into a perf event array map and dump
-		  output of any **bpf_perf_event_output**\ () call in the kernel.
-		  By default read the number of CPUs on the system and
-		  install perf ring for each CPU in the corresponding index
-		  in the array.
+    Install perf rings into a perf event array map and dump output of any
+    **bpf_perf_event_output**\ () call in the kernel. By default read the
+    number of CPUs on the system and install perf ring for each CPU in the
+    corresponding index in the array.
 
-		  If **cpu** and **index** are specified, install perf ring
-		  for given **cpu** at **index** in the array (single ring).
+    If **cpu** and **index** are specified, install perf ring for given **cpu**
+    at **index** in the array (single ring).
 
-		  Note that installing a perf ring into an array will silently
-		  replace any existing ring.  Any other application will stop
-		  receiving events if it installed its rings earlier.
+    Note that installing a perf ring into an array will silently replace any
+    existing ring.  Any other application will stop receiving events if it
+    installed its rings earlier.
 
-	**bpftool map peek**  *MAP*
-		  Peek next value in the queue or stack.
+bpftool map peek  *MAP*
+    Peek next value in the queue or stack.
 
-	**bpftool map push**  *MAP* **value** *VALUE*
-		  Push *VALUE* onto the stack.
+bpftool map push  *MAP* value *VALUE*
+    Push *VALUE* onto the stack.
 
-	**bpftool map pop**  *MAP*
-		  Pop and print value from the stack.
+bpftool map pop  *MAP*
+    Pop and print value from the stack.
 
-	**bpftool map enqueue**  *MAP* **value** *VALUE*
-		  Enqueue *VALUE* into the queue.
+bpftool map enqueue  *MAP* value *VALUE*
+    Enqueue *VALUE* into the queue.
 
-	**bpftool map dequeue**  *MAP*
-		  Dequeue and print value from the queue.
+bpftool map dequeue  *MAP*
+    Dequeue and print value from the queue.
 
-	**bpftool map freeze**  *MAP*
-		  Freeze the map as read-only from user space. Entries from a
-		  frozen map can not longer be updated or deleted with the
-		  **bpf**\ () system call. This operation is not reversible,
-		  and the map remains immutable from user space until its
-		  destruction. However, read and write permissions for BPF
-		  programs to the map remain unchanged.
+bpftool map freeze  *MAP*
+    Freeze the map as read-only from user space. Entries from a frozen map can
+    not longer be updated or deleted with the **bpf**\ () system call. This
+    operation is not reversible, and the map remains immutable from user space
+    until its destruction. However, read and write permissions for BPF programs
+    to the map remain unchanged.
 
-	**bpftool map help**
-		  Print short help message.
+bpftool map help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
-	-f, --bpffs
-		  Show file names of pinned maps.
+-f, --bpffs
+    Show file names of pinned maps.
 
-	-n, --nomount
-		  Do not automatically attempt to mount any virtual file system
-		  (such as tracefs or BPF virtual file system) when necessary.
+-n, --nomount
+    Do not automatically attempt to mount any virtual file system (such as
+    tracefs or BPF virtual file system) when necessary.
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index dd3f9469765b..348812881297 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -14,76 +14,74 @@ tool for inspection of networking related bpf prog attachments
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **net** *COMMAND*
+**bpftool** [*OPTIONS*] **net** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| }
+*OPTIONS* := { |COMMON_OPTIONS| }
 
-	*COMMANDS* :=
-	{ **show** | **list** | **attach** | **detach** | **help** }
+*COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
 NET COMMANDS
 ============
 
-|	**bpftool** **net** { **show** | **list** } [ **dev** *NAME* ]
-|	**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
-|	**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
-|	**bpftool** **net help**
+| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ]
+| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
+| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
+| **bpftool** **net help**
 |
-|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
-|	*ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** }
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
+| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** }
 
 DESCRIPTION
 ===========
-	**bpftool net { show | list }** [ **dev** *NAME* ]
-		  List bpf program attachments in the kernel networking subsystem.
-
-		  Currently, device driver xdp attachments, tcx, netkit and old-style tc
-		  classifier/action attachments, flow_dissector as well as netfilter
-		  attachments are implemented, i.e., for
-		  program types **BPF_PROG_TYPE_XDP**, **BPF_PROG_TYPE_SCHED_CLS**,
-		  **BPF_PROG_TYPE_SCHED_ACT**, **BPF_PROG_TYPE_FLOW_DISSECTOR**,
-		  **BPF_PROG_TYPE_NETFILTER**.
-
-		  For programs attached to a particular cgroup, e.g.,
-		  **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
-		  **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
-		  users can use **bpftool cgroup** to dump cgroup attachments.
-		  For sk_{filter, skb, msg, reuseport} and lwt/seg6
-		  bpf programs, users should consult other tools, e.g., iproute2.
-
-		  The current output will start with all xdp program attachments, followed by
-		  all tcx, netkit, then tc class/qdisc bpf program attachments, then flow_dissector
-		  and finally netfilter programs. Both xdp programs and tcx/netkit/tc programs are
-		  ordered based on ifindex number. If multiple bpf programs attached
-		  to the same networking device through **tc**, the order will be first
-		  all bpf programs attached to tcx, netkit, then tc classes, then all bpf programs
-		  attached to non clsact qdiscs, and finally all bpf programs attached
-		  to root and clsact qdisc.
-
-	**bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
-		  Attach bpf program *PROG* to network interface *NAME* with
-		  type specified by *ATTACH_TYPE*. Previously attached bpf program
-		  can be replaced by the command used with **overwrite** option.
-		  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
-
-		  *ATTACH_TYPE* can be of:
-		  **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
-		  **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb;
-		  **xdpdrv** - Native XDP. runs earliest point in driver's receive path;
-		  **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception;
-
-	**bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
-		  Detach bpf program attached to network interface *NAME* with
-		  type specified by *ATTACH_TYPE*. To detach bpf program, same
-		  *ATTACH_TYPE* previously used for attach must be specified.
-		  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
-
-	**bpftool net help**
-		  Print short help message.
+bpftool net { show | list } [ dev *NAME* ]
+    List bpf program attachments in the kernel networking subsystem.
+
+    Currently, device driver xdp attachments, tcx, netkit and old-style tc
+    classifier/action attachments, flow_dissector as well as netfilter
+    attachments are implemented, i.e., for program types **BPF_PROG_TYPE_XDP**,
+    **BPF_PROG_TYPE_SCHED_CLS**, **BPF_PROG_TYPE_SCHED_ACT**,
+    **BPF_PROG_TYPE_FLOW_DISSECTOR**, **BPF_PROG_TYPE_NETFILTER**.
+
+    For programs attached to a particular cgroup, e.g.,
+    **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**,
+    **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, users
+    can use **bpftool cgroup** to dump cgroup attachments. For sk_{filter, skb,
+    msg, reuseport} and lwt/seg6 bpf programs, users should consult other
+    tools, e.g., iproute2.
+
+    The current output will start with all xdp program attachments, followed by
+    all tcx, netkit, then tc class/qdisc bpf program attachments, then
+    flow_dissector and finally netfilter programs. Both xdp programs and
+    tcx/netkit/tc programs are ordered based on ifindex number. If multiple bpf
+    programs attached to the same networking device through **tc**, the order
+    will be first all bpf programs attached to tcx, netkit, then tc classes,
+    then all bpf programs attached to non clsact qdiscs, and finally all bpf
+    programs attached to root and clsact qdisc.
+
+bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ]
+    Attach bpf program *PROG* to network interface *NAME* with type specified
+    by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the
+    command used with **overwrite** option. Currently, only XDP-related modes
+    are supported for *ATTACH_TYPE*.
+
+    *ATTACH_TYPE* can be of:
+    **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
+    **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb;
+    **xdpdrv** - Native XDP. runs earliest point in driver's receive path;
+    **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception;
+
+bpftool net detach *ATTACH_TYPE* dev *NAME*
+    Detach bpf program attached to network interface *NAME* with type specified
+    by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used
+    for attach must be specified. Currently, only XDP-related modes are
+    supported for *ATTACH_TYPE*.
+
+bpftool net help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index 5fea633a82f1..8c1ae55be596 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -14,37 +14,37 @@ tool for inspection of perf related bpf prog attachments
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **perf** *COMMAND*
+**bpftool** [*OPTIONS*] **perf** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| }
+*OPTIONS* := { |COMMON_OPTIONS| }
 
-	*COMMANDS* :=
-	{ **show** | **list** | **help** }
+*COMMANDS* :=
+{ **show** | **list** | **help** }
 
 PERF COMMANDS
 =============
 
-|	**bpftool** **perf** { **show** | **list** }
-|	**bpftool** **perf help**
+| **bpftool** **perf** { **show** | **list** }
+| **bpftool** **perf help**
 
 DESCRIPTION
 ===========
-	**bpftool perf { show | list }**
-		  List all raw_tracepoint, tracepoint, kprobe attachment in the system.
+bpftool perf { show | list }
+    List all raw_tracepoint, tracepoint, kprobe attachment in the system.
 
-		  Output will start with process id and file descriptor in that process,
-		  followed by bpf program id, attachment information, and attachment point.
-		  The attachment point for raw_tracepoint/tracepoint is the trace probe name.
-		  The attachment point for k[ret]probe is either symbol name and offset,
-		  or a kernel virtual address.
-		  The attachment point for u[ret]probe is the file name and the file offset.
+    Output will start with process id and file descriptor in that process,
+    followed by bpf program id, attachment information, and attachment point.
+    The attachment point for raw_tracepoint/tracepoint is the trace probe name.
+    The attachment point for k[ret]probe is either symbol name and offset, or a
+    kernel virtual address. The attachment point for u[ret]probe is the file
+    name and the file offset.
 
-	**bpftool perf help**
-		  Print short help message.
+bpftool perf help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 58e6a5b10ef7..d6304e01afe0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -14,250 +14,226 @@ tool for inspection and simple manipulation of eBPF progs
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **prog** *COMMAND*
+**bpftool** [*OPTIONS*] **prog** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| |
-	{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } |
-	{ **-L** | **--use-loader** } }
+*OPTIONS* := { |COMMON_OPTIONS| |
+{ **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } |
+{ **-L** | **--use-loader** } }
 
-	*COMMANDS* :=
-	{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** |
-	**loadall** | **help** }
+*COMMANDS* :=
+{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** |
+**loadall** | **help** }
 
 PROG COMMANDS
 =============
 
-|	**bpftool** **prog** { **show** | **list** } [*PROG*]
-|	**bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }]
-|	**bpftool** **prog dump jited**  *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }]
-|	**bpftool** **prog pin** *PROG* *FILE*
-|	**bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**]
-|	**bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
-|	**bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
-|	**bpftool** **prog tracelog**
-|	**bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*]
-|	**bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs*
-|	**bpftool** **prog help**
+| **bpftool** **prog** { **show** | **list** } [*PROG*]
+| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }]
+| **bpftool** **prog dump jited**  *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }]
+| **bpftool** **prog pin** *PROG* *FILE*
+| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**]
+| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
+| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
+| **bpftool** **prog tracelog**
+| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*]
+| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs*
+| **bpftool** **prog help**
 |
-|	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
-|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
-|	*TYPE* := {
-|		**socket** | **kprobe** | **kretprobe** | **classifier** | **action** |
-|		**tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** |
-|		**cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** |
-|		**lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
-|		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
-|		**cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** |
-|		**cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** |
-|		**cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** |
-|		**cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** |
-|		**cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** |
-|		**cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** |
-|		**struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup**
-|	}
-|	*ATTACH_TYPE* := {
-|		**sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** |
-|		**sk_skb_stream_parser** | **flow_dissector**
-|	}
-|	*METRICs* := {
-|		**cycles** | **instructions** | **l1d_loads** | **llc_misses** |
-|		**itlb_misses** | **dtlb_misses**
-|	}
+| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* }
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* }
+| *TYPE* := {
+|     **socket** | **kprobe** | **kretprobe** | **classifier** | **action** |
+|     **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** |
+|     **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** |
+|     **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
+|     **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
+|     **cgroup/connect4** | **cgroup/connect6** | **cgroup/connect_unix** |
+|     **cgroup/getpeername4** | **cgroup/getpeername6** | **cgroup/getpeername_unix** |
+|     **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/getsockname_unix** |
+|     **cgroup/sendmsg4** | **cgroup/sendmsg6** | **cgroup/sendmsg_unix** |
+|     **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/recvmsg_unix** | **cgroup/sysctl** |
+|     **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** |
+|     **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup**
+| }
+| *ATTACH_TYPE* := {
+|     **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** |
+|     **sk_skb_stream_parser** | **flow_dissector**
+| }
+| *METRICs* := {
+|     **cycles** | **instructions** | **l1d_loads** | **llc_misses** |
+|     **itlb_misses** | **dtlb_misses**
+| }
 
 
 DESCRIPTION
 ===========
-	**bpftool prog { show | list }** [*PROG*]
-		  Show information about loaded programs.  If *PROG* is
-		  specified show information only about given programs,
-		  otherwise list all programs currently loaded on the system.
-		  In case of **tag** or **name**, *PROG* may match several
-		  programs which will all be shown.
-
-		  Output will start with program ID followed by program type and
-		  zero or more named attributes (depending on kernel version).
-
-		  Since Linux 5.1 the kernel can collect statistics on BPF
-		  programs (such as the total time spent running the program,
-		  and the number of times it was run). If available, bpftool
-		  shows such statistics. However, the kernel does not collect
-		  them by defaults, as it slightly impacts performance on each
-		  program run. Activation or deactivation of the feature is
-		  performed via the **kernel.bpf_stats_enabled** sysctl knob.
-
-		  Since Linux 5.8 bpftool is able to discover information about
-		  processes that hold open file descriptors (FDs) against BPF
-		  programs. On such kernels bpftool will automatically emit this
-		  information as well.
-
-	**bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }]
-		  Dump eBPF instructions of the programs from the kernel. By
-		  default, eBPF will be disassembled and printed to standard
-		  output in human-readable format. In this case, **opcodes**
-		  controls if raw opcodes should be printed as well.
-
-		  In case of **tag** or **name**, *PROG* may match several
-		  programs which will all be dumped.  However, if **file** or
-		  **visual** is specified, *PROG* must match a single program.
-
-		  If **file** is specified, the binary image will instead be
-		  written to *FILE*.
-
-		  If **visual** is specified, control flow graph (CFG) will be
-		  built instead, and eBPF instructions will be presented with
-		  CFG in DOT format, on standard output.
-
-		  If the programs have line_info available, the source line will
-		  be displayed.  If **linum** is specified, the filename, line
-		  number and line column will also be displayed.
-
-	**bpftool prog dump jited**  *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }]
-		  Dump jited image (host machine code) of the program.
-
-		  If *FILE* is specified image will be written to a file,
-		  otherwise it will be disassembled and printed to stdout.
-		  *PROG* must match a single program when **file** is specified.
-
-		  **opcodes** controls if raw opcodes will be printed.
-
-		  If the prog has line_info available, the source line will
-		  be displayed.  If **linum** is specified, the filename, line
-		  number and line column will also be displayed.
-
-	**bpftool prog pin** *PROG* *FILE*
-		  Pin program *PROG* as *FILE*.
-
-		  Note: *FILE* must be located in *bpffs* mount. It must not
-		  contain a dot character ('.'), which is reserved for future
-		  extensions of *bpffs*.
-
-	**bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**]
-		  Load bpf program(s) from binary *OBJ* and pin as *PATH*.
-		  **bpftool prog load** pins only the first program from the
-		  *OBJ* as *PATH*. **bpftool prog loadall** pins all programs
-		  from the *OBJ* under *PATH* directory.
-		  **type** is optional, if not specified program type will be
-		  inferred from section names.
-		  By default bpftool will create new maps as declared in the ELF
-		  object being loaded.  **map** parameter allows for the reuse
-		  of existing maps.  It can be specified multiple times, each
-		  time for a different map.  *IDX* refers to index of the map
-		  to be replaced in the ELF file counting from 0, while *NAME*
-		  allows to replace a map by name.  *MAP* specifies the map to
-		  use, referring to it by **id** or through a **pinned** file.
-		  If **offload_dev** *NAME* is specified program will be loaded
-		  onto given networking device (offload).
-		  If **xdpmeta_dev** *NAME* is specified program will become
-		  device-bound without offloading, this facilitates access
-		  to XDP metadata.
-		  Optional **pinmaps** argument can be provided to pin all
-		  maps under *MAP_DIR* directory.
-
-		  If **autoattach** is specified program will be attached
-		  before pin. In that case, only the link (representing the
-		  program attached to its hook) is pinned, not the program as
-		  such, so the path won't show in **bpftool prog show -f**,
-		  only show in **bpftool link show -f**. Also, this only works
-		  when bpftool (libbpf) is able to infer all necessary
-		  information from the object file, in particular, it's not
-		  supported for all program types. If a program does not
-		  support autoattach, bpftool falls back to regular pinning
-		  for that program instead.
-
-		  Note: *PATH* must be located in *bpffs* mount. It must not
-		  contain a dot character ('.'), which is reserved for future
-		  extensions of *bpffs*.
-
-	**bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
-		  Attach bpf program *PROG* (with type specified by
-		  *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP*
-		  parameter, with the exception of *flow_dissector* which is
-		  attached to current networking name space.
-
-	**bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
-		  Detach bpf program *PROG* (with type specified by
-		  *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP*
-		  parameter, with the exception of *flow_dissector* which is
-		  detached from the current networking name space.
-
-	**bpftool prog tracelog**
-		  Dump the trace pipe of the system to the console (stdout).
-		  Hit <Ctrl+C> to stop printing. BPF programs can write to this
-		  trace pipe at runtime with the **bpf_trace_printk**\ () helper.
-		  This should be used only for debugging purposes. For
-		  streaming data from BPF programs to user space, one can use
-		  perf events (see also **bpftool-map**\ (8)).
-
-	**bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*]
-		  Run BPF program *PROG* in the kernel testing infrastructure
-		  for BPF, meaning that the program works on the data and
-		  context provided by the user, and not on actual packets or
-		  monitored functions etc. Return value and duration for the
-		  test run are printed out to the console.
-
-		  Input data is read from the *FILE* passed with **data_in**.
-		  If this *FILE* is "**-**", input data is read from standard
-		  input. Input context, if any, is read from *FILE* passed with
-		  **ctx_in**. Again, "**-**" can be used to read from standard
-		  input, but only if standard input is not already in use for
-		  input data. If a *FILE* is passed with **data_out**, output
-		  data is written to that file. Similarly, output context is
-		  written to the *FILE* passed with **ctx_out**. For both
-		  output flows, "**-**" can be used to print to the standard
-		  output (as plain text, or JSON if relevant option was
-		  passed). If output keywords are omitted, output data and
-		  context are discarded. Keywords **data_size_out** and
-		  **ctx_size_out** are used to pass the size (in bytes) for the
-		  output buffers to the kernel, although the default of 32 kB
-		  should be more than enough for most cases.
-
-		  Keyword **repeat** is used to indicate the number of
-		  consecutive runs to perform. Note that output data and
-		  context printed to files correspond to the last of those
-		  runs. The duration printed out at the end of the runs is an
-		  average over all runs performed by the command.
-
-		  Not all program types support test run. Among those which do,
-		  not all of them can take the **ctx_in**/**ctx_out**
-		  arguments. bpftool does not perform checks on program types.
-
-	**bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs*
-		  Profile *METRICs* for bpf program *PROG* for *DURATION*
-		  seconds or until user hits <Ctrl+C>. *DURATION* is optional.
-		  If *DURATION* is not specified, the profiling will run up to
-		  **UINT_MAX** seconds.
-
-	**bpftool prog help**
-		  Print short help message.
+bpftool prog { show | list } [*PROG*]
+    Show information about loaded programs.  If *PROG* is specified show
+    information only about given programs, otherwise list all programs
+    currently loaded on the system. In case of **tag** or **name**, *PROG* may
+    match several programs which will all be shown.
+
+    Output will start with program ID followed by program type and zero or more
+    named attributes (depending on kernel version).
+
+    Since Linux 5.1 the kernel can collect statistics on BPF programs (such as
+    the total time spent running the program, and the number of times it was
+    run). If available, bpftool shows such statistics. However, the kernel does
+    not collect them by defaults, as it slightly impacts performance on each
+    program run. Activation or deactivation of the feature is performed via the
+    **kernel.bpf_stats_enabled** sysctl knob.
+
+    Since Linux 5.8 bpftool is able to discover information about processes
+    that hold open file descriptors (FDs) against BPF programs. On such kernels
+    bpftool will automatically emit this information as well.
+
+bpftool prog dump xlated *PROG* [{ file *FILE* | [opcodes] [linum] [visual] }]
+    Dump eBPF instructions of the programs from the kernel. By default, eBPF
+    will be disassembled and printed to standard output in human-readable
+    format. In this case, **opcodes** controls if raw opcodes should be printed
+    as well.
+
+    In case of **tag** or **name**, *PROG* may match several programs which
+    will all be dumped.  However, if **file** or **visual** is specified,
+    *PROG* must match a single program.
+
+    If **file** is specified, the binary image will instead be written to
+    *FILE*.
+
+    If **visual** is specified, control flow graph (CFG) will be built instead,
+    and eBPF instructions will be presented with CFG in DOT format, on standard
+    output.
+
+    If the programs have line_info available, the source line will be
+    displayed.  If **linum** is specified, the filename, line number and line
+    column will also be displayed.
+
+bpftool prog dump jited  *PROG* [{ file *FILE* | [opcodes] [linum] }]
+    Dump jited image (host machine code) of the program.
+
+    If *FILE* is specified image will be written to a file, otherwise it will
+    be disassembled and printed to stdout. *PROG* must match a single program
+    when **file** is specified.
+
+    **opcodes** controls if raw opcodes will be printed.
+
+    If the prog has line_info available, the source line will be displayed. If
+    **linum** is specified, the filename, line number and line column will also
+    be displayed.
+
+bpftool prog pin *PROG* *FILE*
+    Pin program *PROG* as *FILE*.
+
+    Note: *FILE* must be located in *bpffs* mount. It must not contain a dot
+    character ('.'), which is reserved for future extensions of *bpffs*.
+
+bpftool prog { load | loadall } *OBJ* *PATH* [type *TYPE*] [map { idx *IDX* | name *NAME* } *MAP*] [{ offload_dev | xdpmeta_dev } *NAME*] [pinmaps *MAP_DIR*] [autoattach]
+    Load bpf program(s) from binary *OBJ* and pin as *PATH*. **bpftool prog
+    load** pins only the first program from the *OBJ* as *PATH*. **bpftool prog
+    loadall** pins all programs from the *OBJ* under *PATH* directory. **type**
+    is optional, if not specified program type will be inferred from section
+    names. By default bpftool will create new maps as declared in the ELF
+    object being loaded.  **map** parameter allows for the reuse of existing
+    maps. It can be specified multiple times, each time for a different map.
+    *IDX* refers to index of the map to be replaced in the ELF file counting
+    from 0, while *NAME* allows to replace a map by name. *MAP* specifies the
+    map to use, referring to it by **id** or through a **pinned** file. If
+    **offload_dev** *NAME* is specified program will be loaded onto given
+    networking device (offload). If **xdpmeta_dev** *NAME* is specified program
+    will become device-bound without offloading, this facilitates access to XDP
+    metadata. Optional **pinmaps** argument can be provided to pin all maps
+    under *MAP_DIR* directory.
+
+    If **autoattach** is specified program will be attached before pin. In that
+    case, only the link (representing the program attached to its hook) is
+    pinned, not the program as such, so the path won't show in **bpftool prog
+    show -f**, only show in **bpftool link show -f**. Also, this only works
+    when bpftool (libbpf) is able to infer all necessary information from the
+    object file, in particular, it's not supported for all program types. If a
+    program does not support autoattach, bpftool falls back to regular pinning
+    for that program instead.
+
+    Note: *PATH* must be located in *bpffs* mount. It must not contain a dot
+    character ('.'), which is reserved for future extensions of *bpffs*.
+
+bpftool prog attach *PROG* *ATTACH_TYPE* [*MAP*]
+    Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most
+    *ATTACH_TYPEs* require a *MAP* parameter, with the exception of
+    *flow_dissector* which is attached to current networking name space.
+
+bpftool prog detach *PROG* *ATTACH_TYPE* [*MAP*]
+    Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*). Most
+    *ATTACH_TYPEs* require a *MAP* parameter, with the exception of
+    *flow_dissector* which is detached from the current networking name space.
+
+bpftool prog tracelog
+    Dump the trace pipe of the system to the console (stdout). Hit <Ctrl+C> to
+    stop printing. BPF programs can write to this trace pipe at runtime with
+    the **bpf_trace_printk**\ () helper. This should be used only for debugging
+    purposes. For streaming data from BPF programs to user space, one can use
+    perf events (see also **bpftool-map**\ (8)).
+
+bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*]
+    Run BPF program *PROG* in the kernel testing infrastructure for BPF,
+    meaning that the program works on the data and context provided by the
+    user, and not on actual packets or monitored functions etc. Return value
+    and duration for the test run are printed out to the console.
+
+    Input data is read from the *FILE* passed with **data_in**. If this *FILE*
+    is "**-**", input data is read from standard input. Input context, if any,
+    is read from *FILE* passed with **ctx_in**. Again, "**-**" can be used to
+    read from standard input, but only if standard input is not already in use
+    for input data. If a *FILE* is passed with **data_out**, output data is
+    written to that file. Similarly, output context is written to the *FILE*
+    passed with **ctx_out**. For both output flows, "**-**" can be used to
+    print to the standard output (as plain text, or JSON if relevant option was
+    passed). If output keywords are omitted, output data and context are
+    discarded. Keywords **data_size_out** and **ctx_size_out** are used to pass
+    the size (in bytes) for the output buffers to the kernel, although the
+    default of 32 kB should be more than enough for most cases.
+
+    Keyword **repeat** is used to indicate the number of consecutive runs to
+    perform. Note that output data and context printed to files correspond to
+    the last of those runs. The duration printed out at the end of the runs is
+    an average over all runs performed by the command.
+
+    Not all program types support test run. Among those which do, not all of
+    them can take the **ctx_in**/**ctx_out** arguments. bpftool does not
+    perform checks on program types.
+
+bpftool prog profile *PROG* [duration *DURATION*] *METRICs*
+    Profile *METRICs* for bpf program *PROG* for *DURATION* seconds or until
+    user hits <Ctrl+C>. *DURATION* is optional. If *DURATION* is not specified,
+    the profiling will run up to **UINT_MAX** seconds.
+
+bpftool prog help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
-
-	-f, --bpffs
-		  When showing BPF programs, show file names of pinned
-		  programs.
-
-	-m, --mapcompat
-		  Allow loading maps with unknown map definitions.
-
-	-n, --nomount
-		  Do not automatically attempt to mount any virtual file system
-		  (such as tracefs or BPF virtual file system) when necessary.
-
-	-L, --use-loader
-		  Load program as a "loader" program. This is useful to debug
-		  the generation of such programs. When this option is in
-		  use, bpftool attempts to load the programs from the object
-		  file into the kernel, but does not pin them (therefore, the
-		  *PATH* must not be provided).
-
-		  When combined with the **-d**\ \|\ **--debug** option,
-		  additional debug messages are generated, and the execution
-		  of the loader program will use the **bpf_trace_printk**\ ()
-		  helper to log each step of loading BTF, creating the maps,
-		  and loading the programs (see **bpftool prog tracelog** as
-		  a way to dump those messages).
+.. include:: common_options.rst
+
+-f, --bpffs
+    When showing BPF programs, show file names of pinned programs.
+
+-m, --mapcompat
+    Allow loading maps with unknown map definitions.
+
+-n, --nomount
+    Do not automatically attempt to mount any virtual file system (such as
+    tracefs or BPF virtual file system) when necessary.
+
+-L, --use-loader
+    Load program as a "loader" program. This is useful to debug the generation
+    of such programs. When this option is in use, bpftool attempts to load the
+    programs from the object file into the kernel, but does not pin them
+    (therefore, the *PATH* must not be provided).
+
+    When combined with the **-d**\ \|\ **--debug** option, additional debug
+    messages are generated, and the execution of the loader program will use
+    the **bpf_trace_printk**\ () helper to log each step of loading BTF,
+    creating the maps, and loading the programs (see **bpftool prog tracelog**
+    as a way to dump those messages).
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
index 8022b5321dbe..e871b9539ac7 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst
@@ -14,61 +14,60 @@ tool to register/unregister/introspect BPF struct_ops
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] **struct_ops** *COMMAND*
+**bpftool** [*OPTIONS*] **struct_ops** *COMMAND*
 
-	*OPTIONS* := { |COMMON_OPTIONS| }
+*OPTIONS* := { |COMMON_OPTIONS| }
 
-	*COMMANDS* :=
-	{ **show** | **list** | **dump** | **register** | **unregister** | **help** }
+*COMMANDS* :=
+{ **show** | **list** | **dump** | **register** | **unregister** | **help** }
 
 STRUCT_OPS COMMANDS
 ===================
 
-|	**bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*]
-|	**bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*]
-|	**bpftool** **struct_ops register** *OBJ* [*LINK_DIR*]
-|	**bpftool** **struct_ops unregister** *STRUCT_OPS_MAP*
-|	**bpftool** **struct_ops help**
+| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*]
+| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*]
+| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*]
+| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP*
+| **bpftool** **struct_ops help**
 |
-|	*STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* }
-|	*OBJ* := /a/file/of/bpf_struct_ops.o
+| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* }
+| *OBJ* := /a/file/of/bpf_struct_ops.o
 
 
 DESCRIPTION
 ===========
-	**bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*]
-		  Show brief information about the struct_ops in the system.
-		  If *STRUCT_OPS_MAP* is specified, it shows information only
-		  for the given struct_ops.  Otherwise, it lists all struct_ops
-		  currently existing in the system.
-
-		  Output will start with struct_ops map ID, followed by its map
-		  name and its struct_ops's kernel type.
-
-	**bpftool struct_ops dump** [*STRUCT_OPS_MAP*]
-		  Dump details information about the struct_ops in the system.
-		  If *STRUCT_OPS_MAP* is specified, it dumps information only
-		  for the given struct_ops.  Otherwise, it dumps all struct_ops
-		  currently existing in the system.
-
-	**bpftool struct_ops register** *OBJ* [*LINK_DIR*]
-		  Register bpf struct_ops from *OBJ*.  All struct_ops under
-		  the ELF section ".struct_ops" and ".struct_ops.link" will
-		  be registered to its kernel subsystem.  For each
-		  struct_ops in the ".struct_ops.link" section, a link
-		  will be created.  You can give *LINK_DIR* to provide a
-		  directory path where these links will be pinned with the
-		  same name as their corresponding map name.
-
-	**bpftool struct_ops unregister**  *STRUCT_OPS_MAP*
-		  Unregister the *STRUCT_OPS_MAP* from the kernel subsystem.
-
-	**bpftool struct_ops help**
-		  Print short help message.
+bpftool struct_ops { show | list } [*STRUCT_OPS_MAP*]
+    Show brief information about the struct_ops in the system. If
+    *STRUCT_OPS_MAP* is specified, it shows information only for the given
+    struct_ops.  Otherwise, it lists all struct_ops currently existing in the
+    system.
+
+    Output will start with struct_ops map ID, followed by its map name and its
+    struct_ops's kernel type.
+
+bpftool struct_ops dump [*STRUCT_OPS_MAP*]
+    Dump details information about the struct_ops in the system. If
+    *STRUCT_OPS_MAP* is specified, it dumps information only for the given
+    struct_ops.  Otherwise, it dumps all struct_ops currently existing in the
+    system.
+
+bpftool struct_ops register *OBJ* [*LINK_DIR*]
+    Register bpf struct_ops from *OBJ*.  All struct_ops under the ELF section
+    ".struct_ops" and ".struct_ops.link" will be registered to its kernel
+    subsystem.  For each struct_ops in the ".struct_ops.link" section, a link
+    will be created.  You can give *LINK_DIR* to provide a directory path where
+    these links will be pinned with the same name as their corresponding map
+    name.
+
+bpftool struct_ops unregister  *STRUCT_OPS_MAP*
+    Unregister the *STRUCT_OPS_MAP* from the kernel subsystem.
+
+bpftool struct_ops help
+    Print short help message.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
 EXAMPLES
 ========
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 09e4f2ff5658..f38ae5c40439 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -14,57 +14,57 @@ tool for inspection and simple manipulation of eBPF programs and maps
 SYNOPSIS
 ========
 
-	**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** }
+**bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** }
 
-	**bpftool** **batch file** *FILE*
+**bpftool** **batch file** *FILE*
 
-	**bpftool** **version**
+**bpftool** **version**
 
-	*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** |
-	**btf** | **gen** | **struct_ops** | **iter** }
+*OBJECT* := { **map** | **prog** | **link** | **cgroup** | **perf** | **net** | **feature** |
+**btf** | **gen** | **struct_ops** | **iter** }
 
-	*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| }
+*OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| }
 
-	*MAP-COMMANDS* :=
-	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
-	**delete** | **pin** | **event_pipe** | **help** }
+*MAP-COMMANDS* :=
+{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** |
+**delete** | **pin** | **event_pipe** | **help** }
 
-	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** |
-	**load** | **attach** | **detach** | **help** }
+*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** |
+**load** | **attach** | **detach** | **help** }
 
-	*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** }
+*LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** }
 
-	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
+*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
-	*PERF-COMMANDS* := { **show** | **list** | **help** }
+*PERF-COMMANDS* := { **show** | **list** | **help** }
 
-	*NET-COMMANDS* := { **show** | **list** | **help** }
+*NET-COMMANDS* := { **show** | **list** | **help** }
 
-	*FEATURE-COMMANDS* := { **probe** | **help** }
+*FEATURE-COMMANDS* := { **probe** | **help** }
 
-	*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** }
+*BTF-COMMANDS* := { **show** | **list** | **dump** | **help** }
 
-	*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** }
+*GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** }
 
-	*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** }
+*STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** }
 
-	*ITER-COMMANDS* := { **pin** | **help** }
+*ITER-COMMANDS* := { **pin** | **help** }
 
 DESCRIPTION
 ===========
-	*bpftool* allows for inspection and simple modification of BPF objects
-	on the system.
+*bpftool* allows for inspection and simple modification of BPF objects on the
+system.
 
-	Note that format of the output of all tools is not guaranteed to be
-	stable and should not be depended upon.
+Note that format of the output of all tools is not guaranteed to be stable and
+should not be depended upon.
 
 OPTIONS
 =======
-	.. include:: common_options.rst
+.. include:: common_options.rst
 
-	-m, --mapcompat
-		  Allow loading maps with unknown map definitions.
+-m, --mapcompat
+    Allow loading maps with unknown map definitions.
 
-	-n, --nomount
-		  Do not automatically attempt to mount any virtual file system
-		  (such as tracefs or BPF virtual file system) when necessary.
+-n, --nomount
+    Do not automatically attempt to mount any virtual file system (such as
+    tracefs or BPF virtual file system) when necessary.
diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst
index 30df7a707f02..9234b9dab768 100644
--- a/tools/bpf/bpftool/Documentation/common_options.rst
+++ b/tools/bpf/bpftool/Documentation/common_options.rst
@@ -1,25 +1,23 @@
 .. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 
 -h, --help
-	  Print short help message (similar to **bpftool help**).
+    Print short help message (similar to **bpftool help**).
 
 -V, --version
-	  Print bpftool's version number (similar to **bpftool version**), the
-	  number of the libbpf version in use, and optional features that were
-	  included when bpftool was compiled. Optional features include linking
-	  against LLVM or libbfd to provide the disassembler for JIT-ted
-	  programs (**bpftool prog dump jited**) and usage of BPF skeletons
-	  (some features like **bpftool prog profile** or showing pids
-	  associated to BPF objects may rely on it).
+    Print bpftool's version number (similar to **bpftool version**), the number
+    of the libbpf version in use, and optional features that were included when
+    bpftool was compiled. Optional features include linking against LLVM or
+    libbfd to provide the disassembler for JIT-ted programs (**bpftool prog
+    dump jited**) and usage of BPF skeletons (some features like **bpftool prog
+    profile** or showing pids associated to BPF objects may rely on it).
 
 -j, --json
-	  Generate JSON output. For commands that cannot produce JSON, this
-	  option has no effect.
+    Generate JSON output. For commands that cannot produce JSON, this option
+    has no effect.
 
 -p, --pretty
-	  Generate human-readable JSON output. Implies **-j**.
+    Generate human-readable JSON output. Implies **-j**.
 
 -d, --debug
-	  Print all logs available, even debug-level information. This includes
-	  logs from libbpf as well as from the verifier, when attempting to
-	  load programs.
+    Print all logs available, even debug-level information. This includes logs
+    from libbpf as well as from the verifier, when attempting to load programs.
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index e9154ace80ff..dfa4f1bebbb3 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -89,6 +89,10 @@ ifneq ($(EXTRA_LDFLAGS),)
 LDFLAGS += $(EXTRA_LDFLAGS)
 endif
 
+HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\
+		$(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS)))
+HOST_LDFLAGS := $(LDFLAGS)
+
 INSTALL ?= install
 RM ?= rm -f
 
@@ -143,7 +147,7 @@ ifeq ($(feature-llvm),1)
   # If LLVM is available, use it for JIT disassembly
   CFLAGS  += -DHAVE_LLVM_SUPPORT
   LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets
-  CFLAGS  += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS))
+  CFLAGS  += $(shell $(LLVM_CONFIG) --cflags)
   LIBS    += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS))
   ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static)
     LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS))
@@ -178,12 +182,9 @@ ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),)
   SRCS := $(filter-out jit_disasm.c,$(SRCS))
 endif
 
-HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\
-		$(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS)))
-
 BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool
 
-BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o)
+BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o)
 $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP)
 
 OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
@@ -231,14 +232,11 @@ endif
 
 CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS)
 
-$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
-	$(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@
-
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
 	$(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@
 
 $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP)
-	$(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@
+	$(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(HOST_LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
 	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 6e4f7ce6bc01..04afe2ac2228 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -106,19 +106,19 @@ _bpftool_get_link_ids()
 
 _bpftool_get_obj_map_names()
 {
-    local obj
+    local obj maps
 
     obj=$1
 
-    maps=$(objdump -j maps -t $obj 2>/dev/null | \
-        command awk '/g     . maps/ {print $NF}')
+    maps=$(objdump -j .maps -t $obj 2>/dev/null | \
+        command awk '/g     . .maps/ {print $NF}')
 
     COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) )
 }
 
 _bpftool_get_obj_map_idxs()
 {
-    local obj
+    local obj nmaps
 
     obj=$1
 
@@ -136,7 +136,7 @@ _sysfs_get_netdevs()
 # Retrieve type of the map that we are operating on.
 _bpftool_map_guess_map_type()
 {
-    local keyword ref
+    local keyword idx ref=""
     for (( idx=3; idx < ${#words[@]}-1; idx++ )); do
         case "${words[$((idx-2))]}" in
             lookup|update)
@@ -255,8 +255,9 @@ _bpftool_map_update_get_name()
 
 _bpftool()
 {
-    local cur prev words objword json=0
-    _init_completion || return
+    local cur prev words cword comp_args
+    local json=0
+    _init_completion -- "$@" || return
 
     # Deal with options
     if [[ ${words[cword]} == -* ]]; then
@@ -293,7 +294,7 @@ _bpftool()
     esac
 
     # Remove all options so completions don't have to deal with them.
-    local i
+    local i pprev
     for (( i=1; i < ${#words[@]}; )); do
         if [[ ${words[i]::1} == - ]] &&
             [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then
@@ -307,7 +308,7 @@ _bpftool()
     prev=${words[cword - 1]}
     pprev=${words[cword - 2]}
 
-    local object=${words[1]} command=${words[2]}
+    local object=${words[1]}
 
     if [[ -z $object || $cword -eq 1 ]]; then
         case $cur in
@@ -324,8 +325,12 @@ _bpftool()
         esac
     fi
 
+    local command=${words[2]}
     [[ $command == help ]] && return 0
 
+    local MAP_TYPE='id pinned name'
+    local PROG_TYPE='id pinned tag name'
+
     # Completion depends on object and command in use
     case $object in
         prog)
@@ -346,8 +351,6 @@ _bpftool()
                     ;;
             esac
 
-            local PROG_TYPE='id pinned tag name'
-            local MAP_TYPE='id pinned name'
             local METRIC_TYPE='cycles instructions l1d_loads llc_misses \
                 itlb_misses dtlb_misses'
             case $command in
@@ -457,7 +460,7 @@ _bpftool()
                     obj=${words[3]}
 
                     if [[ ${words[-4]} == "map" ]]; then
-                        COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) )
+                        COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
                         return 0
                     fi
                     if [[ ${words[-3]} == "map" ]]; then
@@ -541,20 +544,9 @@ _bpftool()
                             COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) )
                             return 0
                             ;;
-                        6)
-                            case $prev in
-                                duration)
-                                    return 0
-                                    ;;
-                                *)
-                                    COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) )
-                                    return 0
-                                    ;;
-                            esac
-                            return 0
-                            ;;
                         *)
-                            COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) )
+                            [[ $prev == duration ]] && return 0
+                            _bpftool_once_attr "$METRIC_TYPE"
                             return 0
                             ;;
                     esac
@@ -612,7 +604,7 @@ _bpftool()
                     return 0
                     ;;
                 register)
-                    _filedir
+                    [[ $prev == $command ]] && _filedir
                     return 0
                     ;;
                 *)
@@ -638,9 +630,12 @@ _bpftool()
                         pinned)
                             _filedir
                             ;;
-                        *)
+                        map)
                             _bpftool_one_of_list $MAP_TYPE
                             ;;
+                        *)
+                            _bpftool_once_attr 'map'
+                            ;;
                     esac
                     return 0
                     ;;
@@ -652,7 +647,6 @@ _bpftool()
             esac
             ;;
         map)
-            local MAP_TYPE='id pinned name'
             case $command in
                 show|list|dump|peek|pop|dequeue|freeze)
                     case $prev in
@@ -793,13 +787,11 @@ _bpftool()
                             # map, depending on the type of the map to update.
                             case "$(_bpftool_map_guess_map_type)" in
                                 array_of_maps|hash_of_maps)
-                                    local MAP_TYPE='id pinned name'
                                     COMPREPLY+=( $( compgen -W "$MAP_TYPE" \
                                         -- "$cur" ) )
                                     return 0
                                     ;;
                                 prog_array)
-                                    local PROG_TYPE='id pinned tag name'
                                     COMPREPLY+=( $( compgen -W "$PROG_TYPE" \
                                         -- "$cur" ) )
                                     return 0
@@ -821,7 +813,7 @@ _bpftool()
                             esac
 
                             _bpftool_once_attr 'key'
-                            local UPDATE_FLAGS='any exist noexist'
+                            local UPDATE_FLAGS='any exist noexist' idx
                             for (( idx=3; idx < ${#words[@]}-1; idx++ )); do
                                 if [[ ${words[idx]} == 'value' ]]; then
                                     # 'value' is present, but is not the last
@@ -893,7 +885,6 @@ _bpftool()
             esac
             ;;
         btf)
-            local PROG_TYPE='id pinned tag name'
             local MAP_TYPE='id pinned name'
             case $command in
                 dump)
@@ -1033,7 +1024,6 @@ _bpftool()
                     local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \
                         grep '^cgroup_')"
                     local ATTACH_FLAGS='multi override'
-                    local PROG_TYPE='id pinned tag name'
                     # Check for $prev = $command first
                     if [ $prev = $command ]; then
                         _filedir
@@ -1086,7 +1076,6 @@ _bpftool()
             esac
             ;;
         net)
-            local PROG_TYPE='id pinned tag name'
             local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload'
             case $command in
                 show|list)
@@ -1193,14 +1182,14 @@ _bpftool()
                 pin|detach)
                     if [[ $prev == "$command" ]]; then
                         COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) )
-                    else
+                    elif [[ $pprev == "$command" ]]; then
                         _filedir
                     fi
                     return 0
                     ;;
                 *)
                     [[ $prev == $object ]] && \
-                        COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) )
+                        COMPREPLY=( $( compgen -W 'help pin detach show list' -- "$cur" ) )
                     ;;
             esac
             ;;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index cc6e6aae2447..958e92acca8e 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -244,29 +244,101 @@ int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type)
 	return fd;
 }
 
-int mount_bpffs_for_pin(const char *name, bool is_dir)
+int create_and_mount_bpffs_dir(const char *dir_name)
 {
 	char err_str[ERR_MAX_LEN];
-	char *file;
-	char *dir;
+	bool dir_exists;
 	int err = 0;
 
-	if (is_dir && is_bpffs(name))
+	if (is_bpffs(dir_name))
 		return err;
 
-	file = malloc(strlen(name) + 1);
-	if (!file) {
+	dir_exists = access(dir_name, F_OK) == 0;
+
+	if (!dir_exists) {
+		char *temp_name;
+		char *parent_name;
+
+		temp_name = strdup(dir_name);
+		if (!temp_name) {
+			p_err("mem alloc failed");
+			return -1;
+		}
+
+		parent_name = dirname(temp_name);
+
+		if (is_bpffs(parent_name)) {
+			/* nothing to do if already mounted */
+			free(temp_name);
+			return err;
+		}
+
+		if (access(parent_name, F_OK) == -1) {
+			p_err("can't create dir '%s' to pin BPF object: parent dir '%s' doesn't exist",
+			      dir_name, parent_name);
+			free(temp_name);
+			return -1;
+		}
+
+		free(temp_name);
+	}
+
+	if (block_mount) {
+		p_err("no BPF file system found, not mounting it due to --nomount option");
+		return -1;
+	}
+
+	if (!dir_exists) {
+		err = mkdir(dir_name, S_IRWXU);
+		if (err) {
+			p_err("failed to create dir '%s': %s", dir_name, strerror(errno));
+			return err;
+		}
+	}
+
+	err = mnt_fs(dir_name, "bpf", err_str, ERR_MAX_LEN);
+	if (err) {
+		err_str[ERR_MAX_LEN - 1] = '\0';
+		p_err("can't mount BPF file system on given dir '%s': %s",
+		      dir_name, err_str);
+
+		if (!dir_exists)
+			rmdir(dir_name);
+	}
+
+	return err;
+}
+
+int mount_bpffs_for_file(const char *file_name)
+{
+	char err_str[ERR_MAX_LEN];
+	char *temp_name;
+	char *dir;
+	int err = 0;
+
+	if (access(file_name, F_OK) != -1) {
+		p_err("can't pin BPF object: path '%s' already exists", file_name);
+		return -1;
+	}
+
+	temp_name = strdup(file_name);
+	if (!temp_name) {
 		p_err("mem alloc failed");
 		return -1;
 	}
 
-	strcpy(file, name);
-	dir = dirname(file);
+	dir = dirname(temp_name);
 
 	if (is_bpffs(dir))
 		/* nothing to do if already mounted */
 		goto out_free;
 
+	if (access(dir, F_OK) == -1) {
+		p_err("can't pin BPF object: dir '%s' doesn't exist", dir);
+		err = -1;
+		goto out_free;
+	}
+
 	if (block_mount) {
 		p_err("no BPF file system found, not mounting it due to --nomount option");
 		err = -1;
@@ -276,12 +348,12 @@ int mount_bpffs_for_pin(const char *name, bool is_dir)
 	err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN);
 	if (err) {
 		err_str[ERR_MAX_LEN - 1] = '\0';
-		p_err("can't mount BPF file system to pin the object (%s): %s",
-		      name, err_str);
+		p_err("can't mount BPF file system to pin the object '%s': %s",
+		      file_name, err_str);
 	}
 
 out_free:
-	free(file);
+	free(temp_name);
 	return err;
 }
 
@@ -289,7 +361,7 @@ int do_pin_fd(int fd, const char *name)
 {
 	int err;
 
-	err = mount_bpffs_for_pin(name, false);
+	err = mount_bpffs_for_file(name);
 	if (err)
 		return err;
 
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index 708733b0ea06..c754a428c8c6 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -664,7 +664,8 @@ probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type,
 
 	probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf,
 				sizeof(buf), ifindex);
-	res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ");
+	res = !grep(buf, "invalid func ") && !grep(buf, "unknown func ") &&
+		!grep(buf, "program of this type cannot use helper ");
 
 	switch (get_vendor_id(ifindex)) {
 	case 0x19ee: /* Netronome specific */
diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 4fa4ade1ce74..b3979ddc0189 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -121,7 +121,7 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz)
 	int i, n;
 
 	/* recognize hard coded LLVM section name */
-	if (strcmp(sec_name, ".arena.1") == 0) {
+	if (strcmp(sec_name, ".addr_space.1") == 0) {
 		/* this is the name to use in skeleton */
 		snprintf(buf, buf_sz, "arena");
 		return true;
@@ -386,7 +386,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name
 			 */
 			needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var);
 			if (needs_typeof)
-				printf("typeof(");
+				printf("__typeof__(");
 
 			err = btf_dump__emit_type_decl(d, var_type_id, &opts);
 			if (err)
@@ -1131,7 +1131,8 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj)
 			continue;
 		codegen("\
 			\n\
-				obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\
+				obj->struct_ops.%1$s = (__typeof__(obj->struct_ops.%1$s))\n\
+					bpf_map__initial_value(obj->maps.%1$s, NULL);\n\
 			\n\
 			", ident);
 	}
diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
index 6b0e5202ca7a..5c39c2ed36a2 100644
--- a/tools/bpf/bpftool/iter.c
+++ b/tools/bpf/bpftool/iter.c
@@ -76,7 +76,7 @@ static int do_pin(int argc, char **argv)
 		goto close_obj;
 	}
 
-	err = mount_bpffs_for_pin(path, false);
+	err = mount_bpffs_for_file(path);
 	if (err)
 		goto close_link;
 
diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index afde9d0c2ea1..5cd503b763d7 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -526,6 +526,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info)
 		show_link_ifindex_json(info->netkit.ifindex, json_wtr);
 		show_link_attach_type_json(info->netkit.attach_type, json_wtr);
 		break;
+	case BPF_LINK_TYPE_SOCKMAP:
+		jsonw_uint_field(json_wtr, "map_id", info->sockmap.map_id);
+		show_link_attach_type_json(info->sockmap.attach_type, json_wtr);
+		break;
 	case BPF_LINK_TYPE_XDP:
 		show_link_ifindex_json(info->xdp.ifindex, json_wtr);
 		break;
@@ -915,6 +919,11 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info)
 		show_link_ifindex_plain(info->netkit.ifindex);
 		show_link_attach_type_plain(info->netkit.attach_type);
 		break;
+	case BPF_LINK_TYPE_SOCKMAP:
+		printf("\n\t");
+		printf("map_id %u  ", info->sockmap.map_id);
+		show_link_attach_type_plain(info->sockmap.attach_type);
+		break;
 	case BPF_LINK_TYPE_XDP:
 		printf("\n\t");
 		show_link_ifindex_plain(info->xdp.ifindex);
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index b8bb08d10dec..9eb764fe4cc8 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -142,7 +142,8 @@ const char *get_fd_type_name(enum bpf_obj_type type);
 char *get_fdinfo(int fd, const char *key);
 int open_obj_pinned(const char *path, bool quiet);
 int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type);
-int mount_bpffs_for_pin(const char *name, bool is_dir);
+int mount_bpffs_for_file(const char *file_name);
+int create_and_mount_bpffs_dir(const char *dir_name);
 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***));
 int do_pin_fd(int fd, const char *name);
 
diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c
index 00c77edb6331..9b898571b49e 100644
--- a/tools/bpf/bpftool/pids.c
+++ b/tools/bpf/bpftool/pids.c
@@ -101,7 +101,6 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type)
 	char buf[4096 / sizeof(*e) * sizeof(*e)];
 	struct pid_iter_bpf *skel;
 	int err, ret, fd = -1, i;
-	libbpf_print_fn_t default_print;
 
 	*map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL);
 	if (IS_ERR(*map)) {
@@ -118,12 +117,18 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type)
 
 	skel->rodata->obj_type = type;
 
-	/* we don't want output polluted with libbpf errors if bpf_iter is not
-	 * supported
-	 */
-	default_print = libbpf_set_print(libbpf_print_none);
-	err = pid_iter_bpf__load(skel);
-	libbpf_set_print(default_print);
+	if (!verifier_logs) {
+		libbpf_print_fn_t default_print;
+
+		/* Unless debug information is on, we don't want the output to
+		 * be polluted with libbpf errors if bpf_iter is not supported.
+		 */
+		default_print = libbpf_set_print(libbpf_print_none);
+		err = pid_iter_bpf__load(skel);
+		libbpf_set_print(default_print);
+	} else {
+		err = pid_iter_bpf__load(skel);
+	}
 	if (err) {
 		/* too bad, kernel doesn't support BPF iterators yet */
 		err = 0;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9cb42a3366c0..1a501cf09e78 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -1778,7 +1778,10 @@ offload_dev:
 		goto err_close_obj;
 	}
 
-	err = mount_bpffs_for_pin(pinfile, !first_prog_only);
+	if (first_prog_only)
+		err = mount_bpffs_for_file(pinfile);
+	else
+		err = create_and_mount_bpffs_dir(pinfile);
 	if (err)
 		goto err_close_obj;
 
@@ -2078,7 +2081,7 @@ static int profile_parse_metrics(int argc, char **argv)
 		NEXT_ARG();
 	}
 	if (selected_cnt > MAX_NUM_PROFILE_METRICS) {
-		p_err("too many (%d) metrics, please specify no more than %d metrics at at time",
+		p_err("too many (%d) metrics, please specify no more than %d metrics at a time",
 		      selected_cnt, MAX_NUM_PROFILE_METRICS);
 		return -1;
 	}
diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c
index 26004f0c5a6a..7bdbcac3cf62 100644
--- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c
+++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c
@@ -102,8 +102,8 @@ int iter(struct bpf_iter__task_file *ctx)
 				       BPF_LINK_TYPE_PERF_EVENT___local)) {
 		struct bpf_link *link = (struct bpf_link *) file->private_data;
 
-		if (link->type == bpf_core_enum_value(enum bpf_link_type___local,
-						      BPF_LINK_TYPE_PERF_EVENT___local)) {
+		if (BPF_CORE_READ(link, type) == bpf_core_enum_value(enum bpf_link_type___local,
+								     BPF_LINK_TYPE_PERF_EVENT___local)) {
 			e.has_bpf_cookie = true;
 			e.bpf_cookie = get_bpf_cookie(link);
 		}
diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c
index d573f2640d8e..aa43dead249c 100644
--- a/tools/bpf/bpftool/struct_ops.c
+++ b/tools/bpf/bpftool/struct_ops.c
@@ -515,7 +515,7 @@ static int do_register(int argc, char **argv)
 	if (argc == 1)
 		linkdir = GET_ARG();
 
-	if (linkdir && mount_bpffs_for_pin(linkdir, true)) {
+	if (linkdir && create_and_mount_bpffs_dir(linkdir)) {
 		p_err("can't mount bpffs for pinning");
 		return -1;
 	}
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 64df118376df..1e2ab148d5db 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -87,6 +87,7 @@ FEATURE_TESTS_EXTRA :=                  \
          gtk2-infobar                   \
          hello                          \
          libbabeltrace                  \
+         libcapstone                    \
          libbfd-liberty                 \
          libbfd-liberty-z               \
          libopencsd                     \
@@ -134,6 +135,7 @@ FEATURE_DISPLAY ?=              \
          libcrypto              \
          libunwind              \
          libdw-dwarf-unwind     \
+         libcapstone            \
          zlib                   \
          lzma                   \
          get_cpuid              \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 37722e509eb9..ed54cef450f5 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -54,6 +54,7 @@ FILES=                                          \
          test-timerfd.bin                       \
          test-libdw-dwarf-unwind.bin            \
          test-libbabeltrace.bin                 \
+         test-libcapstone.bin			\
          test-compile-32.bin                    \
          test-compile-x32.bin                   \
          test-zlib.bin                          \
@@ -286,6 +287,9 @@ $(OUTPUT)test-libdw-dwarf-unwind.bin:
 $(OUTPUT)test-libbabeltrace.bin:
 	$(BUILD) # -lbabeltrace provided by $(FEATURE_CHECK_LDFLAGS-libbabeltrace)
 
+$(OUTPUT)test-libcapstone.bin:
+	$(BUILD) # -lcapstone provided by $(FEATURE_CHECK_LDFLAGS-libcapstone)
+
 $(OUTPUT)test-compile-32.bin:
 	$(CC) -m32 -o $@ test-compile.c
 
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index 6f4bf386a3b5..dd0a18c2ef8f 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -134,6 +134,10 @@
 #undef main
 #endif
 
+#define main main_test_libcapstone
+# include "test-libcapstone.c"
+#undef main
+
 #define main main_test_lzma
 # include "test-lzma.c"
 #undef main
diff --git a/tools/build/feature/test-libcapstone.c b/tools/build/feature/test-libcapstone.c
new file mode 100644
index 000000000000..fbe8dba189e9
--- /dev/null
+++ b/tools/build/feature/test-libcapstone.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <capstone/capstone.h>
+
+int main(void)
+{
+	csh handle;
+
+	cs_open(CS_ARCH_X86, CS_MODE_64, &handle);
+	return 0;
+}
diff --git a/tools/crypto/ccp/test_dbc.py b/tools/crypto/ccp/test_dbc.py
index 79de3638a01a..bb0e671be96d 100755
--- a/tools/crypto/ccp/test_dbc.py
+++ b/tools/crypto/ccp/test_dbc.py
@@ -138,12 +138,14 @@ class TestInvalidSignature(DynamicBoostControlTest):
 
     def test_authenticated_nonce(self) -> None:
         """fetch authenticated nonce"""
+        get_nonce(self.d, None)
         with self.assertRaises(OSError) as error:
             get_nonce(self.d, self.signature)
-        self.assertEqual(error.exception.errno, 1)
+        self.assertEqual(error.exception.errno, 22)
 
     def test_set_uid(self) -> None:
         """set uid"""
+        get_nonce(self.d, None)
         with self.assertRaises(OSError) as error:
             set_uid(self.d, self.uid, self.signature)
         self.assertEqual(error.exception.errno, 1)
@@ -152,13 +154,13 @@ class TestInvalidSignature(DynamicBoostControlTest):
         """fetch a parameter"""
         with self.assertRaises(OSError) as error:
             process_param(self.d, PARAM_GET_SOC_PWR_CUR, self.signature)
-        self.assertEqual(error.exception.errno, 1)
+        self.assertEqual(error.exception.errno, 11)
 
     def test_set_param(self) -> None:
         """set a parameter"""
         with self.assertRaises(OSError) as error:
             process_param(self.d, PARAM_SET_PWR_CAP, self.signature, 1000)
-        self.assertEqual(error.exception.errno, 1)
+        self.assertEqual(error.exception.errno, 11)
 
 
 class TestUnFusedSystem(DynamicBoostControlTest):
diff --git a/tools/edid/1024x768.S b/tools/edid/1024x768.S
deleted file mode 100644
index 4aed3f9ab88a..000000000000
--- a/tools/edid/1024x768.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-   1024x768.S: EDID data set for standard 1024x768 60 Hz monitor
-
-   Copyright (C) 2011 Carsten Emde <C.Emde@osadl.org>
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
-*/
-
-/* EDID */
-#define VERSION 1
-#define REVISION 3
-
-/* Display */
-#define CLOCK 65000 /* kHz */
-#define XPIX 1024
-#define YPIX 768
-#define XY_RATIO XY_RATIO_4_3
-#define XBLANK 320
-#define YBLANK 38
-#define XOFFSET 8
-#define XPULSE 144
-#define YOFFSET 3
-#define YPULSE 6
-#define DPI 72
-#define VFREQ 60 /* Hz */
-#define TIMING_NAME "Linux XGA"
-#define ESTABLISHED_TIMING2_BITS 0x08 /* Bit 3 -> 1024x768 @60 Hz */
-#define HSYNC_POL 0
-#define VSYNC_POL 0
-
-#include "edid.S"
diff --git a/tools/edid/1280x1024.S b/tools/edid/1280x1024.S
deleted file mode 100644
index b26dd424cad7..000000000000
--- a/tools/edid/1280x1024.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-   1280x1024.S: EDID data set for standard 1280x1024 60 Hz monitor
-
-   Copyright (C) 2011 Carsten Emde <C.Emde@osadl.org>
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
-*/
-
-/* EDID */
-#define VERSION 1
-#define REVISION 3
-
-/* Display */
-#define CLOCK 108000 /* kHz */
-#define XPIX 1280
-#define YPIX 1024
-#define XY_RATIO XY_RATIO_5_4
-#define XBLANK 408
-#define YBLANK 42
-#define XOFFSET 48
-#define XPULSE 112
-#define YOFFSET 1
-#define YPULSE 3
-#define DPI 72
-#define VFREQ 60 /* Hz */
-#define TIMING_NAME "Linux SXGA"
-/* No ESTABLISHED_TIMINGx_BITS */
-#define HSYNC_POL 1
-#define VSYNC_POL 1
-
-#include "edid.S"
diff --git a/tools/edid/1600x1200.S b/tools/edid/1600x1200.S
deleted file mode 100644
index 0d091b282768..000000000000
--- a/tools/edid/1600x1200.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-   1600x1200.S: EDID data set for standard 1600x1200 60 Hz monitor
-
-   Copyright (C) 2013 Carsten Emde <C.Emde@osadl.org>
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
-*/
-
-/* EDID */
-#define VERSION 1
-#define REVISION 3
-
-/* Display */
-#define CLOCK 162000 /* kHz */
-#define XPIX 1600
-#define YPIX 1200
-#define XY_RATIO XY_RATIO_4_3
-#define XBLANK 560
-#define YBLANK 50
-#define XOFFSET 64
-#define XPULSE 192
-#define YOFFSET 1
-#define YPULSE 3
-#define DPI 72
-#define VFREQ 60 /* Hz */
-#define TIMING_NAME "Linux UXGA"
-/* No ESTABLISHED_TIMINGx_BITS */
-#define HSYNC_POL 1
-#define VSYNC_POL 1
-
-#include "edid.S"
diff --git a/tools/edid/1680x1050.S b/tools/edid/1680x1050.S
deleted file mode 100644
index 7dfed9a33eab..000000000000
--- a/tools/edid/1680x1050.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-   1680x1050.S: EDID data set for standard 1680x1050 60 Hz monitor
-
-   Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org>
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
-*/
-
-/* EDID */
-#define VERSION 1
-#define REVISION 3
-
-/* Display */
-#define CLOCK 146250 /* kHz */
-#define XPIX 1680
-#define YPIX 1050
-#define XY_RATIO XY_RATIO_16_10
-#define XBLANK 560
-#define YBLANK 39
-#define XOFFSET 104
-#define XPULSE 176
-#define YOFFSET 3
-#define YPULSE 6
-#define DPI 96
-#define VFREQ 60 /* Hz */
-#define TIMING_NAME "Linux WSXGA"
-/* No ESTABLISHED_TIMINGx_BITS */
-#define HSYNC_POL 1
-#define VSYNC_POL 1
-
-#include "edid.S"
diff --git a/tools/edid/1920x1080.S b/tools/edid/1920x1080.S
deleted file mode 100644
index d6ffbba28e95..000000000000
--- a/tools/edid/1920x1080.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-   1920x1080.S: EDID data set for standard 1920x1080 60 Hz monitor
-
-   Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org>
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
-*/
-
-/* EDID */
-#define VERSION 1
-#define REVISION 3
-
-/* Display */
-#define CLOCK 148500 /* kHz */
-#define XPIX 1920
-#define YPIX 1080
-#define XY_RATIO XY_RATIO_16_9
-#define XBLANK 280
-#define YBLANK 45
-#define XOFFSET 88
-#define XPULSE 44
-#define YOFFSET 4
-#define YPULSE 5
-#define DPI 96
-#define VFREQ 60 /* Hz */
-#define TIMING_NAME "Linux FHD"
-/* No ESTABLISHED_TIMINGx_BITS */
-#define HSYNC_POL 1
-#define VSYNC_POL 1
-
-#include "edid.S"
diff --git a/tools/edid/800x600.S b/tools/edid/800x600.S
deleted file mode 100644
index a5616588de08..000000000000
--- a/tools/edid/800x600.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-   800x600.S: EDID data set for standard 800x600 60 Hz monitor
-
-   Copyright (C) 2011 Carsten Emde <C.Emde@osadl.org>
-   Copyright (C) 2014 Linaro Limited
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-*/
-
-/* EDID */
-#define VERSION 1
-#define REVISION 3
-
-/* Display */
-#define CLOCK 40000 /* kHz */
-#define XPIX 800
-#define YPIX 600
-#define XY_RATIO XY_RATIO_4_3
-#define XBLANK 256
-#define YBLANK 28
-#define XOFFSET 40
-#define XPULSE 128
-#define YOFFSET 1
-#define YPULSE 4
-#define DPI 72
-#define VFREQ 60 /* Hz */
-#define TIMING_NAME "Linux SVGA"
-#define ESTABLISHED_TIMING1_BITS 0x01 /* Bit 0: 800x600 @ 60Hz */
-#define HSYNC_POL 1
-#define VSYNC_POL 1
-
-#include "edid.S"
diff --git a/tools/edid/Makefile b/tools/edid/Makefile
deleted file mode 100644
index 85a927dfab02..000000000000
--- a/tools/edid/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-
-SOURCES	:= $(wildcard [0-9]*x[0-9]*.S)
-
-BIN	:= $(patsubst %.S, %.bin, $(SOURCES))
-
-IHEX	:= $(patsubst %.S, %.bin.ihex, $(SOURCES))
-
-CODE	:= $(patsubst %.S, %.c, $(SOURCES))
-
-all:	$(BIN) $(IHEX) $(CODE)
-
-clean:
-	@rm -f *.o *.bin.ihex *.bin *.c
-
-%.o:	%.S
-	@cc -c $^
-
-%.bin.nocrc:	%.o
-	@objcopy -Obinary $^ $@
-
-%.crc:	%.bin.nocrc
-	@list=$$(for i in `seq 1 127`; do head -c$$i $^ | tail -c1 \
-		| hexdump -v -e '/1 "%02X+"'; done); \
-		echo "ibase=16;100-($${list%?})%100" | bc >$@
-
-%.p:	%.crc %.S
-	@cc -c -DCRC="$$(cat $*.crc)" -o $@ $*.S
-
-%.bin:	%.p
-	@objcopy -Obinary $^ $@
-
-%.bin.ihex:	%.p
-	@objcopy -Oihex $^ $@
-	@dos2unix $@ 2>/dev/null
-
-%.c:	%.bin
-	@echo "{" >$@; hexdump -f hex $^ >>$@; echo "};" >>$@
diff --git a/tools/edid/edid.S b/tools/edid/edid.S
deleted file mode 100644
index c3d13815526d..000000000000
--- a/tools/edid/edid.S
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
-   edid.S: EDID data template
-
-   Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org>
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License
-   as published by the Free Software Foundation; either version 2
-   of the License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
-*/
-
-
-/* Manufacturer */
-#define MFG_LNX1 'L'
-#define MFG_LNX2 'N'
-#define MFG_LNX3 'X'
-#define SERIAL 0
-#define YEAR 2012
-#define WEEK 5
-
-/* EDID 1.3 standard definitions */
-#define XY_RATIO_16_10	0b00
-#define XY_RATIO_4_3	0b01
-#define XY_RATIO_5_4	0b10
-#define XY_RATIO_16_9	0b11
-
-/* Provide defaults for the timing bits */
-#ifndef ESTABLISHED_TIMING1_BITS
-#define ESTABLISHED_TIMING1_BITS 0x00
-#endif
-#ifndef ESTABLISHED_TIMING2_BITS
-#define ESTABLISHED_TIMING2_BITS 0x00
-#endif
-#ifndef ESTABLISHED_TIMING3_BITS
-#define ESTABLISHED_TIMING3_BITS 0x00
-#endif
-
-#define mfgname2id(v1,v2,v3) \
-	((((v1-'@')&0x1f)<<10)+(((v2-'@')&0x1f)<<5)+((v3-'@')&0x1f))
-#define swap16(v1) ((v1>>8)+((v1&0xff)<<8))
-#define lsbs2(v1,v2) (((v1&0x0f)<<4)+(v2&0x0f))
-#define msbs2(v1,v2) ((((v1>>8)&0x0f)<<4)+((v2>>8)&0x0f))
-#define msbs4(v1,v2,v3,v4) \
-	((((v1>>8)&0x03)<<6)+(((v2>>8)&0x03)<<4)+\
-	(((v3>>4)&0x03)<<2)+((v4>>4)&0x03))
-#define pixdpi2mm(pix,dpi) ((pix*25)/dpi)
-#define xsize pixdpi2mm(XPIX,DPI)
-#define ysize pixdpi2mm(YPIX,DPI)
-
-		.data
-
-/* Fixed header pattern */
-header:		.byte	0x00,0xff,0xff,0xff,0xff,0xff,0xff,0x00
-
-mfg_id:		.hword	swap16(mfgname2id(MFG_LNX1, MFG_LNX2, MFG_LNX3))
-
-prod_code:	.hword	0
-
-/* Serial number. 32 bits, little endian. */
-serial_number:	.long	SERIAL
-
-/* Week of manufacture */
-week:		.byte	WEEK
-
-/* Year of manufacture, less 1990. (1990-2245)
-   If week=255, it is the model year instead */
-year:		.byte	YEAR-1990
-
-version:	.byte	VERSION 	/* EDID version, usually 1 (for 1.3) */
-revision:	.byte	REVISION	/* EDID revision, usually 3 (for 1.3) */
-
-/* If Bit 7=1	Digital input. If set, the following bit definitions apply:
-     Bits 6-1	Reserved, must be 0
-     Bit 0	Signal is compatible with VESA DFP 1.x TMDS CRGB,
-		  1 pixel per clock, up to 8 bits per color, MSB aligned,
-   If Bit 7=0	Analog input. If clear, the following bit definitions apply:
-     Bits 6-5	Video white and sync levels, relative to blank
-		  00=+0.7/-0.3 V; 01=+0.714/-0.286 V;
-		  10=+1.0/-0.4 V; 11=+0.7/0 V
-   Bit 4	Blank-to-black setup (pedestal) expected
-   Bit 3	Separate sync supported
-   Bit 2	Composite sync (on HSync) supported
-   Bit 1	Sync on green supported
-   Bit 0	VSync pulse must be serrated when somposite or
-		  sync-on-green is used. */
-video_parms:	.byte	0x6d
-
-/* Maximum horizontal image size, in centimetres
-   (max 292 cm/115 in at 16:9 aspect ratio) */
-max_hor_size:	.byte	xsize/10
-
-/* Maximum vertical image size, in centimetres.
-   If either byte is 0, undefined (e.g. projector) */
-max_vert_size:	.byte	ysize/10
-
-/* Display gamma, minus 1, times 100 (range 1.00-3.5 */
-gamma:		.byte	120
-
-/* Bit 7	DPMS standby supported
-   Bit 6	DPMS suspend supported
-   Bit 5	DPMS active-off supported
-   Bits 4-3	Display type: 00=monochrome; 01=RGB colour;
-		  10=non-RGB multicolour; 11=undefined
-   Bit 2	Standard sRGB colour space. Bytes 25-34 must contain
-		  sRGB standard values.
-   Bit 1	Preferred timing mode specified in descriptor block 1.
-   Bit 0	GTF supported with default parameter values. */
-dsp_features:	.byte	0xea
-
-/* Chromaticity coordinates. */
-/* Red and green least-significant bits
-   Bits 7-6	Red x value least-significant 2 bits
-   Bits 5-4	Red y value least-significant 2 bits
-   Bits 3-2	Green x value lst-significant 2 bits
-   Bits 1-0	Green y value least-significant 2 bits */
-red_green_lsb:	.byte	0x5e
-
-/* Blue and white least-significant 2 bits */
-blue_white_lsb:	.byte	0xc0
-
-/* Red x value most significant 8 bits.
-   0-255 encodes 0-0.996 (255/256); 0-0.999 (1023/1024) with lsbits */
-red_x_msb:	.byte	0xa4
-
-/* Red y value most significant 8 bits */
-red_y_msb:	.byte	0x59
-
-/* Green x and y value most significant 8 bits */
-green_x_y_msb:	.byte	0x4a,0x98
-
-/* Blue x and y value most significant 8 bits */
-blue_x_y_msb:	.byte	0x25,0x20
-
-/* Default white point x and y value most significant 8 bits */
-white_x_y_msb:	.byte	0x50,0x54
-
-/* Established timings */
-/* Bit 7	720x400 @ 70 Hz
-   Bit 6	720x400 @ 88 Hz
-   Bit 5	640x480 @ 60 Hz
-   Bit 4	640x480 @ 67 Hz
-   Bit 3	640x480 @ 72 Hz
-   Bit 2	640x480 @ 75 Hz
-   Bit 1	800x600 @ 56 Hz
-   Bit 0	800x600 @ 60 Hz */
-estbl_timing1:	.byte	ESTABLISHED_TIMING1_BITS
-
-/* Bit 7	800x600 @ 72 Hz
-   Bit 6	800x600 @ 75 Hz
-   Bit 5	832x624 @ 75 Hz
-   Bit 4	1024x768 @ 87 Hz, interlaced (1024x768)
-   Bit 3	1024x768 @ 60 Hz
-   Bit 2	1024x768 @ 72 Hz
-   Bit 1	1024x768 @ 75 Hz
-   Bit 0	1280x1024 @ 75 Hz */
-estbl_timing2:	.byte	ESTABLISHED_TIMING2_BITS
-
-/* Bit 7	1152x870 @ 75 Hz (Apple Macintosh II)
-   Bits 6-0 	Other manufacturer-specific display mod */
-estbl_timing3:	.byte	ESTABLISHED_TIMING3_BITS
-
-/* Standard timing */
-/* X resolution, less 31, divided by 8 (256-2288 pixels) */
-std_xres:	.byte	(XPIX/8)-31
-/* Y resolution, X:Y pixel ratio
-   Bits 7-6	X:Y pixel ratio: 00=16:10; 01=4:3; 10=5:4; 11=16:9.
-   Bits 5-0	Vertical frequency, less 60 (60-123 Hz) */
-std_vres:	.byte	(XY_RATIO<<6)+VFREQ-60
-		.fill	7,2,0x0101	/* Unused */
-
-descriptor1:
-/* Pixel clock in 10 kHz units. (0.-655.35 MHz, little-endian) */
-clock:		.hword	CLOCK/10
-
-/* Horizontal active pixels 8 lsbits (0-4095) */
-x_act_lsb:	.byte	XPIX&0xff
-/* Horizontal blanking pixels 8 lsbits (0-4095)
-   End of active to start of next active. */
-x_blk_lsb:	.byte	XBLANK&0xff
-/* Bits 7-4 	Horizontal active pixels 4 msbits
-   Bits 3-0	Horizontal blanking pixels 4 msbits */
-x_msbs:		.byte	msbs2(XPIX,XBLANK)
-
-/* Vertical active lines 8 lsbits (0-4095) */
-y_act_lsb:	.byte	YPIX&0xff
-/* Vertical blanking lines 8 lsbits (0-4095) */
-y_blk_lsb:	.byte	YBLANK&0xff
-/* Bits 7-4 	Vertical active lines 4 msbits
-   Bits 3-0 	Vertical blanking lines 4 msbits */
-y_msbs:		.byte	msbs2(YPIX,YBLANK)
-
-/* Horizontal sync offset pixels 8 lsbits (0-1023) From blanking start */
-x_snc_off_lsb:	.byte	XOFFSET&0xff
-/* Horizontal sync pulse width pixels 8 lsbits (0-1023) */
-x_snc_pls_lsb:	.byte	XPULSE&0xff
-/* Bits 7-4 	Vertical sync offset lines 4 lsbits (0-63)
-   Bits 3-0 	Vertical sync pulse width lines 4 lsbits (0-63) */
-y_snc_lsb:	.byte	lsbs2(YOFFSET, YPULSE)
-/* Bits 7-6 	Horizontal sync offset pixels 2 msbits
-   Bits 5-4 	Horizontal sync pulse width pixels 2 msbits
-   Bits 3-2 	Vertical sync offset lines 2 msbits
-   Bits 1-0 	Vertical sync pulse width lines 2 msbits */
-xy_snc_msbs:	.byte	msbs4(XOFFSET,XPULSE,YOFFSET,YPULSE)
-
-/* Horizontal display size, mm, 8 lsbits (0-4095 mm, 161 in) */
-x_dsp_size:	.byte	xsize&0xff
-
-/* Vertical display size, mm, 8 lsbits (0-4095 mm, 161 in) */
-y_dsp_size:	.byte	ysize&0xff
-
-/* Bits 7-4 	Horizontal display size, mm, 4 msbits
-   Bits 3-0 	Vertical display size, mm, 4 msbits */
-dsp_size_mbsb:	.byte	msbs2(xsize,ysize)
-
-/* Horizontal border pixels (each side; total is twice this) */
-x_border:	.byte	0
-/* Vertical border lines (each side; total is twice this) */
-y_border:	.byte	0
-
-/* Bit 7 	Interlaced
-   Bits 6-5 	Stereo mode: 00=No stereo; other values depend on bit 0:
-   Bit 0=0: 01=Field sequential, sync=1 during right; 10=similar,
-     sync=1 during left; 11=4-way interleaved stereo
-   Bit 0=1 2-way interleaved stereo: 01=Right image on even lines;
-     10=Left image on even lines; 11=side-by-side
-   Bits 4-3 	Sync type: 00=Analog composite; 01=Bipolar analog composite;
-     10=Digital composite (on HSync); 11=Digital separate
-   Bit 2 	If digital separate: Vertical sync polarity (1=positive)
-   Other types: VSync serrated (HSync during VSync)
-   Bit 1 	If analog sync: Sync on all 3 RGB lines (else green only)
-   Digital: HSync polarity (1=positive)
-   Bit 0 	2-way line-interleaved stereo, if bits 4-3 are not 00. */
-features:	.byte	0x18+(VSYNC_POL<<2)+(HSYNC_POL<<1)
-
-descriptor2:	.byte	0,0	/* Not a detailed timing descriptor */
-		.byte	0	/* Must be zero */
-		.byte	0xff	/* Descriptor is monitor serial number (text) */
-		.byte	0	/* Must be zero */
-start1:		.ascii	"Linux #0"
-end1:		.byte	0x0a	/* End marker */
-		.fill	12-(end1-start1), 1, 0x20 /* Padded spaces */
-descriptor3:	.byte	0,0	/* Not a detailed timing descriptor */
-		.byte	0	/* Must be zero */
-		.byte	0xfd	/* Descriptor is monitor range limits */
-		.byte	0	/* Must be zero */
-start2:		.byte	VFREQ-1	/* Minimum vertical field rate (1-255 Hz) */
-		.byte	VFREQ+1	/* Maximum vertical field rate (1-255 Hz) */
-		.byte	(CLOCK/(XPIX+XBLANK))-1 /* Minimum horizontal line rate
-						    (1-255 kHz) */
-		.byte	(CLOCK/(XPIX+XBLANK))+1 /* Maximum horizontal line rate
-						    (1-255 kHz) */
-		.byte	(CLOCK/10000)+1	/* Maximum pixel clock rate, rounded up
-					   to 10 MHz multiple (10-2550 MHz) */
-		.byte	0	/* No extended timing information type */
-end2:		.byte	0x0a	/* End marker */
-		.fill	12-(end2-start2), 1, 0x20 /* Padded spaces */
-descriptor4:	.byte	0,0	/* Not a detailed timing descriptor */
-		.byte	0	/* Must be zero */
-		.byte	0xfc	/* Descriptor is text */
-		.byte	0	/* Must be zero */
-start3:		.ascii	TIMING_NAME
-end3:		.byte	0x0a	/* End marker */
-		.fill	12-(end3-start3), 1, 0x20 /* Padded spaces */
-extensions:	.byte	0	/* Number of extensions to follow */
-checksum:	.byte	CRC	/* Sum of all bytes must be 0 */
diff --git a/tools/edid/hex b/tools/edid/hex
deleted file mode 100644
index 8873ebb618af..000000000000
--- a/tools/edid/hex
+++ /dev/null
@@ -1 +0,0 @@
-"\t" 8/1 "0x%02x, " "\n"
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index 318e2dad27e0..ae57bf69ad4a 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -76,6 +76,12 @@ enum {
 	DNS
 };
 
+enum {
+	IPV4 = 1,
+	IPV6,
+	IP_TYPE_MAX
+};
+
 static int in_hand_shake;
 
 static char *os_name = "";
@@ -102,6 +108,11 @@ static struct utsname uts_buf;
 
 #define MAX_FILE_NAME 100
 #define ENTRIES_PER_BLOCK 50
+/*
+ * Change this entry if the number of addresses increases in future
+ */
+#define MAX_IP_ENTRIES 64
+#define OUTSTR_BUF_SIZE ((INET6_ADDRSTRLEN + 1) * MAX_IP_ENTRIES)
 
 struct kvp_record {
 	char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
@@ -1171,6 +1182,18 @@ static int process_ip_string(FILE *f, char *ip_string, int type)
 	return 0;
 }
 
+int ip_version_check(const char *input_addr)
+{
+	struct in6_addr addr;
+
+	if (inet_pton(AF_INET, input_addr, &addr))
+		return IPV4;
+	else if (inet_pton(AF_INET6, input_addr, &addr))
+		return IPV6;
+
+	return -EINVAL;
+}
+
 /*
  * Only IPv4 subnet strings needs to be converted to plen
  * For IPv6 the subnet is already privided in plen format
@@ -1197,14 +1220,75 @@ static int kvp_subnet_to_plen(char *subnet_addr_str)
 	return plen;
 }
 
+static int process_dns_gateway_nm(FILE *f, char *ip_string, int type,
+				  int ip_sec)
+{
+	char addr[INET6_ADDRSTRLEN], *output_str;
+	int ip_offset = 0, error = 0, ip_ver;
+	char *param_name;
+
+	if (type == DNS)
+		param_name = "dns";
+	else if (type == GATEWAY)
+		param_name = "gateway";
+	else
+		return -EINVAL;
+
+	output_str = (char *)calloc(OUTSTR_BUF_SIZE, sizeof(char));
+	if (!output_str)
+		return -ENOMEM;
+
+	while (1) {
+		memset(addr, 0, sizeof(addr));
+
+		if (!parse_ip_val_buffer(ip_string, &ip_offset, addr,
+					 (MAX_IP_ADDR_SIZE * 2)))
+			break;
+
+		ip_ver = ip_version_check(addr);
+		if (ip_ver < 0)
+			continue;
+
+		if ((ip_ver == IPV4 && ip_sec == IPV4) ||
+		    (ip_ver == IPV6 && ip_sec == IPV6)) {
+			/*
+			 * do a bound check to avoid out-of bound writes
+			 */
+			if ((OUTSTR_BUF_SIZE - strlen(output_str)) >
+			    (strlen(addr) + 1)) {
+				strncat(output_str, addr,
+					OUTSTR_BUF_SIZE -
+					strlen(output_str) - 1);
+				strncat(output_str, ",",
+					OUTSTR_BUF_SIZE -
+					strlen(output_str) - 1);
+			}
+		} else {
+			continue;
+		}
+	}
+
+	if (strlen(output_str)) {
+		/*
+		 * This is to get rid of that extra comma character
+		 * in the end of the string
+		 */
+		output_str[strlen(output_str) - 1] = '\0';
+		error = fprintf(f, "%s=%s\n", param_name, output_str);
+	}
+
+	free(output_str);
+	return error;
+}
+
 static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet,
-				int is_ipv6)
+				int ip_sec)
 {
 	char addr[INET6_ADDRSTRLEN];
 	char subnet_addr[INET6_ADDRSTRLEN];
-	int error, i = 0;
+	int error = 0, i = 0;
 	int ip_offset = 0, subnet_offset = 0;
-	int plen;
+	int plen, ip_ver;
 
 	memset(addr, 0, sizeof(addr));
 	memset(subnet_addr, 0, sizeof(subnet_addr));
@@ -1216,10 +1300,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet,
 						       subnet_addr,
 						       (MAX_IP_ADDR_SIZE *
 							2))) {
-		if (!is_ipv6)
+		ip_ver = ip_version_check(addr);
+		if (ip_ver < 0)
+			continue;
+
+		if (ip_ver == IPV4 && ip_sec == IPV4)
 			plen = kvp_subnet_to_plen((char *)subnet_addr);
-		else
+		else if (ip_ver == IPV6 && ip_sec == IPV6)
 			plen = atoi(subnet_addr);
+		else
+			continue;
 
 		if (plen < 0)
 			return plen;
@@ -1233,17 +1323,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet,
 		memset(subnet_addr, 0, sizeof(subnet_addr));
 	}
 
-	return 0;
+	return error;
 }
 
 static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val)
 {
-	int error = 0;
+	int error = 0, ip_ver;
 	char if_filename[PATH_MAX];
 	char nm_filename[PATH_MAX];
 	FILE *ifcfg_file, *nmfile;
 	char cmd[PATH_MAX];
-	int is_ipv6 = 0;
 	char *mac_addr;
 	int str_len;
 
@@ -1421,52 +1510,94 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val)
 	if (error)
 		goto setval_error;
 
-	if (new_val->addr_family & ADDR_FAMILY_IPV6) {
-		error = fprintf(nmfile, "\n[ipv6]\n");
-		if (error < 0)
-			goto setval_error;
-		is_ipv6 = 1;
-	} else {
-		error = fprintf(nmfile, "\n[ipv4]\n");
-		if (error < 0)
-			goto setval_error;
-	}
-
 	/*
 	 * Now we populate the keyfile format
+	 *
+	 * The keyfile format expects the IPv6 and IPv4 configuration in
+	 * different sections. Therefore we iterate through the list twice,
+	 * once to populate the IPv4 section and the next time for IPv6
 	 */
+	ip_ver = IPV4;
+	do {
+		if (ip_ver == IPV4) {
+			error = fprintf(nmfile, "\n[ipv4]\n");
+			if (error < 0)
+				goto setval_error;
+		} else {
+			error = fprintf(nmfile, "\n[ipv6]\n");
+			if (error < 0)
+				goto setval_error;
+		}
 
-	if (new_val->dhcp_enabled) {
-		error = kvp_write_file(nmfile, "method", "", "auto");
-		if (error < 0)
-			goto setval_error;
-	} else {
-		error = kvp_write_file(nmfile, "method", "", "manual");
+		/*
+		 * Write the configuration for ipaddress, netmask, gateway and
+		 * name services
+		 */
+		error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr,
+					     (char *)new_val->sub_net,
+					     ip_ver);
 		if (error < 0)
 			goto setval_error;
-	}
 
-	/*
-	 * Write the configuration for ipaddress, netmask, gateway and
-	 * name services
-	 */
-	error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr,
-				     (char *)new_val->sub_net, is_ipv6);
-	if (error < 0)
-		goto setval_error;
+		/*
+		 * As dhcp_enabled is only valid for ipv4, we do not set dhcp
+		 * methods for ipv6 based on dhcp_enabled flag.
+		 *
+		 * For ipv4, set method to manual only when dhcp_enabled is
+		 * false and specific ipv4 addresses are configured. If neither
+		 * dhcp_enabled is true and no ipv4 addresses are configured,
+		 * set method to 'disabled'.
+		 *
+		 * For ipv6, set method to manual when we configure ipv6
+		 * addresses. Otherwise set method to 'auto' so that SLAAC from
+		 * RA may be used.
+		 */
+		if (ip_ver == IPV4) {
+			if (new_val->dhcp_enabled) {
+				error = kvp_write_file(nmfile, "method", "",
+						       "auto");
+				if (error < 0)
+					goto setval_error;
+			} else if (error) {
+				error = kvp_write_file(nmfile, "method", "",
+						       "manual");
+				if (error < 0)
+					goto setval_error;
+			} else {
+				error = kvp_write_file(nmfile, "method", "",
+						       "disabled");
+				if (error < 0)
+					goto setval_error;
+			}
+		} else if (ip_ver == IPV6) {
+			if (error) {
+				error = kvp_write_file(nmfile, "method", "",
+						       "manual");
+				if (error < 0)
+					goto setval_error;
+			} else {
+				error = kvp_write_file(nmfile, "method", "",
+						       "auto");
+				if (error < 0)
+					goto setval_error;
+			}
+		}
 
-	/* we do not want ipv4 addresses in ipv6 section and vice versa */
-	if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) {
-		error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way);
+		error = process_dns_gateway_nm(nmfile,
+					       (char *)new_val->gate_way,
+					       GATEWAY, ip_ver);
 		if (error < 0)
 			goto setval_error;
-	}
 
-	if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) {
-		error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr);
+		error = process_dns_gateway_nm(nmfile,
+					       (char *)new_val->dns_addr, DNS,
+					       ip_ver);
 		if (error < 0)
 			goto setval_error;
-	}
+
+		ip_ver++;
+	} while (ip_ver < IP_TYPE_MAX);
+
 	fclose(nmfile);
 	fclose(ifcfg_file);
 
diff --git a/tools/iio/iio_utils.c b/tools/iio/iio_utils.c
index 6a00a6eecaef..c5c5082cb24e 100644
--- a/tools/iio/iio_utils.c
+++ b/tools/iio/iio_utils.c
@@ -376,7 +376,7 @@ int build_channel_array(const char *device_dir, int buffer_idx,
 		goto error_close_dir;
 	}
 
-	seekdir(dp, 0);
+	rewinddir(dp);
 	while (ent = readdir(dp), ent) {
 		if (strcmp(ent->d_name + strlen(ent->d_name) - strlen("_en"),
 			   "_en") == 0) {
diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h
index 03f721a8a2b1..54ccccf96e21 100644
--- a/tools/include/asm-generic/bitops/__fls.h
+++ b/tools/include/asm-generic/bitops/__fls.h
@@ -5,12 +5,12 @@
 #include <asm/types.h>
 
 /**
- * __fls - find last (most-significant) set bit in a long word
+ * generic___fls - find last (most-significant) set bit in a long word
  * @word: the word to search
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long generic___fls(unsigned long word)
 {
 	int num = BITS_PER_LONG - 1;
 
@@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word)
 	return num;
 }
 
+#ifndef __HAVE_ARCH___FLS
+#define __fls(word) generic___fls(word)
+#endif
+
 #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */
diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h
index b168bb10e1be..26f3ce1dd6e4 100644
--- a/tools/include/asm-generic/bitops/fls.h
+++ b/tools/include/asm-generic/bitops/fls.h
@@ -3,14 +3,14 @@
 #define _ASM_GENERIC_BITOPS_FLS_H_
 
 /**
- * fls - find last (most-significant) bit set
+ * generic_fls - find last (most-significant) bit set
  * @x: the word to search
  *
  * This is defined the same way as ffs.
  * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
  */
 
-static __always_inline int fls(unsigned int x)
+static __always_inline int generic_fls(unsigned int x)
 {
 	int r = 32;
 
@@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x)
 	return r;
 }
 
+#ifndef __HAVE_ARCH_FLS
+#define fls(x) generic_fls(x)
+#endif
+
 #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */
diff --git a/tools/include/linux/align.h b/tools/include/linux/align.h
new file mode 100644
index 000000000000..14e34ace80dd
--- /dev/null
+++ b/tools/include/linux/align.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _TOOLS_LINUX_ALIGN_H
+#define _TOOLS_LINUX_ALIGN_H
+
+#include <uapi/linux/const.h>
+
+#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
+#define ALIGN_DOWN(x, a)	__ALIGN_KERNEL((x) - ((a) - 1), (a))
+#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)
+
+#endif /* _TOOLS_LINUX_ALIGN_H */
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index f3566ea0f932..210c13b1b857 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -3,6 +3,7 @@
 #define _TOOLS_LINUX_BITMAP_H
 
 #include <string.h>
+#include <linux/align.h>
 #include <linux/bitops.h>
 #include <linux/find.h>
 #include <stdlib.h>
@@ -25,13 +26,14 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
+#define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
+
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = 0UL;
 	else {
-		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-		memset(dst, 0, len);
+		memset(dst, 0, bitmap_size(nbits));
 	}
 }
 
@@ -83,7 +85,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
  */
 static inline unsigned long *bitmap_zalloc(int nbits)
 {
-	return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
+	return calloc(1, bitmap_size(nbits));
 }
 
 /*
@@ -126,7 +128,6 @@ static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1,
 #define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long))
 #endif
 #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)
-#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
 
 static inline bool bitmap_equal(const unsigned long *src1,
 				const unsigned long *src2, unsigned int nbits)
diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h
index f18683b95ea6..272f15d0e434 100644
--- a/tools/include/linux/bitops.h
+++ b/tools/include/linux/bitops.h
@@ -20,6 +20,8 @@
 #define BITS_TO_U32(nr)		DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
 #define BITS_TO_BYTES(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(char))
 
+#define BYTES_TO_BITS(nb)	((nb) * BITS_PER_BYTE)
+
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
@@ -87,4 +89,15 @@ static inline __u32 rol32(__u32 word, unsigned int shift)
 	return (word << shift) | (word >> ((-shift) & 31));
 }
 
+/**
+ * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
+ * @value: value to sign extend
+ * @index: 0 based bit index (0<=index<64) to sign bit
+ */
+static __always_inline __s64 sign_extend64(__u64 value, int index)
+{
+	__u8 shift = 63 - index;
+	return (__s64)(value << shift) >> shift;
+}
+
 #endif
diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h
index 72535f00572f..72ea363d434d 100644
--- a/tools/include/linux/btf_ids.h
+++ b/tools/include/linux/btf_ids.h
@@ -3,6 +3,8 @@
 #ifndef _LINUX_BTF_IDS_H
 #define _LINUX_BTF_IDS_H
 
+#include <linux/types.h> /* for u32 */
+
 struct btf_id_set {
 	u32 cnt;
 	u32 ids[];
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 7b65566f3e42..8a63a9913495 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -58,6 +58,10 @@
 #define noinline
 #endif
 
+#ifndef __nocf_check
+#define __nocf_check __attribute__((nocf_check))
+#endif
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #ifndef __same_type
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index 736bdeccdfe4..65aa8ce142e5 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -111,6 +111,24 @@
 		.off   = 0,					\
 		.imm   = IMM })
 
+/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */
+
+#define BPF_MOVSX64_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+#define BPF_MOVSX32_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Short form of mov based on type,  BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */
 
 #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM)			\
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 4b0673bf52c2..07cfad817d53 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -8,6 +8,7 @@
 #include <linux/build_bug.h>
 #include <linux/compiler.h>
 #include <linux/math.h>
+#include <linux/panic.h>
 #include <endian.h>
 #include <byteswap.h>
 
diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h
index f3c82ab5b14c..dc0fc7125bc3 100644
--- a/tools/include/linux/mm.h
+++ b/tools/include/linux/mm.h
@@ -2,8 +2,8 @@
 #ifndef _TOOLS_LINUX_MM_H
 #define _TOOLS_LINUX_MM_H
 
+#include <linux/align.h>
 #include <linux/mmzone.h>
-#include <uapi/linux/const.h>
 
 #define PAGE_SHIFT		12
 #define PAGE_SIZE		(_AC(1, UL) << PAGE_SHIFT)
@@ -11,9 +11,6 @@
 
 #define PHYS_ADDR_MAX	(~(phys_addr_t)0)
 
-#define ALIGN(x, a)			__ALIGN_KERNEL((x), (a))
-#define ALIGN_DOWN(x, a)		__ALIGN_KERNEL((x) - ((a) - 1), (a))
-
 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
 
 #define __va(x) ((void *)((unsigned long)(x)))
@@ -37,4 +34,9 @@ static inline void totalram_pages_add(long count)
 {
 }
 
+static inline int early_pfn_to_nid(unsigned long pfn)
+{
+	return 0;
+}
+
 #endif
diff --git a/tools/include/linux/panic.h b/tools/include/linux/panic.h
new file mode 100644
index 000000000000..9c8f17a41ce8
--- /dev/null
+++ b/tools/include/linux/panic.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_PANIC_H
+#define _TOOLS_LINUX_PANIC_H
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static inline void panic(const char *fmt, ...)
+{
+	va_list argp;
+
+	va_start(argp, fmt);
+	vfprintf(stderr, fmt, argp);
+	va_end(argp);
+	exit(-1);
+}
+
+#endif
diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h
index bacfd35c5156..5be9d3c7435a 100644
--- a/tools/include/nolibc/stdlib.h
+++ b/tools/include/nolibc/stdlib.h
@@ -185,7 +185,7 @@ void *realloc(void *old_ptr, size_t new_size)
 	if (__builtin_expect(!ret, 0))
 		return NULL;
 
-	memcpy(ret, heap->user_p, heap->len);
+	memcpy(ret, heap->user_p, user_p_len);
 	munmap(heap, heap->len);
 	return ret;
 }
diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h
index a01c69dd495f..f9ab28421e6d 100644
--- a/tools/include/nolibc/string.h
+++ b/tools/include/nolibc/string.h
@@ -123,7 +123,7 @@ char *strcpy(char *dst, const char *src)
  * thus itself, hence the asm() statement below that's meant to disable this
  * confusing practice.
  */
-static __attribute__((unused))
+__attribute__((weak,unused,section(".text.nolibc_strlen")))
 size_t strlen(const char *str)
 {
 	size_t len;
@@ -187,22 +187,26 @@ char *strndup(const char *str, size_t maxlen)
 static __attribute__((unused))
 size_t strlcat(char *dst, const char *src, size_t size)
 {
-	size_t len;
-	char c;
-
-	for (len = 0; dst[len];	len++)
-		;
-
-	for (;;) {
-		c = *src;
-		if (len < size)
-			dst[len] = c;
-		if (!c)
+	size_t len = strnlen(dst, size);
+
+	/*
+	 * We want len < size-1. But as size is unsigned and can wrap
+	 * around, we use len + 1 instead.
+	 */
+	while (len + 1 < size) {
+		dst[len] = *src;
+		if (*src == '\0')
 			break;
 		len++;
 		src++;
 	}
 
+	if (len < size)
+		dst[len] = '\0';
+
+	while (*src++)
+		len++;
+
 	return len;
 }
 
@@ -210,16 +214,18 @@ static __attribute__((unused))
 size_t strlcpy(char *dst, const char *src, size_t size)
 {
 	size_t len;
-	char c;
 
-	for (len = 0;;) {
-		c = src[len];
-		if (len < size)
-			dst[len] = c;
-		if (!c)
-			break;
-		len++;
+	for (len = 0; len < size; len++) {
+		dst[len] = src[len];
+		if (!dst[len])
+			return len;
 	}
+	if (size)
+		dst[size-1] = '\0';
+
+	while (src[len])
+		len++;
+
 	return len;
 }
 
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index dda9dffd1d74..7b82bc3cf107 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -22,6 +22,7 @@
 #include <linux/stat.h>  /* for statx() */
 #include <linux/prctl.h>
 #include <linux/resource.h>
+#include <linux/utsname.h>
 
 #include "arch.h"
 #include "errno.h"
@@ -1140,6 +1141,32 @@ int umount2(const char *path, int flags)
 
 
 /*
+ * int uname(struct utsname *buf);
+ */
+
+struct utsname {
+	char sysname[65];
+	char nodename[65];
+	char release[65];
+	char version[65];
+	char machine[65];
+	char domainname[65];
+};
+
+static __attribute__((unused))
+int sys_uname(struct utsname *buf)
+{
+	return my_syscall1(__NR_uname, buf);
+}
+
+static __attribute__((unused))
+int uname(struct utsname *buf)
+{
+	return __sysret(sys_uname(buf));
+}
+
+
+/*
  * int unlink(const char *path);
  */
 
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h
index fd4f9574d177..2ee338860b7e 100644
--- a/tools/include/uapi/drm/i915_drm.h
+++ b/tools/include/uapi/drm/i915_drm.h
@@ -3013,6 +3013,7 @@ struct drm_i915_query_item {
 	 *  - %DRM_I915_QUERY_MEMORY_REGIONS (see struct drm_i915_query_memory_regions)
 	 *  - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`)
 	 *  - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct drm_i915_query_topology_info)
+	 *  - %DRM_I915_QUERY_GUC_SUBMISSION_VERSION (see struct drm_i915_query_guc_submission_version)
 	 */
 	__u64 query_id;
 #define DRM_I915_QUERY_TOPOLOGY_INFO		1
@@ -3021,6 +3022,7 @@ struct drm_i915_query_item {
 #define DRM_I915_QUERY_MEMORY_REGIONS		4
 #define DRM_I915_QUERY_HWCONFIG_BLOB		5
 #define DRM_I915_QUERY_GEOMETRY_SUBSLICES	6
+#define DRM_I915_QUERY_GUC_SUBMISSION_VERSION	7
 /* Must be kept compact -- no holes and well documented */
 
 	/**
@@ -3567,6 +3569,20 @@ struct drm_i915_query_memory_regions {
 };
 
 /**
+ * struct drm_i915_query_guc_submission_version - query GuC submission interface version
+ */
+struct drm_i915_query_guc_submission_version {
+	/** @branch: Firmware branch version. */
+	__u32 branch;
+	/** @major: Firmware major version. */
+	__u32 major;
+	/** @minor: Firmware minor version. */
+	__u32 minor;
+	/** @patch: Firmware patch version. */
+	__u32 patch;
+};
+
+/**
  * DOC: GuC HWCONFIG blob uAPI
  *
  * The GuC produces a blob with information about the current device.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3c42b9f1bada..90706a47f6ff 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1115,6 +1115,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_GETSOCKNAME,
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
+	BPF_TRACE_KPROBE_SESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1135,6 +1136,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -1662,8 +1664,10 @@ union bpf_attr {
 	} query;
 
 	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
-		__u64 name;
-		__u32 prog_fd;
+		__u64		name;
+		__u32		prog_fd;
+		__u32		:32;
+		__aligned_u64	cookie;
 	} raw_tracepoint;
 
 	struct { /* anonymous struct for BPF_BTF_LOAD */
@@ -3392,6 +3396,10 @@ union bpf_attr {
  *			for the nexthop. If the src addr cannot be derived,
  *			**BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
  *			case, *params*->dmac and *params*->smac are not set either.
+ *		**BPF_FIB_LOOKUP_MARK**
+ *			Use the mark present in *params*->mark for the fib lookup.
+ *			This option should not be used with BPF_FIB_LOOKUP_DIRECT,
+ *			as it only has meaning for full lookups.
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -5020,7 +5028,7 @@ union bpf_attr {
  *		bytes will be copied to *dst*
  *	Return
  *		The **hash_algo** is returned on success,
- *		**-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
+ *		**-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if
  *		invalid arguments are passed.
  *
  * struct socket *bpf_sock_from_file(struct file *file)
@@ -5506,7 +5514,7 @@ union bpf_attr {
  *		bytes will be copied to *dst*
  *	Return
  *		The **hash_algo** is returned on success,
- *		**-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if
+ *		**-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
  *		invalid arguments are passed.
  *
  * void *bpf_kptr_xchg(void *map_value, void *ptr)
@@ -6718,6 +6726,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));
 
@@ -6936,6 +6948,8 @@ enum {
 					 * socket transition to LISTEN state.
 					 */
 	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
+					 * Arg1: measured RTT input (mrtt)
+					 * Arg2: updated srtt
 					 */
 	BPF_SOCK_OPS_PARSE_HDR_OPT_CB,	/* Parse the header option.
 					 * It will be called to handle
@@ -7118,6 +7132,7 @@ enum {
 	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 	BPF_FIB_LOOKUP_TBID    = (1U << 3),
 	BPF_FIB_LOOKUP_SRC     = (1U << 4),
+	BPF_FIB_LOOKUP_MARK    = (1U << 5),
 };
 
 enum {
@@ -7150,7 +7165,7 @@ struct bpf_fib_lookup {
 
 		/* output: MTU value */
 		__u16	mtu_result;
-	};
+	} __attribute__((packed, aligned(2)));
 	/* input: L3 device index for lookup
 	 * output: device index from FIB lookup
 	 */
@@ -7195,8 +7210,19 @@ struct bpf_fib_lookup {
 		__u32	tbid;
 	};
 
-	__u8	smac[6];     /* ETH_ALEN */
-	__u8	dmac[6];     /* ETH_ALEN */
+	union {
+		/* input */
+		struct {
+			__u32	mark;   /* policy routing */
+			/* 2 4-byte holes for input */
+		};
+
+		/* output: source and dest mac */
+		struct {
+			__u8	smac[6];	/* ETH_ALEN */
+			__u8	dmac[6];	/* ETH_ALEN */
+		};
+	};
 };
 
 struct bpf_redir_neigh {
@@ -7283,6 +7309,10 @@ struct bpf_timer {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
 
+struct bpf_wq {
+	__u64 __opaque[2];
+} __attribute__((aligned(8)));
+
 struct bpf_dynptr {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h
deleted file mode 100644
index 47afae3895ec..000000000000
--- a/tools/include/uapi/linux/ethtool.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * ethtool.h: Defines for Linux ethtool.
- *
- * Copyright (C) 1998 David S. Miller (davem@redhat.com)
- * Copyright 2001 Jeff Garzik <jgarzik@pobox.com>
- * Portions Copyright 2001 Sun Microsystems (thockin@sun.com)
- * Portions Copyright 2002 Intel (eli.kupermann@intel.com,
- *                                christopher.leech@intel.com,
- *                                scott.feldman@intel.com)
- * Portions Copyright (C) Sun Microsystems 2008
- */
-
-#ifndef _UAPI_LINUX_ETHTOOL_H
-#define _UAPI_LINUX_ETHTOOL_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/if_ether.h>
-
-#define ETHTOOL_GCHANNELS       0x0000003c /* Get no of channels */
-
-/**
- * struct ethtool_channels - configuring number of network channel
- * @cmd: ETHTOOL_{G,S}CHANNELS
- * @max_rx: Read only. Maximum number of receive channel the driver support.
- * @max_tx: Read only. Maximum number of transmit channel the driver support.
- * @max_other: Read only. Maximum number of other channel the driver support.
- * @max_combined: Read only. Maximum number of combined channel the driver
- *	support. Set of queues RX, TX or other.
- * @rx_count: Valid values are in the range 1 to the max_rx.
- * @tx_count: Valid values are in the range 1 to the max_tx.
- * @other_count: Valid values are in the range 1 to the max_other.
- * @combined_count: Valid values are in the range 1 to the max_combined.
- *
- * This can be used to configure RX, TX and other channels.
- */
-
-struct ethtool_channels {
-	__u32	cmd;
-	__u32	max_rx;
-	__u32	max_tx;
-	__u32	max_other;
-	__u32	max_combined;
-	__u32	rx_count;
-	__u32	tx_count;
-	__u32	other_count;
-	__u32	combined_count;
-};
-
-#define ETHTOOL_FWVERS_LEN	32
-#define ETHTOOL_BUSINFO_LEN	32
-#define ETHTOOL_EROMVERS_LEN	32
-
-/**
- * struct ethtool_drvinfo - general driver and device information
- * @cmd: Command number = %ETHTOOL_GDRVINFO
- * @driver: Driver short name.  This should normally match the name
- *	in its bus driver structure (e.g. pci_driver::name).  Must
- *	not be an empty string.
- * @version: Driver version string; may be an empty string
- * @fw_version: Firmware version string; may be an empty string
- * @erom_version: Expansion ROM version string; may be an empty string
- * @bus_info: Device bus address.  This should match the dev_name()
- *	string for the underlying bus device, if there is one.  May be
- *	an empty string.
- * @reserved2: Reserved for future use; see the note on reserved space.
- * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and
- *	%ETHTOOL_SPFLAGS commands; also the number of strings in the
- *	%ETH_SS_PRIV_FLAGS set
- * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS
- *	command; also the number of strings in the %ETH_SS_STATS set
- * @testinfo_len: Number of results returned by the %ETHTOOL_TEST
- *	command; also the number of strings in the %ETH_SS_TEST set
- * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM
- *	and %ETHTOOL_SEEPROM commands, in bytes
- * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS
- *	command, in bytes
- *
- * Users can use the %ETHTOOL_GSSET_INFO command to get the number of
- * strings in any string set (from Linux 2.6.34).
- *
- * Drivers should set at most @driver, @version, @fw_version and
- * @bus_info in their get_drvinfo() implementation.  The ethtool
- * core fills in the other fields using other driver operations.
- */
-struct ethtool_drvinfo {
-	__u32	cmd;
-	char	driver[32];
-	char	version[32];
-	char	fw_version[ETHTOOL_FWVERS_LEN];
-	char	bus_info[ETHTOOL_BUSINFO_LEN];
-	char	erom_version[ETHTOOL_EROMVERS_LEN];
-	char	reserved2[12];
-	__u32	n_priv_flags;
-	__u32	n_stats;
-	__u32	testinfo_len;
-	__u32	eedump_len;
-	__u32	regdump_len;
-};
-
-#define ETHTOOL_GDRVINFO	0x00000003
-
-#endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index 48ad69f7722e..45e4e64fd664 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -64,6 +64,24 @@ struct fstrim_range {
 	__u64 minlen;
 };
 
+/*
+ * We include a length field because some filesystems (vfat) have an identifier
+ * that we do want to expose as a UUID, but doesn't have the standard length.
+ *
+ * We use a fixed size buffer beacuse this interface will, by fiat, never
+ * support "UUIDs" longer than 16 bytes; we don't want to force all downstream
+ * users to have to deal with that.
+ */
+struct fsuuid2 {
+	__u8	len;
+	__u8	uuid[16];
+};
+
+struct fs_sysfs_path {
+	__u8			len;
+	__u8			name[128];
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1
@@ -215,6 +233,13 @@ struct fsxattr {
 #define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
 #define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
 #define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
+/* Returns the external filesystem UUID, the same one blkid returns */
+#define FS_IOC_GETFSUUID		_IOR(0x15, 0, struct fsuuid2)
+/*
+ * Returns the path component under /sys/fs/ that refers to this filesystem;
+ * also /sys/kernel/debug/ for filesystems with debugfs exports
+ */
+#define FS_IOC_GETFSSYSFSPATH		_IOR(0x15, 1, struct fs_sysfs_path)
 
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
@@ -301,9 +326,12 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* per-IO negation of O_APPEND */
+#define RWF_NOAPPEND	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_NOAPPEND)
 
 /* Pagemap ioctl */
 #define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index c3308536482b..2190adbe3002 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -16,6 +16,11 @@
 
 #define KVM_API_VERSION 12
 
+/*
+ * Backwards-compatible definitions.
+ */
+#define __KVM_HAVE_GUEST_DEBUG
+
 /* for KVM_SET_USER_MEMORY_REGION */
 struct kvm_userspace_memory_region {
 	__u32 slot;
@@ -85,43 +90,6 @@ struct kvm_pit_config {
 
 #define KVM_PIT_SPEAKER_DUMMY     1
 
-struct kvm_s390_skeys {
-	__u64 start_gfn;
-	__u64 count;
-	__u64 skeydata_addr;
-	__u32 flags;
-	__u32 reserved[9];
-};
-
-#define KVM_S390_CMMA_PEEK (1 << 0)
-
-/**
- * kvm_s390_cmma_log - Used for CMMA migration.
- *
- * Used both for input and output.
- *
- * @start_gfn: Guest page number to start from.
- * @count: Size of the result buffer.
- * @flags: Control operation mode via KVM_S390_CMMA_* flags
- * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
- *             pages are still remaining.
- * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
- *        in the PGSTE.
- * @values: Pointer to the values buffer.
- *
- * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
- */
-struct kvm_s390_cmma_log {
-	__u64 start_gfn;
-	__u32 count;
-	__u32 flags;
-	union {
-		__u64 remaining;
-		__u64 mask;
-	};
-	__u64 values;
-};
-
 struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
 #define KVM_EXIT_HYPERV_HCALL          2
@@ -315,11 +283,6 @@ struct kvm_run {
 			__u32 ipb;
 		} s390_sieic;
 		/* KVM_EXIT_S390_RESET */
-#define KVM_S390_RESET_POR       1
-#define KVM_S390_RESET_CLEAR     2
-#define KVM_S390_RESET_SUBSYSTEM 4
-#define KVM_S390_RESET_CPU_INIT  8
-#define KVM_S390_RESET_IPL       16
 		__u64 s390_reset_flags;
 		/* KVM_EXIT_S390_UCONTROL */
 		struct {
@@ -536,43 +499,6 @@ struct kvm_translation {
 	__u8  pad[5];
 };
 
-/* for KVM_S390_MEM_OP */
-struct kvm_s390_mem_op {
-	/* in */
-	__u64 gaddr;		/* the guest address */
-	__u64 flags;		/* flags */
-	__u32 size;		/* amount of bytes */
-	__u32 op;		/* type of operation */
-	__u64 buf;		/* buffer in userspace */
-	union {
-		struct {
-			__u8 ar;	/* the access register number */
-			__u8 key;	/* access key, ignored if flag unset */
-			__u8 pad1[6];	/* ignored */
-			__u64 old_addr;	/* ignored if cmpxchg flag unset */
-		};
-		__u32 sida_offset; /* offset into the sida */
-		__u8 reserved[32]; /* ignored */
-	};
-};
-/* types for kvm_s390_mem_op->op */
-#define KVM_S390_MEMOP_LOGICAL_READ	0
-#define KVM_S390_MEMOP_LOGICAL_WRITE	1
-#define KVM_S390_MEMOP_SIDA_READ	2
-#define KVM_S390_MEMOP_SIDA_WRITE	3
-#define KVM_S390_MEMOP_ABSOLUTE_READ	4
-#define KVM_S390_MEMOP_ABSOLUTE_WRITE	5
-#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG	6
-
-/* flags for kvm_s390_mem_op->flags */
-#define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
-#define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
-#define KVM_S390_MEMOP_F_SKEY_PROTECTION	(1ULL << 2)
-
-/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */
-#define KVM_S390_MEMOP_EXTENSION_CAP_BASE	(1 << 0)
-#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG	(1 << 1)
-
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
 	/* in */
@@ -637,124 +563,6 @@ struct kvm_mp_state {
 	__u32 mp_state;
 };
 
-struct kvm_s390_psw {
-	__u64 mask;
-	__u64 addr;
-};
-
-/* valid values for type in kvm_s390_interrupt */
-#define KVM_S390_SIGP_STOP		0xfffe0000u
-#define KVM_S390_PROGRAM_INT		0xfffe0001u
-#define KVM_S390_SIGP_SET_PREFIX	0xfffe0002u
-#define KVM_S390_RESTART		0xfffe0003u
-#define KVM_S390_INT_PFAULT_INIT	0xfffe0004u
-#define KVM_S390_INT_PFAULT_DONE	0xfffe0005u
-#define KVM_S390_MCHK			0xfffe1000u
-#define KVM_S390_INT_CLOCK_COMP		0xffff1004u
-#define KVM_S390_INT_CPU_TIMER		0xffff1005u
-#define KVM_S390_INT_VIRTIO		0xffff2603u
-#define KVM_S390_INT_SERVICE		0xffff2401u
-#define KVM_S390_INT_EMERGENCY		0xffff1201u
-#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
-/* Anything below 0xfffe0000u is taken by INT_IO */
-#define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
-	(((schid)) |			       \
-	 ((ssid) << 16) |		       \
-	 ((cssid) << 18) |		       \
-	 ((ai) << 26))
-#define KVM_S390_INT_IO_MIN		0x00000000u
-#define KVM_S390_INT_IO_MAX		0xfffdffffu
-#define KVM_S390_INT_IO_AI_MASK		0x04000000u
-
-
-struct kvm_s390_interrupt {
-	__u32 type;
-	__u32 parm;
-	__u64 parm64;
-};
-
-struct kvm_s390_io_info {
-	__u16 subchannel_id;
-	__u16 subchannel_nr;
-	__u32 io_int_parm;
-	__u32 io_int_word;
-};
-
-struct kvm_s390_ext_info {
-	__u32 ext_params;
-	__u32 pad;
-	__u64 ext_params2;
-};
-
-struct kvm_s390_pgm_info {
-	__u64 trans_exc_code;
-	__u64 mon_code;
-	__u64 per_address;
-	__u32 data_exc_code;
-	__u16 code;
-	__u16 mon_class_nr;
-	__u8 per_code;
-	__u8 per_atmid;
-	__u8 exc_access_id;
-	__u8 per_access_id;
-	__u8 op_access_id;
-#define KVM_S390_PGM_FLAGS_ILC_VALID	0x01
-#define KVM_S390_PGM_FLAGS_ILC_0	0x02
-#define KVM_S390_PGM_FLAGS_ILC_1	0x04
-#define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
-#define KVM_S390_PGM_FLAGS_NO_REWIND	0x08
-	__u8 flags;
-	__u8 pad[2];
-};
-
-struct kvm_s390_prefix_info {
-	__u32 address;
-};
-
-struct kvm_s390_extcall_info {
-	__u16 code;
-};
-
-struct kvm_s390_emerg_info {
-	__u16 code;
-};
-
-#define KVM_S390_STOP_FLAG_STORE_STATUS	0x01
-struct kvm_s390_stop_info {
-	__u32 flags;
-};
-
-struct kvm_s390_mchk_info {
-	__u64 cr14;
-	__u64 mcic;
-	__u64 failing_storage_address;
-	__u32 ext_damage_code;
-	__u32 pad;
-	__u8 fixed_logout[16];
-};
-
-struct kvm_s390_irq {
-	__u64 type;
-	union {
-		struct kvm_s390_io_info io;
-		struct kvm_s390_ext_info ext;
-		struct kvm_s390_pgm_info pgm;
-		struct kvm_s390_emerg_info emerg;
-		struct kvm_s390_extcall_info extcall;
-		struct kvm_s390_prefix_info prefix;
-		struct kvm_s390_stop_info stop;
-		struct kvm_s390_mchk_info mchk;
-		char reserved[64];
-	} u;
-};
-
-struct kvm_s390_irq_state {
-	__u64 buf;
-	__u32 flags;        /* will stay unused for compatibility reasons */
-	__u32 len;
-	__u32 reserved[4];  /* will stay unused for compatibility reasons */
-};
-
 /* for KVM_SET_GUEST_DEBUG */
 
 #define KVM_GUESTDBG_ENABLE		0x00000001
@@ -810,50 +618,6 @@ struct kvm_enable_cap {
 	__u8  pad[64];
 };
 
-/* for KVM_PPC_GET_PVINFO */
-
-#define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
-
-struct kvm_ppc_pvinfo {
-	/* out */
-	__u32 flags;
-	__u32 hcall[4];
-	__u8  pad[108];
-};
-
-/* for KVM_PPC_GET_SMMU_INFO */
-#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
-
-struct kvm_ppc_one_page_size {
-	__u32 page_shift;	/* Page shift (or 0) */
-	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
-};
-
-struct kvm_ppc_one_seg_page_size {
-	__u32 page_shift;	/* Base page shift of segment (or 0) */
-	__u32 slb_enc;		/* SLB encoding for BookS */
-	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
-};
-
-#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
-#define KVM_PPC_1T_SEGMENTS		0x00000002
-#define KVM_PPC_NO_HASH			0x00000004
-
-struct kvm_ppc_smmu_info {
-	__u64 flags;
-	__u32 slb_size;
-	__u16 data_keys;	/* # storage keys supported for data */
-	__u16 instr_keys;	/* # storage keys supported for instructions */
-	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
-};
-
-/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
-struct kvm_ppc_resize_hpt {
-	__u64 flags;
-	__u32 shift;
-	__u32 pad;
-};
-
 #define KVMIO 0xAE
 
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -923,9 +687,7 @@ struct kvm_ppc_resize_hpt {
 /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
 #define KVM_CAP_USER_NMI 22
-#ifdef __KVM_HAVE_GUEST_DEBUG
 #define KVM_CAP_SET_GUEST_DEBUG 23
-#endif
 #ifdef __KVM_HAVE_PIT
 #define KVM_CAP_REINJECT_CONTROL 24
 #endif
@@ -1156,8 +918,6 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_GUEST_MEMFD 234
 #define KVM_CAP_VM_TYPES 235
 
-#ifdef KVM_CAP_IRQ_ROUTING
-
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
 	__u32 pin;
@@ -1222,42 +982,6 @@ struct kvm_irq_routing {
 	struct kvm_irq_routing_entry entries[];
 };
 
-#endif
-
-#ifdef KVM_CAP_MCE
-/* x86 MCE */
-struct kvm_x86_mce {
-	__u64 status;
-	__u64 addr;
-	__u64 misc;
-	__u64 mcg_status;
-	__u8 bank;
-	__u8 pad1[7];
-	__u64 pad2[3];
-};
-#endif
-
-#ifdef KVM_CAP_XEN_HVM
-#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
-#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
-#define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
-#define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
-#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
-#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND		(1 << 5)
-#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG	(1 << 6)
-#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
-
-struct kvm_xen_hvm_config {
-	__u32 flags;
-	__u32 msr;
-	__u64 blob_addr_32;
-	__u64 blob_addr_64;
-	__u8 blob_size_32;
-	__u8 blob_size_64;
-	__u8 pad2[30];
-};
-#endif
-
 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
 /*
  * Available with KVM_CAP_IRQFD_RESAMPLE
@@ -1442,11 +1166,6 @@ struct kvm_vfio_spapr_tce {
 					 struct kvm_userspace_memory_region2)
 
 /* enable ucontrol for s390 */
-struct kvm_s390_ucas_mapping {
-	__u64 user_addr;
-	__u64 vcpu_addr;
-	__u64 length;
-};
 #define KVM_S390_UCAS_MAP        _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
 #define KVM_S390_UCAS_UNMAP      _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
 #define KVM_S390_VCPU_FAULT	 _IOW(KVMIO, 0x52, unsigned long)
@@ -1641,89 +1360,6 @@ struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)
 
-struct kvm_s390_pv_sec_parm {
-	__u64 origin;
-	__u64 length;
-};
-
-struct kvm_s390_pv_unp {
-	__u64 addr;
-	__u64 size;
-	__u64 tweak;
-};
-
-enum pv_cmd_dmp_id {
-	KVM_PV_DUMP_INIT,
-	KVM_PV_DUMP_CONFIG_STOR_STATE,
-	KVM_PV_DUMP_COMPLETE,
-	KVM_PV_DUMP_CPU,
-};
-
-struct kvm_s390_pv_dmp {
-	__u64 subcmd;
-	__u64 buff_addr;
-	__u64 buff_len;
-	__u64 gaddr;		/* For dump storage state */
-	__u64 reserved[4];
-};
-
-enum pv_cmd_info_id {
-	KVM_PV_INFO_VM,
-	KVM_PV_INFO_DUMP,
-};
-
-struct kvm_s390_pv_info_dump {
-	__u64 dump_cpu_buffer_len;
-	__u64 dump_config_mem_buffer_per_1m;
-	__u64 dump_config_finalize_len;
-};
-
-struct kvm_s390_pv_info_vm {
-	__u64 inst_calls_list[4];
-	__u64 max_cpus;
-	__u64 max_guests;
-	__u64 max_guest_addr;
-	__u64 feature_indication;
-};
-
-struct kvm_s390_pv_info_header {
-	__u32 id;
-	__u32 len_max;
-	__u32 len_written;
-	__u32 reserved;
-};
-
-struct kvm_s390_pv_info {
-	struct kvm_s390_pv_info_header header;
-	union {
-		struct kvm_s390_pv_info_dump dump;
-		struct kvm_s390_pv_info_vm vm;
-	};
-};
-
-enum pv_cmd_id {
-	KVM_PV_ENABLE,
-	KVM_PV_DISABLE,
-	KVM_PV_SET_SEC_PARMS,
-	KVM_PV_UNPACK,
-	KVM_PV_VERIFY,
-	KVM_PV_PREP_RESET,
-	KVM_PV_UNSHARE_ALL,
-	KVM_PV_INFO,
-	KVM_PV_DUMP,
-	KVM_PV_ASYNC_CLEANUP_PREPARE,
-	KVM_PV_ASYNC_CLEANUP_PERFORM,
-};
-
-struct kvm_pv_cmd {
-	__u32 cmd;	/* Command to be executed */
-	__u16 rc;	/* Ultravisor return code */
-	__u16 rrc;	/* Ultravisor return reason code */
-	__u64 data;	/* Data or address */
-	__u32 flags;    /* flags for future extensions. Must be 0 for now */
-	__u32 reserved[3];
-};
-
 /* Available with KVM_CAP_S390_PROTECTED */
 #define KVM_S390_PV_COMMAND		_IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
 
@@ -1737,58 +1373,6 @@ struct kvm_pv_cmd {
 #define KVM_XEN_HVM_GET_ATTR	_IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr)
 #define KVM_XEN_HVM_SET_ATTR	_IOW(KVMIO,  0xc9, struct kvm_xen_hvm_attr)
 
-struct kvm_xen_hvm_attr {
-	__u16 type;
-	__u16 pad[3];
-	union {
-		__u8 long_mode;
-		__u8 vector;
-		__u8 runstate_update_flag;
-		struct {
-			__u64 gfn;
-#define KVM_XEN_INVALID_GFN ((__u64)-1)
-		} shared_info;
-		struct {
-			__u32 send_port;
-			__u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
-			__u32 flags;
-#define KVM_XEN_EVTCHN_DEASSIGN		(1 << 0)
-#define KVM_XEN_EVTCHN_UPDATE		(1 << 1)
-#define KVM_XEN_EVTCHN_RESET		(1 << 2)
-			/*
-			 * Events sent by the guest are either looped back to
-			 * the guest itself (potentially on a different port#)
-			 * or signalled via an eventfd.
-			 */
-			union {
-				struct {
-					__u32 port;
-					__u32 vcpu;
-					__u32 priority;
-				} port;
-				struct {
-					__u32 port; /* Zero for eventfd */
-					__s32 fd;
-				} eventfd;
-				__u32 padding[4];
-			} deliver;
-		} evtchn;
-		__u32 xen_version;
-		__u64 pad[8];
-	} u;
-};
-
-
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
-#define KVM_XEN_ATTR_TYPE_LONG_MODE		0x0
-#define KVM_XEN_ATTR_TYPE_SHARED_INFO		0x1
-#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR		0x2
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
-#define KVM_XEN_ATTR_TYPE_EVTCHN		0x3
-#define KVM_XEN_ATTR_TYPE_XEN_VERSION		0x4
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
-#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG	0x5
-
 /* Per-vCPU Xen attributes */
 #define KVM_XEN_VCPU_GET_ATTR	_IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
 #define KVM_XEN_VCPU_SET_ATTR	_IOW(KVMIO,  0xcb, struct kvm_xen_vcpu_attr)
@@ -1799,242 +1383,6 @@ struct kvm_xen_hvm_attr {
 #define KVM_GET_SREGS2             _IOR(KVMIO,  0xcc, struct kvm_sregs2)
 #define KVM_SET_SREGS2             _IOW(KVMIO,  0xcd, struct kvm_sregs2)
 
-struct kvm_xen_vcpu_attr {
-	__u16 type;
-	__u16 pad[3];
-	union {
-		__u64 gpa;
-#define KVM_XEN_INVALID_GPA ((__u64)-1)
-		__u64 pad[8];
-		struct {
-			__u64 state;
-			__u64 state_entry_time;
-			__u64 time_running;
-			__u64 time_runnable;
-			__u64 time_blocked;
-			__u64 time_offline;
-		} runstate;
-		__u32 vcpu_id;
-		struct {
-			__u32 port;
-			__u32 priority;
-			__u64 expires_ns;
-		} timer;
-		__u8 vector;
-	} u;
-};
-
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
-#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO	0x0
-#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO	0x1
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR	0x2
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT	0x3
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA	0x4
-#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST	0x5
-/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
-#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID		0x6
-#define KVM_XEN_VCPU_ATTR_TYPE_TIMER		0x7
-#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR	0x8
-
-/* Secure Encrypted Virtualization command */
-enum sev_cmd_id {
-	/* Guest initialization commands */
-	KVM_SEV_INIT = 0,
-	KVM_SEV_ES_INIT,
-	/* Guest launch commands */
-	KVM_SEV_LAUNCH_START,
-	KVM_SEV_LAUNCH_UPDATE_DATA,
-	KVM_SEV_LAUNCH_UPDATE_VMSA,
-	KVM_SEV_LAUNCH_SECRET,
-	KVM_SEV_LAUNCH_MEASURE,
-	KVM_SEV_LAUNCH_FINISH,
-	/* Guest migration commands (outgoing) */
-	KVM_SEV_SEND_START,
-	KVM_SEV_SEND_UPDATE_DATA,
-	KVM_SEV_SEND_UPDATE_VMSA,
-	KVM_SEV_SEND_FINISH,
-	/* Guest migration commands (incoming) */
-	KVM_SEV_RECEIVE_START,
-	KVM_SEV_RECEIVE_UPDATE_DATA,
-	KVM_SEV_RECEIVE_UPDATE_VMSA,
-	KVM_SEV_RECEIVE_FINISH,
-	/* Guest status and debug commands */
-	KVM_SEV_GUEST_STATUS,
-	KVM_SEV_DBG_DECRYPT,
-	KVM_SEV_DBG_ENCRYPT,
-	/* Guest certificates commands */
-	KVM_SEV_CERT_EXPORT,
-	/* Attestation report */
-	KVM_SEV_GET_ATTESTATION_REPORT,
-	/* Guest Migration Extension */
-	KVM_SEV_SEND_CANCEL,
-
-	KVM_SEV_NR_MAX,
-};
-
-struct kvm_sev_cmd {
-	__u32 id;
-	__u64 data;
-	__u32 error;
-	__u32 sev_fd;
-};
-
-struct kvm_sev_launch_start {
-	__u32 handle;
-	__u32 policy;
-	__u64 dh_uaddr;
-	__u32 dh_len;
-	__u64 session_uaddr;
-	__u32 session_len;
-};
-
-struct kvm_sev_launch_update_data {
-	__u64 uaddr;
-	__u32 len;
-};
-
-
-struct kvm_sev_launch_secret {
-	__u64 hdr_uaddr;
-	__u32 hdr_len;
-	__u64 guest_uaddr;
-	__u32 guest_len;
-	__u64 trans_uaddr;
-	__u32 trans_len;
-};
-
-struct kvm_sev_launch_measure {
-	__u64 uaddr;
-	__u32 len;
-};
-
-struct kvm_sev_guest_status {
-	__u32 handle;
-	__u32 policy;
-	__u32 state;
-};
-
-struct kvm_sev_dbg {
-	__u64 src_uaddr;
-	__u64 dst_uaddr;
-	__u32 len;
-};
-
-struct kvm_sev_attestation_report {
-	__u8 mnonce[16];
-	__u64 uaddr;
-	__u32 len;
-};
-
-struct kvm_sev_send_start {
-	__u32 policy;
-	__u64 pdh_cert_uaddr;
-	__u32 pdh_cert_len;
-	__u64 plat_certs_uaddr;
-	__u32 plat_certs_len;
-	__u64 amd_certs_uaddr;
-	__u32 amd_certs_len;
-	__u64 session_uaddr;
-	__u32 session_len;
-};
-
-struct kvm_sev_send_update_data {
-	__u64 hdr_uaddr;
-	__u32 hdr_len;
-	__u64 guest_uaddr;
-	__u32 guest_len;
-	__u64 trans_uaddr;
-	__u32 trans_len;
-};
-
-struct kvm_sev_receive_start {
-	__u32 handle;
-	__u32 policy;
-	__u64 pdh_uaddr;
-	__u32 pdh_len;
-	__u64 session_uaddr;
-	__u32 session_len;
-};
-
-struct kvm_sev_receive_update_data {
-	__u64 hdr_uaddr;
-	__u32 hdr_len;
-	__u64 guest_uaddr;
-	__u32 guest_len;
-	__u64 trans_uaddr;
-	__u32 trans_len;
-};
-
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-
-struct kvm_assigned_pci_dev {
-	__u32 assigned_dev_id;
-	__u32 busnr;
-	__u32 devfn;
-	__u32 flags;
-	__u32 segnr;
-	union {
-		__u32 reserved[11];
-	};
-};
-
-#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
-#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
-#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
-
-#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
-#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
-#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
-
-#define KVM_DEV_IRQ_HOST_MASK	 0x00ff
-#define KVM_DEV_IRQ_GUEST_MASK   0xff00
-
-struct kvm_assigned_irq {
-	__u32 assigned_dev_id;
-	__u32 host_irq; /* ignored (legacy field) */
-	__u32 guest_irq;
-	__u32 flags;
-	union {
-		__u32 reserved[12];
-	};
-};
-
-struct kvm_assigned_msix_nr {
-	__u32 assigned_dev_id;
-	__u16 entry_nr;
-	__u16 padding;
-};
-
-#define KVM_MAX_MSIX_PER_DEV		256
-struct kvm_assigned_msix_entry {
-	__u32 assigned_dev_id;
-	__u32 gsi;
-	__u16 entry; /* The index of entry in the MSI-X table */
-	__u16 padding[3];
-};
-
-#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
-#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
-
-/* Available with KVM_CAP_ARM_USER_IRQ */
-
-/* Bits for run->s.regs.device_irq_level */
-#define KVM_ARM_DEV_EL1_VTIMER		(1 << 0)
-#define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
-#define KVM_ARM_DEV_PMU			(1 << 2)
-
-struct kvm_hyperv_eventfd {
-	__u32 conn_id;
-	__s32 fd;
-	__u32 flags;
-	__u32 padding[3];
-};
-
-#define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
-#define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
-
 #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE    (1 << 0)
 #define KVM_DIRTY_LOG_INITIALLY_SET            (1 << 1)
 
@@ -2180,33 +1528,6 @@ struct kvm_stats_desc {
 /* Available with KVM_CAP_S390_ZPCI_OP */
 #define KVM_S390_ZPCI_OP         _IOW(KVMIO,  0xd1, struct kvm_s390_zpci_op)
 
-struct kvm_s390_zpci_op {
-	/* in */
-	__u32 fh;               /* target device */
-	__u8  op;               /* operation to perform */
-	__u8  pad[3];
-	union {
-		/* for KVM_S390_ZPCIOP_REG_AEN */
-		struct {
-			__u64 ibv;      /* Guest addr of interrupt bit vector */
-			__u64 sb;       /* Guest addr of summary bit */
-			__u32 flags;
-			__u32 noi;      /* Number of interrupts */
-			__u8 isc;       /* Guest interrupt subclass */
-			__u8 sbo;       /* Offset of guest summary bit vector */
-			__u16 pad;
-		} reg_aen;
-		__u64 reserved[8];
-	} u;
-};
-
-/* types for kvm_s390_zpci_op->op */
-#define KVM_S390_ZPCIOP_REG_AEN                0
-#define KVM_S390_ZPCIOP_DEREG_AEN      1
-
-/* flags for kvm_s390_zpci_op->u.reg_aen.flags */
-#define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
-
 /* Available with KVM_CAP_MEMORY_ATTRIBUTES */
 #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
 
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index bb65ee840cda..a8188202413e 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -146,6 +146,27 @@ enum {
 	NETDEV_A_QSTATS_TX_PACKETS,
 	NETDEV_A_QSTATS_TX_BYTES,
 	NETDEV_A_QSTATS_RX_ALLOC_FAIL,
+	NETDEV_A_QSTATS_RX_HW_DROPS,
+	NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS,
+	NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY,
+	NETDEV_A_QSTATS_RX_CSUM_NONE,
+	NETDEV_A_QSTATS_RX_CSUM_BAD,
+	NETDEV_A_QSTATS_RX_HW_GRO_PACKETS,
+	NETDEV_A_QSTATS_RX_HW_GRO_BYTES,
+	NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS,
+	NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES,
+	NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS,
+	NETDEV_A_QSTATS_TX_HW_DROPS,
+	NETDEV_A_QSTATS_TX_HW_DROP_ERRORS,
+	NETDEV_A_QSTATS_TX_CSUM_NONE,
+	NETDEV_A_QSTATS_TX_NEEDS_CSUM,
+	NETDEV_A_QSTATS_TX_HW_GSO_PACKETS,
+	NETDEV_A_QSTATS_TX_HW_GSO_BYTES,
+	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS,
+	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES,
+	NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS,
+	NETDEV_A_QSTATS_TX_STOP,
+	NETDEV_A_QSTATS_TX_WAKE,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h
index d5b9cfbd9cea..628d46a0da92 100644
--- a/tools/include/uapi/sound/asound.h
+++ b/tools/include/uapi/sound/asound.h
@@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image {
  *                                                                           *
  *****************************************************************************/
 
-#define SNDRV_PCM_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 16)
+#define SNDRV_PCM_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 17)
 
 typedef unsigned long snd_pcm_uframes_t;
 typedef signed long snd_pcm_sframes_t;
@@ -416,7 +416,7 @@ struct snd_pcm_hw_params {
 	unsigned int rmask;		/* W: requested masks */
 	unsigned int cmask;		/* R: changed masks */
 	unsigned int info;		/* R: Info flags for returned setup */
-	unsigned int msbits;		/* R: used most significant bits */
+	unsigned int msbits;		/* R: used most significant bits (in sample bit-width) */
 	unsigned int rate_num;		/* R: rate numerator */
 	unsigned int rate_den;		/* R: rate denominator */
 	snd_pcm_uframes_t fifo_size;	/* R: chip FIFO size in frames */
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 4be7144e4803..2cf892774346 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -2,7 +2,7 @@
 # Most of this file is copied from tools/lib/traceevent/Makefile
 
 RM ?= rm
-srctree = $(abs_srctree)
+srctree := $(realpath $(srctree))
 
 VERSION_SCRIPT := libbpf.map
 LIBBPF_VERSION := $(shell \
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 97ec005c3c47..466a29d80124 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -766,6 +766,7 @@ int bpf_link_create(int prog_fd, int target_fd,
 			return libbpf_err(-EINVAL);
 		break;
 	case BPF_TRACE_KPROBE_MULTI:
+	case BPF_TRACE_KPROBE_SESSION:
 		attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0);
 		attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0);
 		attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0));
@@ -785,6 +786,7 @@ int bpf_link_create(int prog_fd, int target_fd,
 		if (!OPTS_ZEROED(opts, uprobe_multi))
 			return libbpf_err(-EINVAL);
 		break;
+	case BPF_TRACE_RAW_TP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_MODIFY_RETURN:
@@ -1173,20 +1175,31 @@ int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info
 	return bpf_obj_get_info_by_fd(link_fd, info, info_len);
 }
 
-int bpf_raw_tracepoint_open(const char *name, int prog_fd)
+int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts)
 {
 	const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint);
 	union bpf_attr attr;
 	int fd;
 
+	if (!OPTS_VALID(opts, bpf_raw_tp_opts))
+		return libbpf_err(-EINVAL);
+
 	memset(&attr, 0, attr_sz);
-	attr.raw_tracepoint.name = ptr_to_u64(name);
 	attr.raw_tracepoint.prog_fd = prog_fd;
+	attr.raw_tracepoint.name = ptr_to_u64(OPTS_GET(opts, tp_name, NULL));
+	attr.raw_tracepoint.cookie = OPTS_GET(opts, cookie, 0);
 
 	fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz);
 	return libbpf_err_errno(fd);
 }
 
+int bpf_raw_tracepoint_open(const char *name, int prog_fd)
+{
+	LIBBPF_OPTS(bpf_raw_tp_opts, opts, .tp_name = name);
+
+	return bpf_raw_tracepoint_open_opts(prog_fd, &opts);
+}
+
 int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts)
 {
 	const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index df0db2f0cdb7..972e17ec0c09 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -617,6 +617,15 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type,
 			      __u32 query_flags, __u32 *attach_flags,
 			      __u32 *prog_ids, __u32 *prog_cnt);
 
+struct bpf_raw_tp_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+	const char *tp_name;
+	__u64 cookie;
+	size_t :0;
+};
+#define bpf_raw_tp_opts__last_field cookie
+
+LIBBPF_API int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts);
 LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
 				 __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 1ce738d91685..c0e13cdf9660 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -2,7 +2,7 @@
 #ifndef __BPF_CORE_READ_H__
 #define __BPF_CORE_READ_H__
 
-#include <bpf/bpf_helpers.h>
+#include "bpf_helpers.h"
 
 /*
  * enum bpf_field_info_kind is passed as a second argument into
@@ -104,6 +104,7 @@ enum bpf_enum_value_kind {
 	case 2: val = *(const unsigned short *)p; break;		      \
 	case 4: val = *(const unsigned int *)p; break;			      \
 	case 8: val = *(const unsigned long long *)p; break;		      \
+	default: val = 0; break;					      \
 	}								      \
 	val <<= __CORE_RELO(s, field, LSHIFT_U64);			      \
 	if (__CORE_RELO(s, field, SIGNED))				      \
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index cd17f6d0791f..305c62817dd3 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -137,7 +137,8 @@
 /*
  * Helper function to perform a tail call with a constant/immediate map slot.
  */
-#if __clang_major__ >= 8 && defined(__bpf__)
+#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12)
+#if defined(__bpf__)
 static __always_inline void
 bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
 {
@@ -165,6 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
 		     : "r0", "r1", "r2", "r3", "r4", "r5");
 }
 #endif
+#endif
 
 enum libbpf_pin_type {
 	LIBBPF_PIN_NONE,
@@ -184,10 +186,21 @@ enum libbpf_tristate {
 #define __kptr __attribute__((btf_type_tag("kptr")))
 #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr")))
 
-#define bpf_ksym_exists(sym) ({									\
-	_Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak");	\
-	!!sym;											\
+#if defined (__clang__)
+#define bpf_ksym_exists(sym) ({						\
+	_Static_assert(!__builtin_constant_p(!!sym),			\
+		       #sym " should be marked as __weak");		\
+	!!sym;								\
+})
+#elif __GNUC__ > 8
+#define bpf_ksym_exists(sym) ({						\
+	_Static_assert(__builtin_has_attribute (*sym, __weak__),	\
+		       #sym " should be marked as __weak");		\
+	!!sym;								\
 })
+#else
+#define bpf_ksym_exists(sym) !!sym
+#endif
 
 #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx")))
 #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull")))
diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 1c13f8e88833..9314fa95f04e 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -633,18 +633,18 @@ struct pt_regs;
 #endif
 
 #define ___bpf_ctx_cast0()            ctx
-#define ___bpf_ctx_cast1(x)           ___bpf_ctx_cast0(), (void *)ctx[0]
-#define ___bpf_ctx_cast2(x, args...)  ___bpf_ctx_cast1(args), (void *)ctx[1]
-#define ___bpf_ctx_cast3(x, args...)  ___bpf_ctx_cast2(args), (void *)ctx[2]
-#define ___bpf_ctx_cast4(x, args...)  ___bpf_ctx_cast3(args), (void *)ctx[3]
-#define ___bpf_ctx_cast5(x, args...)  ___bpf_ctx_cast4(args), (void *)ctx[4]
-#define ___bpf_ctx_cast6(x, args...)  ___bpf_ctx_cast5(args), (void *)ctx[5]
-#define ___bpf_ctx_cast7(x, args...)  ___bpf_ctx_cast6(args), (void *)ctx[6]
-#define ___bpf_ctx_cast8(x, args...)  ___bpf_ctx_cast7(args), (void *)ctx[7]
-#define ___bpf_ctx_cast9(x, args...)  ___bpf_ctx_cast8(args), (void *)ctx[8]
-#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9]
-#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10]
-#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11]
+#define ___bpf_ctx_cast1(x)           ___bpf_ctx_cast0(), ctx[0]
+#define ___bpf_ctx_cast2(x, args...)  ___bpf_ctx_cast1(args), ctx[1]
+#define ___bpf_ctx_cast3(x, args...)  ___bpf_ctx_cast2(args), ctx[2]
+#define ___bpf_ctx_cast4(x, args...)  ___bpf_ctx_cast3(args), ctx[3]
+#define ___bpf_ctx_cast5(x, args...)  ___bpf_ctx_cast4(args), ctx[4]
+#define ___bpf_ctx_cast6(x, args...)  ___bpf_ctx_cast5(args), ctx[5]
+#define ___bpf_ctx_cast7(x, args...)  ___bpf_ctx_cast6(args), ctx[6]
+#define ___bpf_ctx_cast8(x, args...)  ___bpf_ctx_cast7(args), ctx[7]
+#define ___bpf_ctx_cast9(x, args...)  ___bpf_ctx_cast8(args), ctx[8]
+#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9]
+#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10]
+#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11]
 #define ___bpf_ctx_cast(args...)      ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args)
 
 /*
@@ -786,14 +786,14 @@ ____##name(unsigned long long *ctx ___bpf_ctx_decl(args))
 struct pt_regs;
 
 #define ___bpf_kprobe_args0()           ctx
-#define ___bpf_kprobe_args1(x)          ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx)
-#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx)
-#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx)
-#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx)
-#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx)
-#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx)
-#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx)
-#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx)
+#define ___bpf_kprobe_args1(x)          ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx)
+#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx)
+#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx)
+#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx)
+#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx)
+#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx)
+#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx)
+#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx)
 #define ___bpf_kprobe_args(args...)     ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args)
 
 /*
@@ -821,7 +821,7 @@ static __always_inline typeof(name(0))					    \
 ____##name(struct pt_regs *ctx, ##args)
 
 #define ___bpf_kretprobe_args0()       ctx
-#define ___bpf_kretprobe_args1(x)      ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx)
+#define ___bpf_kretprobe_args1(x)      ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx)
 #define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args)
 
 /*
@@ -845,24 +845,24 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)
 
 /* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */
 #define ___bpf_syscall_args0()           ctx
-#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs)
-#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs)
-#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs)
-#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs)
-#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs)
-#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs)
-#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs)
+#define ___bpf_syscall_args1(x)          ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs)
+#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs)
+#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs)
+#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs)
+#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs)
+#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs)
+#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs)
 #define ___bpf_syscall_args(args...)     ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args)
 
 /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */
 #define ___bpf_syswrap_args0()           ctx
-#define ___bpf_syswrap_args1(x)          ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs)
-#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs)
-#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs)
-#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs)
-#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs)
-#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs)
-#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args1(x)          ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs)
+#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs)
 #define ___bpf_syswrap_args(args...)     ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args)
 
 /*
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 4d9f30bf7f01..5dbca76b953f 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -1929,6 +1929,7 @@ static int btf_dump_int_data(struct btf_dump *d,
 			if (d->typed_dump->is_array_terminated)
 				break;
 			if (*(char *)data == '\0') {
+				btf_dump_type_values(d, "'\\0'");
 				d->typed_dump->is_array_terminated = true;
 				break;
 			}
@@ -2031,6 +2032,7 @@ static int btf_dump_array_data(struct btf_dump *d,
 	__u32 i, elem_type_id;
 	__s64 elem_size;
 	bool is_array_member;
+	bool is_array_terminated;
 
 	elem_type_id = array->type;
 	elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL);
@@ -2066,12 +2068,15 @@ static int btf_dump_array_data(struct btf_dump *d,
 	 */
 	is_array_member = d->typed_dump->is_array_member;
 	d->typed_dump->is_array_member = true;
+	is_array_terminated = d->typed_dump->is_array_terminated;
+	d->typed_dump->is_array_terminated = false;
 	for (i = 0; i < array->nelems; i++, data += elem_size) {
 		if (d->typed_dump->is_array_terminated)
 			break;
 		btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0);
 	}
 	d->typed_dump->is_array_member = is_array_member;
+	d->typed_dump->is_array_terminated = is_array_terminated;
 	d->typed_dump->depth--;
 	btf_dump_data_pfx(d);
 	btf_dump_type_values(d, "]");
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index efab29b8935b..5401f2df463d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -132,6 +132,7 @@ static const char * const attach_type_name[] = {
 	[BPF_TRACE_UPROBE_MULTI]	= "trace_uprobe_multi",
 	[BPF_NETKIT_PRIMARY]		= "netkit_primary",
 	[BPF_NETKIT_PEER]		= "netkit_peer",
+	[BPF_TRACE_KPROBE_SESSION]	= "trace_kprobe_session",
 };
 
 static const char * const link_type_name[] = {
@@ -149,6 +150,7 @@ static const char * const link_type_name[] = {
 	[BPF_LINK_TYPE_TCX]			= "tcx",
 	[BPF_LINK_TYPE_UPROBE_MULTI]		= "uprobe_multi",
 	[BPF_LINK_TYPE_NETKIT]			= "netkit",
+	[BPF_LINK_TYPE_SOCKMAP]			= "sockmap",
 };
 
 static const char * const map_type_name[] = {
@@ -498,7 +500,7 @@ struct bpf_struct_ops {
 #define KSYMS_SEC ".ksyms"
 #define STRUCT_OPS_SEC ".struct_ops"
 #define STRUCT_OPS_LINK_SEC ".struct_ops.link"
-#define ARENA_SEC ".arena.1"
+#define ARENA_SEC ".addr_space.1"
 
 enum libbpf_map_type {
 	LIBBPF_MAP_UNSPEC,
@@ -1126,17 +1128,46 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 		const struct btf_type *mtype, *kern_mtype;
 		__u32 mtype_id, kern_mtype_id;
 		void *mdata, *kern_mdata;
+		struct bpf_program *prog;
 		__s64 msize, kern_msize;
 		__u32 moff, kern_moff;
 		__u32 kern_member_idx;
 		const char *mname;
 
 		mname = btf__name_by_offset(btf, member->name_off);
+		moff = member->offset / 8;
+		mdata = data + moff;
+		msize = btf__resolve_size(btf, member->type);
+		if (msize < 0) {
+			pr_warn("struct_ops init_kern %s: failed to resolve the size of member %s\n",
+				map->name, mname);
+			return msize;
+		}
+
 		kern_member = find_member_by_name(kern_btf, kern_type, mname);
 		if (!kern_member) {
-			pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n",
+			if (!libbpf_is_mem_zeroed(mdata, msize)) {
+				pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n",
+					map->name, mname);
+				return -ENOTSUP;
+			}
+
+			if (st_ops->progs[i]) {
+				/* If we had declaratively set struct_ops callback, we need to
+				 * force its autoload to false, because it doesn't have
+				 * a chance of succeeding from POV of the current struct_ops map.
+				 * If this program is still referenced somewhere else, though,
+				 * then bpf_object_adjust_struct_ops_autoload() will update its
+				 * autoload accordingly.
+				 */
+				st_ops->progs[i]->autoload = false;
+				st_ops->progs[i] = NULL;
+			}
+
+			/* Skip all-zero/NULL fields if they are not present in the kernel BTF */
+			pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n",
 				map->name, mname);
-			return -ENOTSUP;
+			continue;
 		}
 
 		kern_member_idx = kern_member - btf_members(kern_type);
@@ -1147,10 +1178,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 			return -ENOTSUP;
 		}
 
-		moff = member->offset / 8;
 		kern_moff = kern_member->offset / 8;
-
-		mdata = data + moff;
 		kern_mdata = kern_data + kern_moff;
 
 		mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id);
@@ -1165,13 +1193,19 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 		}
 
 		if (btf_is_ptr(mtype)) {
-			struct bpf_program *prog;
+			prog = *(void **)mdata;
+			/* just like for !kern_member case above, reset declaratively
+			 * set (at compile time) program's autload to false,
+			 * if user replaced it with another program or NULL
+			 */
+			if (st_ops->progs[i] && st_ops->progs[i] != prog)
+				st_ops->progs[i]->autoload = false;
 
 			/* Update the value from the shadow type */
-			prog = *(void **)mdata;
 			st_ops->progs[i] = prog;
 			if (!prog)
 				continue;
+
 			if (!is_valid_st_ops_program(obj, prog)) {
 				pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n",
 					map->name, mname);
@@ -1230,9 +1264,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 			continue;
 		}
 
-		msize = btf__resolve_size(btf, mtype_id);
 		kern_msize = btf__resolve_size(kern_btf, kern_mtype_id);
-		if (msize < 0 || kern_msize < 0 || msize != kern_msize) {
+		if (kern_msize < 0 || msize != kern_msize) {
 			pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n",
 				map->name, mname, (ssize_t)msize,
 				(ssize_t)kern_msize);
@@ -1650,6 +1683,10 @@ static int sys_memfd_create(const char *name, unsigned flags)
 	return syscall(__NR_memfd_create, name, flags);
 }
 
+#ifndef MFD_CLOEXEC
+#define MFD_CLOEXEC 0x0001U
+#endif
+
 static int create_placeholder_fd(void)
 {
 	int fd;
@@ -1952,6 +1989,20 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj,
 	return NULL;
 }
 
+static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj,
+							const void *name, int len)
+{
+	const char *ext_name;
+	int i;
+
+	for (i = 0; i < obj->nr_extern; i++) {
+		ext_name = obj->externs[i].name;
+		if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0)
+			return &obj->externs[i];
+	}
+	return NULL;
+}
+
 static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val,
 			      char value)
 {
@@ -5352,8 +5403,8 @@ retry:
 					goto err_out;
 			}
 			if (map->def.type == BPF_MAP_TYPE_ARENA) {
-				map->mmaped = mmap((void *)map->map_extra, bpf_map_mmap_sz(map),
-						   PROT_READ | PROT_WRITE,
+				map->mmaped = mmap((void *)(long)map->map_extra,
+						   bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE,
 						   map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
 						   map->fd, 0);
 				if (map->mmaped == MAP_FAILED) {
@@ -7317,11 +7368,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog
 	char *cp, errmsg[STRERR_BUFSIZE];
 	size_t log_buf_size = 0;
 	char *log_buf = NULL, *tmp;
-	int btf_fd, ret, err;
 	bool own_log_buf = true;
 	__u32 log_level = prog->log_level;
+	int ret, err;
 
-	if (prog->type == BPF_PROG_TYPE_UNSPEC) {
+	/* Be more helpful by rejecting programs that can't be validated early
+	 * with more meaningful and actionable error message.
+	 */
+	switch (prog->type) {
+	case BPF_PROG_TYPE_UNSPEC:
 		/*
 		 * The program type must be set.  Most likely we couldn't find a proper
 		 * section definition at load time, and thus we didn't infer the type.
@@ -7329,6 +7384,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog
 		pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n",
 			prog->name, prog->sec_name);
 		return -EINVAL;
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		if (prog->attach_btf_id == 0) {
+			pr_warn("prog '%s': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?\n",
+				prog->name);
+			return -EINVAL;
+		}
+		break;
+	default:
+		break;
 	}
 
 	if (!insns || !insns_cnt)
@@ -7343,9 +7407,8 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog
 	load_attr.prog_ifindex = prog->prog_ifindex;
 
 	/* specify func_info/line_info only if kernel supports them */
-	btf_fd = btf__fd(obj->btf);
-	if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) {
-		load_attr.prog_btf_fd = btf_fd;
+	if (obj->btf && btf__fd(obj->btf) >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) {
+		load_attr.prog_btf_fd = btf__fd(obj->btf);
 		load_attr.func_info = prog->func_info;
 		load_attr.func_info_rec_size = prog->func_info_rec_size;
 		load_attr.func_info_cnt = prog->func_info_cnt;
@@ -7969,7 +8032,10 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj)
 	return 0;
 }
 
-int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx)
+typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type,
+			     const char *sym_name, void *ctx);
+
+static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx)
 {
 	char sym_type, sym_name[500];
 	unsigned long long sym_addr;
@@ -8009,8 +8075,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type,
 	struct bpf_object *obj = ctx;
 	const struct btf_type *t;
 	struct extern_desc *ext;
+	char *res;
 
-	ext = find_extern_by_name(obj, sym_name);
+	res = strstr(sym_name, ".llvm.");
+	if (sym_type == 'd' && res)
+		ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name);
+	else
+		ext = find_extern_by_name(obj, sym_name);
 	if (!ext || ext->type != EXT_KSYM)
 		return 0;
 
@@ -8559,6 +8630,11 @@ int bpf_map__pin(struct bpf_map *map, const char *path)
 		return libbpf_err(-EINVAL);
 	}
 
+	if (map->fd < 0) {
+		pr_warn("map '%s': can't pin BPF map without FD (was it created?)\n", map->name);
+		return libbpf_err(-EINVAL);
+	}
+
 	if (map->pin_path) {
 		if (path && strcmp(path, map->pin_path)) {
 			pr_warn("map '%s' already has pin path '%s' different from '%s'\n",
@@ -9227,6 +9303,7 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin
 static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_kprobe_session(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
@@ -9243,6 +9320,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("uretprobe.s+",		KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
 	SEC_DEF("kprobe.multi+",	KPROBE,	BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
 	SEC_DEF("kretprobe.multi+",	KPROBE,	BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
+	SEC_DEF("kprobe.session+",	KPROBE,	BPF_TRACE_KPROBE_SESSION, SEC_NONE, attach_kprobe_session),
 	SEC_DEF("uprobe.multi+",	KPROBE,	BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi),
 	SEC_DEF("uretprobe.multi+",	KPROBE,	BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi),
 	SEC_DEF("uprobe.multi.s+",	KPROBE,	BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi),
@@ -9294,6 +9372,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("sockops",		SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT),
 	SEC_DEF("sk_skb/stream_parser",	SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT),
 	SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT),
+	SEC_DEF("sk_skb/verdict",	SK_SKB, BPF_SK_SKB_VERDICT, SEC_ATTACHABLE_OPT),
 	SEC_DEF("sk_skb",		SK_SKB, 0, SEC_NONE),
 	SEC_DEF("sk_msg",		SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT),
 	SEC_DEF("lirc_mode2",		LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT),
@@ -9812,16 +9891,28 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
 			      enum bpf_attach_type attach_type,
 			      int *btf_obj_fd, int *btf_type_id)
 {
-	int ret, i;
+	int ret, i, mod_len;
+	const char *fn_name, *mod_name = NULL;
 
-	ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type);
-	if (ret > 0) {
-		*btf_obj_fd = 0; /* vmlinux BTF */
-		*btf_type_id = ret;
-		return 0;
+	fn_name = strchr(attach_name, ':');
+	if (fn_name) {
+		mod_name = attach_name;
+		mod_len = fn_name - mod_name;
+		fn_name++;
+	}
+
+	if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) {
+		ret = find_attach_btf_id(obj->btf_vmlinux,
+					 mod_name ? fn_name : attach_name,
+					 attach_type);
+		if (ret > 0) {
+			*btf_obj_fd = 0; /* vmlinux BTF */
+			*btf_type_id = ret;
+			return 0;
+		}
+		if (ret != -ENOENT)
+			return ret;
 	}
-	if (ret != -ENOENT)
-		return ret;
 
 	ret = load_module_btfs(obj);
 	if (ret)
@@ -9830,7 +9921,12 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
 	for (i = 0; i < obj->btf_module_cnt; i++) {
 		const struct module_btf *mod = &obj->btf_modules[i];
 
-		ret = find_attach_btf_id(mod->btf, attach_name, attach_type);
+		if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0)
+			continue;
+
+		ret = find_attach_btf_id(mod->btf,
+					 mod_name ? fn_name : attach_name,
+					 attach_type);
 		if (ret > 0) {
 			*btf_obj_fd = mod->fd;
 			*btf_type_id = ret;
@@ -10303,6 +10399,11 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz,
 		return -EINVAL;
 	}
 
+	if (map->fd < 0) {
+		pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name);
+		return -EINVAL;
+	}
+
 	if (!check_value_sz)
 		return 0;
 
@@ -10415,8 +10516,15 @@ long libbpf_get_error(const void *ptr)
 int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog)
 {
 	int ret;
+	int prog_fd = bpf_program__fd(prog);
 
-	ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL);
+	if (prog_fd < 0) {
+		pr_warn("prog '%s': can't use BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err(-EINVAL);
+	}
+
+	ret = bpf_link_update(bpf_link__fd(link), prog_fd, NULL);
 	return libbpf_err_errno(ret);
 }
 
@@ -10610,7 +10718,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p
 	}
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
-		pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n",
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
 			prog->name);
 		return libbpf_err_ptr(-EINVAL);
 	}
@@ -11322,18 +11430,26 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
 	struct kprobe_multi_resolve res = {
 		.pattern = pattern,
 	};
+	enum bpf_attach_type attach_type;
 	struct bpf_link *link = NULL;
 	char errmsg[STRERR_BUFSIZE];
 	const unsigned long *addrs;
 	int err, link_fd, prog_fd;
+	bool retprobe, session;
 	const __u64 *cookies;
 	const char **syms;
-	bool retprobe;
 	size_t cnt;
 
 	if (!OPTS_VALID(opts, bpf_kprobe_multi_opts))
 		return libbpf_err_ptr(-EINVAL);
 
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
 	syms    = OPTS_GET(opts, syms, false);
 	addrs   = OPTS_GET(opts, addrs, false);
 	cnt     = OPTS_GET(opts, cnt, false);
@@ -11360,6 +11476,12 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
 	}
 
 	retprobe = OPTS_GET(opts, retprobe, false);
+	session  = OPTS_GET(opts, session, false);
+
+	if (retprobe && session)
+		return libbpf_err_ptr(-EINVAL);
+
+	attach_type = session ? BPF_TRACE_KPROBE_SESSION : BPF_TRACE_KPROBE_MULTI;
 
 	lopts.kprobe_multi.syms = syms;
 	lopts.kprobe_multi.addrs = addrs;
@@ -11374,8 +11496,7 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
 	}
 	link->detach = &bpf_link__detach_fd;
 
-	prog_fd = bpf_program__fd(prog);
-	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts);
+	link_fd = bpf_link_create(prog_fd, 0, attach_type, &lopts);
 	if (link_fd < 0) {
 		err = -errno;
 		pr_warn("prog '%s': failed to attach: %s\n",
@@ -11472,7 +11593,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru
 
 	n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern);
 	if (n < 1) {
-		pr_warn("kprobe multi pattern is invalid: %s\n", pattern);
+		pr_warn("kprobe multi pattern is invalid: %s\n", spec);
 		return -EINVAL;
 	}
 
@@ -11481,6 +11602,32 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru
 	return libbpf_get_error(*link);
 }
 
+static int attach_kprobe_session(const struct bpf_program *prog, long cookie,
+				 struct bpf_link **link)
+{
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, .session = true);
+	const char *spec;
+	char *pattern;
+	int n;
+
+	*link = NULL;
+
+	/* no auto-attach for SEC("kprobe.session") */
+	if (strcmp(prog->sec_name, "kprobe.session") == 0)
+		return 0;
+
+	spec = prog->sec_name + sizeof("kprobe.session/") - 1;
+	n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern);
+	if (n < 1) {
+		pr_warn("kprobe session pattern is invalid: %s\n", spec);
+		return -EINVAL;
+	}
+
+	*link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts);
+	free(pattern);
+	return *link ? 0 : -errno;
+}
+
 static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link)
 {
 	char *probe_type = NULL, *binary_path = NULL, *func_name = NULL;
@@ -11757,6 +11904,13 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog,
 	if (!OPTS_VALID(opts, bpf_uprobe_multi_opts))
 		return libbpf_err_ptr(-EINVAL);
 
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
 	syms = OPTS_GET(opts, syms, NULL);
 	offsets = OPTS_GET(opts, offsets, NULL);
 	ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL);
@@ -11832,7 +11986,6 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog,
 	}
 	link->detach = &bpf_link__detach_fd;
 
-	prog_fd = bpf_program__fd(prog);
 	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts);
 	if (link_fd < 0) {
 		err = -errno;
@@ -12076,7 +12229,7 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog,
 		return libbpf_err_ptr(-EINVAL);
 
 	if (bpf_program__fd(prog) < 0) {
-		pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n",
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
 			prog->name);
 		return libbpf_err_ptr(-EINVAL);
 	}
@@ -12267,13 +12420,19 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin
 	return libbpf_get_error(*link);
 }
 
-struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog,
-						    const char *tp_name)
+struct bpf_link *
+bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog,
+					const char *tp_name,
+					struct bpf_raw_tracepoint_opts *opts)
 {
+	LIBBPF_OPTS(bpf_raw_tp_opts, raw_opts);
 	char errmsg[STRERR_BUFSIZE];
 	struct bpf_link *link;
 	int prog_fd, pfd;
 
+	if (!OPTS_VALID(opts, bpf_raw_tracepoint_opts))
+		return libbpf_err_ptr(-EINVAL);
+
 	prog_fd = bpf_program__fd(prog);
 	if (prog_fd < 0) {
 		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
@@ -12285,7 +12444,9 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr
 		return libbpf_err_ptr(-ENOMEM);
 	link->detach = &bpf_link__detach_fd;
 
-	pfd = bpf_raw_tracepoint_open(tp_name, prog_fd);
+	raw_opts.tp_name = tp_name;
+	raw_opts.cookie = OPTS_GET(opts, cookie, 0);
+	pfd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_opts);
 	if (pfd < 0) {
 		pfd = -errno;
 		free(link);
@@ -12297,6 +12458,12 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr
 	return link;
 }
 
+struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog,
+						    const char *tp_name)
+{
+	return bpf_program__attach_raw_tracepoint_opts(prog, tp_name, NULL);
+}
+
 static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
 {
 	static const char *const prefixes[] = {
@@ -12450,6 +12617,12 @@ bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd)
 	return bpf_program_attach_fd(prog, netns_fd, "netns", NULL);
 }
 
+struct bpf_link *
+bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd)
+{
+	return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL);
+}
+
 struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex)
 {
 	/* target_fd/target_ifindex use the same field in LINK_CREATE */
@@ -12658,6 +12831,12 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog)
 	if (!prog->sec_def || !prog->sec_def->prog_attach_fn)
 		return libbpf_err_ptr(-EOPNOTSUPP);
 
+	if (bpf_program__fd(prog) < 0) {
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
 	err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link);
 	if (err)
 		return libbpf_err_ptr(err);
@@ -12698,8 +12877,13 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
 	__u32 zero = 0;
 	int err, fd;
 
-	if (!bpf_map__is_struct_ops(map) || map->fd == -1)
+	if (!bpf_map__is_struct_ops(map))
+		return libbpf_err_ptr(-EINVAL);
+
+	if (map->fd < 0) {
+		pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name);
 		return libbpf_err_ptr(-EINVAL);
+	}
 
 	link = calloc(1, sizeof(*link));
 	if (!link)
@@ -12747,9 +12931,14 @@ int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map)
 	__u32 zero = 0;
 	int err;
 
-	if (!bpf_map__is_struct_ops(map) || !map_is_created(map))
+	if (!bpf_map__is_struct_ops(map))
 		return -EINVAL;
 
+	if (map->fd < 0) {
+		pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name);
+		return -EINVAL;
+	}
+
 	st_ops_link = container_of(link, struct bpf_link_struct_ops, link);
 	/* Ensure the type of a link is correct */
 	if (st_ops_link->map_fd < 0)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 7b510761f545..c3f77d9260fe 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -539,10 +539,12 @@ struct bpf_kprobe_multi_opts {
 	size_t cnt;
 	/* create return kprobes */
 	bool retprobe;
+	/* create session kprobes */
+	bool session;
 	size_t :0;
 };
 
-#define bpf_kprobe_multi_opts__last_field retprobe
+#define bpf_kprobe_multi_opts__last_field session
 
 LIBBPF_API struct bpf_link *
 bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
@@ -760,9 +762,20 @@ bpf_program__attach_tracepoint_opts(const struct bpf_program *prog,
 				    const char *tp_name,
 				    const struct bpf_tracepoint_opts *opts);
 
+struct bpf_raw_tracepoint_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+	__u64 cookie;
+	size_t :0;
+};
+#define bpf_raw_tracepoint_opts__last_field cookie
+
 LIBBPF_API struct bpf_link *
 bpf_program__attach_raw_tracepoint(const struct bpf_program *prog,
 				   const char *tp_name);
+LIBBPF_API struct bpf_link *
+bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog,
+					const char *tp_name,
+					struct bpf_raw_tracepoint_opts *opts);
 
 struct bpf_trace_opts {
 	/* size of this struct, for forward/backward compatibility */
@@ -784,6 +797,8 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd);
 LIBBPF_API struct bpf_link *
 bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd);
 LIBBPF_API struct bpf_link *
+bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd);
+LIBBPF_API struct bpf_link *
 bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex);
 LIBBPF_API struct bpf_link *
 bpf_program__attach_freplace(const struct bpf_program *prog,
@@ -1282,6 +1297,7 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 				ring_buffer_sample_fn sample_cb, void *ctx);
 LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms);
 LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb);
+LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n);
 LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb);
 
 /**
@@ -1356,6 +1372,17 @@ LIBBPF_API int ring__map_fd(const struct ring *r);
  */
 LIBBPF_API int ring__consume(struct ring *r);
 
+/**
+ * @brief **ring__consume_n()** consumes up to a requested amount of items from
+ * a ringbuffer without event polling.
+ *
+ * @param r A ringbuffer object.
+ * @param n Maximum amount of items to consume.
+ * @return The number of items consumed, or a negative number if any of the
+ * callbacks return an error.
+ */
+LIBBPF_API int ring__consume_n(struct ring *r, size_t n);
+
 struct user_ring_buffer_opts {
 	size_t sz; /* size of this struct, for forward/backward compatibility */
 };
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 86804fd90dd1..c1ce8aa3520b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -410,7 +410,16 @@ LIBBPF_1.3.0 {
 
 LIBBPF_1.4.0 {
 	global:
+		bpf_program__attach_raw_tracepoint_opts;
+		bpf_raw_tracepoint_open_opts;
 		bpf_token_create;
 		btf__new_split;
 		btf_ext__raw_data;
 } LIBBPF_1.3.0;
+
+LIBBPF_1.5.0 {
+	global:
+		bpf_program__attach_sockmap;
+		ring__consume_n;
+		ring_buffer__consume_n;
+} LIBBPF_1.4.0;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 864b36177424..a0dcfb82e455 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -518,11 +518,6 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void
 __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name,
 				 __u32 kind);
 
-typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type,
-			     const char *sym_name, void *ctx);
-
-int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg);
-
 /* handle direct returned errors */
 static inline int libbpf_err(int ret)
 {
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 302188122439..9dfbe7750f56 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -448,7 +448,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe
 	/* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id)
 	 * at all, it will emit something like "invalid func unknown#181".
 	 * If BPF verifier recognizes BPF helper but it's not supported for
-	 * given BPF program type, it will emit "unknown func bpf_sys_bpf#166".
+	 * given BPF program type, it will emit "unknown func bpf_sys_bpf#166"
+	 * or "program of this type cannot use helper bpf_sys_bpf#166".
 	 * In both cases, provided combination of BPF program type and BPF
 	 * helper is not supported by the kernel.
 	 * In all other cases, probe_prog_load() above will either succeed (e.g.,
@@ -457,7 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe
 	 * that), or we'll get some more specific BPF verifier error about
 	 * some unsatisfied conditions.
 	 */
-	if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ")))
+	if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") ||
+			 strstr(buf, "program of this type cannot use helper ")))
 		return 0;
 	return 1; /* assume supported */
 }
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index e783a47da815..d6e5eff967cb 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
 #define __LIBBPF_VERSION_H
 
 #define LIBBPF_MAJOR_VERSION 1
-#define LIBBPF_MINOR_VERSION 4
+#define LIBBPF_MINOR_VERSION 5
 
 #endif /* __LIBBPF_VERSION_H */
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index aacb64278a01..bfd8dac4c0cc 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -231,7 +231,7 @@ static inline int roundup_len(__u32 len)
 	return (len + 7) / 8 * 8;
 }
 
-static int64_t ringbuf_process_ring(struct ring *r)
+static int64_t ringbuf_process_ring(struct ring *r, size_t n)
 {
 	int *len_ptr, len, err;
 	/* 64-bit to avoid overflow in case of extreme application behavior */
@@ -268,12 +268,42 @@ static int64_t ringbuf_process_ring(struct ring *r)
 			}
 
 			smp_store_release(r->consumer_pos, cons_pos);
+
+			if (cnt >= n)
+				goto done;
 		}
 	} while (got_new_data);
 done:
 	return cnt;
 }
 
+/* Consume available ring buffer(s) data without event polling, up to n
+ * records.
+ *
+ * Returns number of records consumed across all registered ring buffers (or
+ * n, whichever is less), or negative number if any of the callbacks return
+ * error.
+ */
+int ring_buffer__consume_n(struct ring_buffer *rb, size_t n)
+{
+	int64_t err, res = 0;
+	int i;
+
+	for (i = 0; i < rb->ring_cnt; i++) {
+		struct ring *ring = rb->rings[i];
+
+		err = ringbuf_process_ring(ring, n);
+		if (err < 0)
+			return libbpf_err(err);
+		res += err;
+		n -= err;
+
+		if (n == 0)
+			break;
+	}
+	return res > INT_MAX ? INT_MAX : res;
+}
+
 /* Consume available ring buffer(s) data without event polling.
  * Returns number of records consumed across all registered ring buffers (or
  * INT_MAX, whichever is less), or negative number if any of the callbacks
@@ -287,13 +317,15 @@ int ring_buffer__consume(struct ring_buffer *rb)
 	for (i = 0; i < rb->ring_cnt; i++) {
 		struct ring *ring = rb->rings[i];
 
-		err = ringbuf_process_ring(ring);
+		err = ringbuf_process_ring(ring, INT_MAX);
 		if (err < 0)
 			return libbpf_err(err);
 		res += err;
+		if (res > INT_MAX) {
+			res = INT_MAX;
+			break;
+		}
 	}
-	if (res > INT_MAX)
-		return INT_MAX;
 	return res;
 }
 
@@ -314,13 +346,13 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
 		__u32 ring_id = rb->events[i].data.fd;
 		struct ring *ring = rb->rings[ring_id];
 
-		err = ringbuf_process_ring(ring);
+		err = ringbuf_process_ring(ring, INT_MAX);
 		if (err < 0)
 			return libbpf_err(err);
 		res += err;
 	}
 	if (res > INT_MAX)
-		return INT_MAX;
+		res = INT_MAX;
 	return res;
 }
 
@@ -371,17 +403,22 @@ int ring__map_fd(const struct ring *r)
 	return r->map_fd;
 }
 
-int ring__consume(struct ring *r)
+int ring__consume_n(struct ring *r, size_t n)
 {
 	int64_t res;
 
-	res = ringbuf_process_ring(r);
+	res = ringbuf_process_ring(r, n);
 	if (res < 0)
 		return libbpf_err(res);
 
 	return res > INT_MAX ? INT_MAX : res;
 }
 
+int ring__consume(struct ring *r)
+{
+	return ring__consume_n(r, INT_MAX);
+}
+
 static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb)
 {
 	if (rb->consumer_pos) {
diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c
index 146da01979c7..5e6a1e27ddf9 100644
--- a/tools/lib/bpf/str_error.c
+++ b/tools/lib/bpf/str_error.c
@@ -2,6 +2,7 @@
 #undef _GNU_SOURCE
 #include <string.h>
 #include <stdio.h>
+#include <errno.h>
 #include "str_error.h"
 
 /* make sure libbpf doesn't use kernel-only integer typedefs */
@@ -15,7 +16,18 @@
 char *libbpf_strerror_r(int err, char *dst, int len)
 {
 	int ret = strerror_r(err < 0 ? -err : err, dst, len);
-	if (ret)
-		snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret);
+	/* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't
+	 * handle the error, on glibc >=2.13 *positive* (errno-like) error
+	 * code is returned directly
+	 */
+	if (ret == -1)
+		ret = errno;
+	if (ret) {
+		if (ret == EINVAL)
+			/* strerror_r() doesn't recognize this specific error */
+			snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err);
+		else
+			snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret);
+	}
 	return dst;
 }
diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
index f6763300b26a..76359bcdc94a 100644
--- a/tools/lib/bpf/usdt.bpf.h
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -214,18 +214,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx)
 
 /* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */
 #define ___bpf_usdt_args0() ctx
-#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; })
-#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; })
-#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; })
-#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; })
-#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; })
-#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; })
-#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; })
-#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; })
-#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; })
-#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; })
-#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; })
-#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; })
+#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; })
+#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; })
+#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; })
+#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; })
+#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; })
+#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; })
+#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; })
+#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; })
+#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; })
+#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; })
+#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; })
+#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; })
 #define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args)
 
 /*
diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index 058e3ff10f9b..c6d67fc9e57e 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -248,10 +248,10 @@ u64 perf_evlist__read_format(struct perf_evlist *evlist)
 
 static void perf_evlist__id_hash(struct perf_evlist *evlist,
 				 struct perf_evsel *evsel,
-				 int cpu, int thread, u64 id)
+				 int cpu_map_idx, int thread, u64 id)
 {
 	int hash;
-	struct perf_sample_id *sid = SID(evsel, cpu, thread);
+	struct perf_sample_id *sid = SID(evsel, cpu_map_idx, thread);
 
 	sid->id = id;
 	sid->evsel = evsel;
@@ -269,21 +269,27 @@ void perf_evlist__reset_id_hash(struct perf_evlist *evlist)
 
 void perf_evlist__id_add(struct perf_evlist *evlist,
 			 struct perf_evsel *evsel,
-			 int cpu, int thread, u64 id)
+			 int cpu_map_idx, int thread, u64 id)
 {
-	perf_evlist__id_hash(evlist, evsel, cpu, thread, id);
+	if (!SID(evsel, cpu_map_idx, thread))
+		return;
+
+	perf_evlist__id_hash(evlist, evsel, cpu_map_idx, thread, id);
 	evsel->id[evsel->ids++] = id;
 }
 
 int perf_evlist__id_add_fd(struct perf_evlist *evlist,
 			   struct perf_evsel *evsel,
-			   int cpu, int thread, int fd)
+			   int cpu_map_idx, int thread, int fd)
 {
 	u64 read_data[4] = { 0, };
 	int id_idx = 1; /* The first entry is the counter value */
 	u64 id;
 	int ret;
 
+	if (!SID(evsel, cpu_map_idx, thread))
+		return -1;
+
 	ret = ioctl(fd, PERF_EVENT_IOC_ID, &id);
 	if (!ret)
 		goto add;
@@ -312,7 +318,7 @@ int perf_evlist__id_add_fd(struct perf_evlist *evlist,
 	id = read_data[id_idx];
 
 add:
-	perf_evlist__id_add(evlist, evsel, cpu, thread, id);
+	perf_evlist__id_add(evlist, evsel, cpu_map_idx, thread, id);
 	return 0;
 }
 
diff --git a/tools/lib/perf/include/internal/evlist.h b/tools/lib/perf/include/internal/evlist.h
index d86ffe8ed483..f43bdb9b6227 100644
--- a/tools/lib/perf/include/internal/evlist.h
+++ b/tools/lib/perf/include/internal/evlist.h
@@ -126,11 +126,11 @@ u64 perf_evlist__read_format(struct perf_evlist *evlist);
 
 void perf_evlist__id_add(struct perf_evlist *evlist,
 			 struct perf_evsel *evsel,
-			 int cpu, int thread, u64 id);
+			 int cpu_map_idx, int thread, u64 id);
 
 int perf_evlist__id_add_fd(struct perf_evlist *evlist,
 			   struct perf_evsel *evsel,
-			   int cpu, int thread, int fd);
+			   int cpu_map_idx, int thread, int fd);
 
 void perf_evlist__reset_id_hash(struct perf_evlist *evlist);
 
diff --git a/tools/lib/subcmd/run-command.c b/tools/lib/subcmd/run-command.c
index 5cdac2162532..d435eb42354b 100644
--- a/tools/lib/subcmd/run-command.c
+++ b/tools/lib/subcmd/run-command.c
@@ -122,6 +122,8 @@ int start_command(struct child_process *cmd)
 		}
 		if (cmd->preexec_cb)
 			cmd->preexec_cb();
+		if (cmd->no_exec_cmd)
+			exit(cmd->no_exec_cmd(cmd));
 		if (cmd->exec_cmd) {
 			execv_cmd(cmd->argv);
 		} else {
diff --git a/tools/lib/subcmd/run-command.h b/tools/lib/subcmd/run-command.h
index 17d969c6add3..d794138a797f 100644
--- a/tools/lib/subcmd/run-command.h
+++ b/tools/lib/subcmd/run-command.h
@@ -47,6 +47,8 @@ struct child_process {
 	unsigned exec_cmd:1; /* if this is to be external sub-command */
 	unsigned stdout_to_stderr:1;
 	void (*preexec_cb)(void);
+	 /* If set, call function in child rather than doing an exec. */
+	int (*no_exec_cmd)(struct child_process *process);
 };
 
 int start_command(struct child_process *);
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 1c5606cc3334..7bb03606b9ea 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -3,7 +3,8 @@
 #
 include ../scripts/Makefile.include
 
-TARGETS=page-types slabinfo page_owner_sort
+BUILD_TARGETS=page-types slabinfo page_owner_sort
+INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
 
 LIB_DIR = ../lib/api
 LIBS = $(LIB_DIR)/libapi.a
@@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a
 CFLAGS += -Wall -Wextra -I../lib/ -pthread
 LDFLAGS += $(LIBS) -pthread
 
-all: $(TARGETS)
+all: $(BUILD_TARGETS)
 
-$(TARGETS): $(LIBS)
+$(BUILD_TARGETS): $(LIBS)
 
 $(LIBS):
 	make -C $(LIB_DIR)
@@ -29,4 +30,4 @@ sbindir ?= /usr/sbin
 
 install: all
 	install -d $(DESTDIR)$(sbindir)
-	install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
+	install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir)
diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps
new file mode 100644
index 000000000000..803e0318f2fe
--- /dev/null
+++ b/tools/mm/thpmaps
@@ -0,0 +1,675 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 ARM Ltd.
+#
+# Utility providing smaps-like output detailing transparent hugepage usage.
+# For more info, run:
+# ./thpmaps --help
+#
+# Requires numpy:
+# pip3 install numpy
+
+
+import argparse
+import collections
+import math
+import os
+import re
+import resource
+import shutil
+import sys
+import textwrap
+import time
+import numpy as np
+
+
+with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
+    PAGE_SIZE = resource.getpagesize()
+    PAGE_SHIFT = int(math.log2(PAGE_SIZE))
+    PMD_SIZE = int(f.read())
+    PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
+
+
+def align_forward(v, a):
+    return (v + (a - 1)) & ~(a - 1)
+
+
+def align_offset(v, a):
+    return v & (a - 1)
+
+
+def kbnr(kb):
+    # Convert KB to number of pages.
+    return (kb << 10) >> PAGE_SHIFT
+
+
+def nrkb(nr):
+    # Convert number of pages to KB.
+    return (nr << PAGE_SHIFT) >> 10
+
+
+def odkb(order):
+    # Convert page order to KB.
+    return (PAGE_SIZE << order) >> 10
+
+
+def cont_ranges_all(search, index):
+    # Given a list of arrays, find the ranges for which values are monotonically
+    # incrementing in all arrays. all arrays in search and index must be the
+    # same size.
+    sz = len(search[0])
+    r = np.full(sz, 2)
+    d = np.diff(search[0]) == 1
+    for dd in [np.diff(arr) == 1 for arr in search[1:]]:
+        d &= dd
+    r[1:] -= d
+    r[:-1] -= d
+    return [np.repeat(arr, r).reshape(-1, 2) for arr in index]
+
+
+class ArgException(Exception):
+    pass
+
+
+class FileIOException(Exception):
+    pass
+
+
+class BinArrayFile:
+    # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
+    # numpy array. Use inherrited class in a with clause to ensure file is
+    # closed when it goes out of scope.
+    def __init__(self, filename, element_size):
+        self.element_size = element_size
+        self.filename = filename
+        self.fd = os.open(self.filename, os.O_RDONLY)
+
+    def cleanup(self):
+        os.close(self.fd)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.cleanup()
+
+    def _readin(self, offset, buffer):
+        length = os.preadv(self.fd, (buffer,), offset)
+        if len(buffer) != length:
+            raise FileIOException('error: {} failed to read {} bytes at {:x}'
+                            .format(self.filename, len(buffer), offset))
+
+    def _toarray(self, buf):
+        assert(self.element_size == 8)
+        return np.frombuffer(buf, dtype=np.uint64)
+
+    def getv(self, vec):
+        vec *= self.element_size
+        offsets = vec[:, 0]
+        lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
+        buf = bytearray(int(np.sum(lengths)))
+        view = memoryview(buf)
+        pos = 0
+        for offset, length in zip(offsets, lengths):
+            offset = int(offset)
+            length = int(length)
+            self._readin(offset, view[pos:pos+length])
+            pos += length
+        return self._toarray(buf)
+
+    def get(self, index, nr=1):
+        offset = index * self.element_size
+        length = nr * self.element_size
+        buf = bytearray(length)
+        self._readin(offset, buf)
+        return self._toarray(buf)
+
+
+PM_PAGE_PRESENT = 1 << 63
+PM_PFN_MASK = (1 << 55) - 1
+
+class PageMap(BinArrayFile):
+    # Read ranges of a given pid's pagemap into a numpy array.
+    def __init__(self, pid='self'):
+        super().__init__(f'/proc/{pid}/pagemap', 8)
+
+
+KPF_ANON = 1 << 12
+KPF_COMPOUND_HEAD = 1 << 15
+KPF_COMPOUND_TAIL = 1 << 16
+KPF_THP = 1 << 22
+
+class KPageFlags(BinArrayFile):
+    # Read ranges of /proc/kpageflags into a numpy array.
+    def __init__(self):
+         super().__init__(f'/proc/kpageflags', 8)
+
+
+vma_all_stats = set([
+    "Size",
+    "Rss",
+    "Pss",
+    "Pss_Dirty",
+    "Shared_Clean",
+    "Shared_Dirty",
+    "Private_Clean",
+    "Private_Dirty",
+    "Referenced",
+    "Anonymous",
+    "KSM",
+    "LazyFree",
+    "AnonHugePages",
+    "ShmemPmdMapped",
+    "FilePmdMapped",
+    "Shared_Hugetlb",
+    "Private_Hugetlb",
+    "Swap",
+    "SwapPss",
+    "Locked",
+])
+
+vma_min_stats = set([
+    "Rss",
+    "Anonymous",
+    "AnonHugePages",
+    "ShmemPmdMapped",
+    "FilePmdMapped",
+])
+
+VMA = collections.namedtuple('VMA', [
+    'name',
+    'start',
+    'end',
+    'read',
+    'write',
+    'execute',
+    'private',
+    'pgoff',
+    'major',
+    'minor',
+    'inode',
+    'stats',
+])
+
+class VMAList:
+    # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
+    # instance to receive VMAs.
+    def __init__(self, pid='self', stats=[]):
+        self.vmas = []
+        with open(f'/proc/{pid}/smaps', 'r') as file:
+            for line in file:
+                elements = line.split()
+                if '-' in elements[0]:
+                    start, end = map(lambda x: int(x, 16), elements[0].split('-'))
+                    major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
+                    self.vmas.append(VMA(
+                        name=elements[5] if len(elements) == 6 else '',
+                        start=start,
+                        end=end,
+                        read=elements[1][0] == 'r',
+                        write=elements[1][1] == 'w',
+                        execute=elements[1][2] == 'x',
+                        private=elements[1][3] == 'p',
+                        pgoff=int(elements[2], 16),
+                        major=major,
+                        minor=minor,
+                        inode=int(elements[4], 16),
+                        stats={},
+                    ))
+                else:
+                    param = elements[0][:-1]
+                    if param in stats:
+                        value = int(elements[1])
+                        self.vmas[-1].stats[param] = {'type': None, 'value': value}
+
+    def __iter__(self):
+        yield from self.vmas
+
+
+def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
+    # Given 4 same-sized arrays representing a range within a page table backed
+    # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+    # True if page is anonymous, heads: True if page is head of a THP), return a
+    # dictionary of statistics describing the mapped THPs.
+    stats = {
+        'file': {
+            'partial': 0,
+            'aligned': [0] * (PMD_ORDER + 1),
+            'unaligned': [0] * (PMD_ORDER + 1),
+        },
+        'anon': {
+            'partial': 0,
+            'aligned': [0] * (PMD_ORDER + 1),
+            'unaligned': [0] * (PMD_ORDER + 1),
+        },
+    }
+
+    for rindex, rpfn in zip(ranges[0], ranges[2]):
+        index_next = int(rindex[0])
+        index_end = int(rindex[1]) + 1
+        pfn_end = int(rpfn[1]) + 1
+
+        folios = indexes[index_next:index_end][heads[index_next:index_end]]
+
+        # Account pages for any partially mapped THP at the front. In that case,
+        # the first page of the range is a tail.
+        nr = (int(folios[0]) if len(folios) else index_end) - index_next
+        stats['anon' if anons[index_next] else 'file']['partial'] += nr
+
+        # Account pages for any partially mapped THP at the back. In that case,
+        # the next page after the range is a tail.
+        if len(folios):
+            flags = int(kpageflags.get(pfn_end)[0])
+            if flags & KPF_COMPOUND_TAIL:
+                nr = index_end - int(folios[-1])
+                folios = folios[:-1]
+                index_end -= nr
+                stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr
+
+        # Account fully mapped THPs in the middle of the range.
+        if len(folios):
+            folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
+            folio_orders = np.log2(folio_nrs).astype(np.uint64)
+            for index, order in zip(folios, folio_orders):
+                index = int(index)
+                order = int(order)
+                nr = 1 << order
+                vfn = int(vfns[index])
+                align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
+                anon = 'anon' if anons[index] else 'file'
+                stats[anon][align][order] += nr
+
+    # Account PMD-mapped THPs spearately, so filter out of the stats. There is a
+    # race between acquiring the smaps stats and reading pagemap, where memory
+    # could be deallocated. So clamp to zero incase it would have gone negative.
+    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+                      vma.stats['FilePmdMapped']['value']
+    stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
+    stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
+
+    rstats = {
+        f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+        f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
+    }
+
+    def flatten_sub(type, subtype, stats):
+        param = f"{type}-thp-pte-{subtype}-{{}}kB"
+        for od, nr in enumerate(stats[2:], 2):
+            rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
+
+    def flatten_type(type, stats):
+        flatten_sub(type, 'aligned', stats['aligned'])
+        flatten_sub(type, 'unaligned', stats['unaligned'])
+        rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
+
+    flatten_type('anon', stats['anon'])
+    flatten_type('file', stats['file'])
+
+    return rstats
+
+
+def cont_parse(vma, order, ranges, anons, heads):
+    # Given 4 same-sized arrays representing a range within a page table backed
+    # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
+    # True if page is anonymous, heads: True if page is head of a THP), return a
+    # dictionary of statistics describing the contiguous blocks.
+    nr_cont = 1 << order
+    nr_anon = 0
+    nr_file = 0
+
+    for rindex, rvfn, rpfn in zip(*ranges):
+        index_next = int(rindex[0])
+        index_end = int(rindex[1]) + 1
+        vfn_start = int(rvfn[0])
+        pfn_start = int(rpfn[0])
+
+        if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
+            continue
+
+        off = align_forward(vfn_start, nr_cont) - vfn_start
+        index_next += off
+
+        while index_next + nr_cont <= index_end:
+            folio_boundary = heads[index_next+1:index_next+nr_cont].any()
+            if not folio_boundary:
+                if anons[index_next]:
+                    nr_anon += nr_cont
+                else:
+                    nr_file += nr_cont
+            index_next += nr_cont
+
+    # Account blocks that are PMD-mapped spearately, so filter out of the stats.
+    # There is a race between acquiring the smaps stats and reading pagemap,
+    # where memory could be deallocated. So clamp to zero incase it would have
+    # gone negative.
+    anon_pmd_mapped = vma.stats['AnonHugePages']['value']
+    file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
+                    vma.stats['FilePmdMapped']['value']
+    nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
+    nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
+
+    rstats = {
+        f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
+        f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
+    }
+
+    rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
+    rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}
+
+    return rstats
+
+
+def vma_print(vma, pid):
+    # Prints a VMA instance in a format similar to smaps. The main difference is
+    # that the pid is included as the first value.
+    print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
+        .format(
+            pid, vma.start, vma.end,
+            'r' if vma.read else '-', 'w' if vma.write else '-',
+            'x' if vma.execute else '-', 'p' if vma.private else 's',
+            vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
+        ))
+
+
+def stats_print(stats, tot_anon, tot_file, inc_empty):
+    # Print a statistics dictionary.
+    label_field = 32
+    for label, stat in stats.items():
+        type = stat['type']
+        value = stat['value']
+        if value or inc_empty:
+            pad = max(0, label_field - len(label) - 1)
+            if type == 'anon' and tot_anon > 0:
+                percent = f' ({value / tot_anon:3.0%})'
+            elif type == 'file' and tot_file > 0:
+                percent = f' ({value / tot_file:3.0%})'
+            else:
+                percent = ''
+            print(f"{label}:{' ' * pad}{value:8} kB{percent}")
+
+
+def vma_parse(vma, pagemap, kpageflags, contorders):
+    # Generate thp and cont statistics for a single VMA.
+    start = vma.start >> PAGE_SHIFT
+    end = vma.end >> PAGE_SHIFT
+
+    pmes = pagemap.get(start, end - start)
+    present = pmes & PM_PAGE_PRESENT != 0
+    pfns = pmes & PM_PFN_MASK
+    pfns = pfns[present]
+    vfns = np.arange(start, end, dtype=np.uint64)
+    vfns = vfns[present]
+
+    pfn_vec = cont_ranges_all([pfns], [pfns])[0]
+    flags = kpageflags.getv(pfn_vec)
+    anons = flags & KPF_ANON != 0
+    heads = flags & KPF_COMPOUND_HEAD != 0
+    thps = flags & KPF_THP != 0
+
+    vfns = vfns[thps]
+    pfns = pfns[thps]
+    anons = anons[thps]
+    heads = heads[thps]
+
+    indexes = np.arange(len(vfns), dtype=np.uint64)
+    ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])
+
+    thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
+    contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]
+
+    tot_anon = vma.stats['Anonymous']['value']
+    tot_file = vma.stats['Rss']['value'] - tot_anon
+
+    return {
+        **thpstats,
+        **{k: v for s in contstats for k, v in s.items()}
+    }, tot_anon, tot_file
+
+
+def do_main(args):
+    pids = set()
+    rollup = {}
+    rollup_anon = 0
+    rollup_file = 0
+
+    if args.cgroup:
+        strict = False
+        for walk_info in os.walk(args.cgroup):
+            cgroup = walk_info[0]
+            with open(f'{cgroup}/cgroup.procs') as pidfile:
+                for line in pidfile.readlines():
+                    pids.add(int(line.strip()))
+    elif args.pid:
+        strict = True
+        pids = pids.union(args.pid)
+    else:
+        strict = False
+        for pid in os.listdir('/proc'):
+            if pid.isdigit():
+                pids.add(int(pid))
+
+    if not args.rollup:
+        print("       PID             START              END PROT   OFFSET   DEV    INODE OBJECT")
+
+    for pid in pids:
+        try:
+            with PageMap(pid) as pagemap:
+                with KPageFlags() as kpageflags:
+                    for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
+                        if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
+                            stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
+                        else:
+                            stats = {}
+                            vma_anon = 0
+                            vma_file = 0
+                        if args.inc_smaps:
+                            stats = {**vma.stats, **stats}
+                        if args.rollup:
+                            for k, v in stats.items():
+                                if k in rollup:
+                                    assert(rollup[k]['type'] == v['type'])
+                                    rollup[k]['value'] += v['value']
+                                else:
+                                    rollup[k] = v
+                            rollup_anon += vma_anon
+                            rollup_file += vma_file
+                        else:
+                            vma_print(vma, pid)
+                            stats_print(stats, vma_anon, vma_file, args.inc_empty)
+        except (FileNotFoundError, ProcessLookupError, FileIOException):
+            if strict:
+                raise
+
+    if args.rollup:
+        stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
+
+
+def main():
+    docs_width = shutil.get_terminal_size().columns
+    docs_width -= 2
+    docs_width = min(80, docs_width)
+
+    def format(string):
+        text = re.sub(r'\s+', ' ', string)
+        text = re.sub(r'\s*\\n\s*', '\n', text)
+        paras = text.split('\n')
+        paras = [textwrap.fill(p, width=docs_width) for p in paras]
+        return '\n'.join(paras)
+
+    def formatter(prog):
+        return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)
+
+    def size2order(human):
+        units = {
+            "K": 2**10, "M": 2**20, "G": 2**30,
+            "k": 2**10, "m": 2**20, "g": 2**30,
+        }
+        unit = 1
+        if human[-1] in units:
+            unit = units[human[-1]]
+            human = human[:-1]
+        try:
+            size = int(human)
+        except ValueError:
+            raise ArgException('error: --cont value must be integer size with optional KMG unit')
+        size *= unit
+        order = int(math.log2(size / PAGE_SIZE))
+        if order < 1:
+            raise ArgException('error: --cont value must be size of at least 2 pages')
+        if (1 << order) * PAGE_SIZE != size:
+            raise ArgException('error: --cont value must be size of power-of-2 pages')
+        if order > PMD_ORDER:
+            raise ArgException('error: --cont value must be less than or equal to PMD order')
+        return order
+
+    parser = argparse.ArgumentParser(formatter_class=formatter,
+        description=format("""Prints information about how transparent huge
+                    pages are mapped, either system-wide, or for a specified
+                    process or cgroup.\\n
+                    \\n
+                    When run with --pid, the user explicitly specifies the set
+                    of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
+                    with --cgroup, the user passes either a v1 or v2 cgroup and
+                    all pids that belong to the cgroup subtree are scanned. When
+                    run with neither --pid nor --cgroup, the full set of pids on
+                    the system is gathered from /proc and scanned as if the user
+                    had provided "--pid 1 --pid 2 ...".\\n
+                    \\n
+                    A default set of statistics is always generated for THP
+                    mappings. However, it is also possible to generate
+                    additional statistics for "contiguous block mappings" where
+                    the block size is user-defined.\\n
+                    \\n
+                    Statistics are maintained independently for anonymous and
+                    file-backed (pagecache) memory and are shown both in kB and
+                    as a percentage of either total anonymous or total
+                    file-backed memory as appropriate.\\n
+                    \\n
+                    THP Statistics\\n
+                    --------------\\n
+                    \\n
+                    Statistics are always generated for fully- and
+                    contiguously-mapped THPs whose mapping address is aligned to
+                    their size, for each <size> supported by the system.
+                    Separate counters describe THPs mapped by PTE vs those
+                    mapped by PMD. (Although note a THP can only be mapped by
+                    PMD if it is PMD-sized):\\n
+                    \\n
+                    - anon-thp-pte-aligned-<size>kB\\n
+                    - file-thp-pte-aligned-<size>kB\\n
+                    - anon-thp-pmd-aligned-<size>kB\\n
+                    - file-thp-pmd-aligned-<size>kB\\n
+                    \\n
+                    Similarly, statistics are always generated for fully- and
+                    contiguously-mapped THPs whose mapping address is *not*
+                    aligned to their size, for each <size> supported by the
+                    system. Due to the unaligned mapping, it is impossible to
+                    map by PMD, so there are only PTE counters for this case:\\n
+                    \\n
+                    - anon-thp-pte-unaligned-<size>kB\\n
+                    - file-thp-pte-unaligned-<size>kB\\n
+                    \\n
+                    Statistics are also always generated for mapped pages that
+                    belong to a THP but where the is THP is *not* fully- and
+                    contiguously- mapped. These "partial" mappings are all
+                    counted in the same counter regardless of the size of the
+                    THP that is partially mapped:\\n
+                    \\n
+                    - anon-thp-pte-partial\\n
+                    - file-thp-pte-partial\\n
+                    \\n
+                    Contiguous Block Statistics\\n
+                    ---------------------------\\n
+                    \\n
+                    An optional, additional set of statistics is generated for
+                    every contiguous block size specified with `--cont <size>`.
+                    These statistics show how much memory is mapped in
+                    contiguous blocks of <size> and also aligned to <size>. A
+                    given contiguous block must all belong to the same THP, but
+                    there is no requirement for it to be the *whole* THP.
+                    Separate counters describe contiguous blocks mapped by PTE
+                    vs those mapped by PMD:\\n
+                    \\n
+                    - anon-cont-pte-aligned-<size>kB\\n
+                    - file-cont-pte-aligned-<size>kB\\n
+                    - anon-cont-pmd-aligned-<size>kB\\n
+                    - file-cont-pmd-aligned-<size>kB\\n
+                    \\n
+                    As an example, if monitoring 64K contiguous blocks (--cont
+                    64K), there are a number of sources that could provide such
+                    blocks: a fully- and contiguously-mapped 64K THP that is
+                    aligned to a 64K boundary would provide 1 block. A fully-
+                    and contiguously-mapped 128K THP that is aligned to at least
+                    a 64K boundary would provide 2 blocks. Or a 128K THP that
+                    maps its first 100K, but contiguously and starting at a 64K
+                    boundary would provide 1 block. A fully- and
+                    contiguously-mapped 2M THP would provide 32 blocks. There
+                    are many other possible permutations.\\n"""),
+        epilog=format("""Requires root privilege to access pagemap and
+                    kpageflags."""))
+
+    group = parser.add_mutually_exclusive_group(required=False)
+    group.add_argument('--pid',
+        metavar='pid', required=False, type=int, default=[], action='append',
+        help="""Process id of the target process. Maybe issued multiple times to
+            scan multiple processes. --pid and --cgroup are mutually exclusive.
+            If neither are provided, all processes are scanned to provide
+            system-wide information.""")
+
+    group.add_argument('--cgroup',
+        metavar='path', required=False,
+        help="""Path to the target cgroup in sysfs. Iterates over every pid in
+            the cgroup and its children. --pid and --cgroup are mutually
+            exclusive. If neither are provided, all processes are scanned to
+            provide system-wide information.""")
+
+    parser.add_argument('--rollup',
+        required=False, default=False, action='store_true',
+        help="""Sum the per-vma statistics to provide a summary over the whole
+            system, process or cgroup.""")
+
+    parser.add_argument('--cont',
+        metavar='size[KMG]', required=False, default=[], action='append',
+        help="""Adds stats for memory that is mapped in contiguous blocks of
+            <size> and also aligned to <size>. May be issued multiple times to
+            track multiple sized blocks. Useful to infer e.g. arm64 contpte and
+            hpa mappings. Size must be a power-of-2 number of pages.""")
+
+    parser.add_argument('--inc-smaps',
+        required=False, default=False, action='store_true',
+        help="""Include all numerical, additive /proc/<pid>/smaps stats in the
+            output.""")
+
+    parser.add_argument('--inc-empty',
+        required=False, default=False, action='store_true',
+        help="""Show all statistics including those whose value is 0.""")
+
+    parser.add_argument('--periodic',
+        metavar='sleep_ms', required=False, type=int,
+        help="""Run in a loop, polling every sleep_ms milliseconds.""")
+
+    args = parser.parse_args()
+
+    try:
+        args.cont = [size2order(cont) for cont in args.cont]
+    except ArgException as e:
+        parser.print_usage()
+        raise
+
+    if args.periodic:
+        while True:
+            do_main(args)
+            print()
+            time.sleep(args.periodic / 1000)
+    else:
+        do_main(args)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        prog = os.path.basename(sys.argv[0])
+        print(f'{prog}: {e}')
+        exit(1)
diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps
index 07373c5a7afe..f4e8eb79c1b8 100644
--- a/tools/net/ynl/Makefile.deps
+++ b/tools/net/ynl/Makefile.deps
@@ -20,6 +20,7 @@ CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h)
 CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h)
 CFLAGS_mptcp_pm:=$(call get_hdr_inc,_LINUX_MPTCP_PM_H,mptcp_pm.h)
 CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h)
+CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h)
 CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h)
 CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h)
 CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h)
diff --git a/tools/net/ynl/cli.py b/tools/net/ynl/cli.py
index f131e33ac3ee..b8481f401376 100755
--- a/tools/net/ynl/cli.py
+++ b/tools/net/ynl/cli.py
@@ -19,13 +19,30 @@ class YnlEncoder(json.JSONEncoder):
 
 
 def main():
-    parser = argparse.ArgumentParser(description='YNL CLI sample')
+    description = """
+    YNL CLI utility - a general purpose netlink utility that uses YAML
+    specs to drive protocol encoding and decoding.
+    """
+    epilog = """
+    The --multi option can be repeated to include several do operations
+    in the same netlink payload.
+    """
+
+    parser = argparse.ArgumentParser(description=description,
+                                     epilog=epilog)
     parser.add_argument('--spec', dest='spec', type=str, required=True)
     parser.add_argument('--schema', dest='schema', type=str)
     parser.add_argument('--no-schema', action='store_true')
     parser.add_argument('--json', dest='json_text', type=str)
-    parser.add_argument('--do', dest='do', type=str)
-    parser.add_argument('--dump', dest='dump', type=str)
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str)
+    group.add_argument('--multi', dest='multi', nargs=2, action='append',
+                       metavar=('DO-OPERATION', 'JSON_TEXT'), type=str)
+    group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str)
+    group.add_argument('--list-ops', action='store_true')
+    group.add_argument('--list-msgs', action='store_true')
+
     parser.add_argument('--sleep', dest='sleep', type=int)
     parser.add_argument('--subscribe', dest='ntf', type=str)
     parser.add_argument('--replace', dest='flags', action='append_const',
@@ -66,6 +83,13 @@ def main():
     if args.sleep:
         time.sleep(args.sleep)
 
+    if args.list_ops:
+        for op_name, op in ynl.ops.items():
+            print(op_name, " [", ", ".join(op.modes), "]")
+    if args.list_msgs:
+        for op_name, op in ynl.msgs.items():
+            print(op_name, " [", ", ".join(op.modes), "]")
+
     try:
         if args.do:
             reply = ynl.do(args.do, attrs, args.flags)
@@ -73,6 +97,10 @@ def main():
         if args.dump:
             reply = ynl.dump(args.dump, attrs)
             output(reply)
+        if args.multi:
+            ops = [ (item[0], json.loads(item[1]), args.flags or []) for item in args.multi ]
+            reply = ynl.do_multi(ops)
+            output(reply)
     except NlError as e:
         print(e)
         exit(1)
diff --git a/tools/net/ynl/ethtool.py b/tools/net/ynl/ethtool.py
index 6c9f7e31250c..63c471f075ab 100755
--- a/tools/net/ynl/ethtool.py
+++ b/tools/net/ynl/ethtool.py
@@ -6,6 +6,7 @@ import json
 import pprint
 import sys
 import re
+import os
 
 from lib import YnlFamily
 
@@ -152,8 +153,11 @@ def main():
     global args
     args = parser.parse_args()
 
-    spec = '../../../Documentation/netlink/specs/ethtool.yaml'
-    schema = '../../../Documentation/netlink/genetlink-legacy.yaml'
+    script_abs_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
+    spec = os.path.join(script_abs_dir,
+                        '../../../Documentation/netlink/specs/ethtool.yaml')
+    schema = os.path.join(script_abs_dir,
+                          '../../../Documentation/netlink/genetlink-legacy.yaml')
 
     ynl = YnlFamily(spec, schema)
 
@@ -320,7 +324,13 @@ def main():
         return
 
     if args.show_time_stamping:
-        tsinfo = dumpit(ynl, args, 'tsinfo-get')
+        req = {
+          'header': {
+            'flags': 'stats',
+          },
+        }
+
+        tsinfo = dumpit(ynl, args, 'tsinfo-get', req)
 
         print(f'Time stamping parameters for {args.device}:')
 
@@ -334,6 +344,9 @@ def main():
 
         print('Hardware Receive Filter Modes:')
         [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])]
+
+        print('Statistics:')
+        [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()]
         return
 
     print(f'Settings for {args.device}:')
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index 6d08ab9e213f..b6d6f8aef423 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -335,6 +335,7 @@ class SpecOperation(SpecElement):
 
         req_value       numerical ID when serialized, user -> kernel
         rsp_value       numerical ID when serialized, user <- kernel
+        modes           supported operation modes (do, dump, event etc.)
         is_call         bool, whether the operation is a call
         is_async        bool, whether the operation is a notification
         is_resv         bool, whether the operation does not exist (it's just a reserved ID)
@@ -350,6 +351,7 @@ class SpecOperation(SpecElement):
         self.req_value = req_value
         self.rsp_value = rsp_value
 
+        self.modes = yaml.keys() & {'do', 'dump', 'event', 'notify'}
         self.is_call = 'do' in yaml or 'dump' in yaml
         self.is_async = 'notify' in yaml or 'event' in yaml
         self.is_resv = not self.is_async and not self.is_call
diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h
index 9842e85a8c57..eef7c6324ed4 100644
--- a/tools/net/ynl/lib/ynl.h
+++ b/tools/net/ynl/lib/ynl.h
@@ -91,6 +91,18 @@ void ynl_sock_destroy(struct ynl_sock *ys);
 	     !ynl_dump_obj_is_last(iter);				\
 	     iter = ynl_dump_obj_next(iter))
 
+/**
+ * ynl_dump_empty() - does the dump have no entries
+ * @dump: pointer to the dump list, as returned by a dump call
+ *
+ * Check if the dump is empty, i.e. contains no objects.
+ * Dump calls return NULL on error, and terminator element if empty.
+ */
+static inline bool ynl_dump_empty(void *dump)
+{
+	return dump == (void *)YNL_LIST_END;
+}
+
 int ynl_subscribe(struct ynl_sock *ys, const char *grp_name);
 int ynl_socket_get_fd(struct ynl_sock *ys);
 int ynl_ntf_check(struct ynl_sock *ys);
diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py
index 5fa7957f6e0f..35e666928119 100644
--- a/tools/net/ynl/lib/ynl.py
+++ b/tools/net/ynl/lib/ynl.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
 from collections import namedtuple
+from enum import Enum
 import functools
 import os
 import random
@@ -76,13 +77,33 @@ class Netlink:
     NLMSGERR_ATTR_MISS_TYPE = 5
     NLMSGERR_ATTR_MISS_NEST = 6
 
+    # Policy types
+    NL_POLICY_TYPE_ATTR_TYPE = 1
+    NL_POLICY_TYPE_ATTR_MIN_VALUE_S = 2
+    NL_POLICY_TYPE_ATTR_MAX_VALUE_S = 3
+    NL_POLICY_TYPE_ATTR_MIN_VALUE_U = 4
+    NL_POLICY_TYPE_ATTR_MAX_VALUE_U = 5
+    NL_POLICY_TYPE_ATTR_MIN_LENGTH = 6
+    NL_POLICY_TYPE_ATTR_MAX_LENGTH = 7
+    NL_POLICY_TYPE_ATTR_POLICY_IDX = 8
+    NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE = 9
+    NL_POLICY_TYPE_ATTR_BITFIELD32_MASK = 10
+    NL_POLICY_TYPE_ATTR_PAD = 11
+    NL_POLICY_TYPE_ATTR_MASK = 12
+
+    AttrType = Enum('AttrType', ['flag', 'u8', 'u16', 'u32', 'u64',
+                                  's8', 's16', 's32', 's64',
+                                  'binary', 'string', 'nul-string',
+                                  'nested', 'nested-array',
+                                  'bitfield32', 'sint', 'uint'])
 
 class NlError(Exception):
   def __init__(self, nl_msg):
     self.nl_msg = nl_msg
+    self.error = -nl_msg.error
 
   def __str__(self):
-    return f"Netlink error: {os.strerror(-self.nl_msg.error)}\n{self.nl_msg}"
+    return f"Netlink error: {os.strerror(self.error)}\n{self.nl_msg}"
 
 
 class ConfigError(Exception):
@@ -182,6 +203,7 @@ class NlMsg:
             self.done = 1
             extack_off = 20
         elif self.nl_type == Netlink.NLMSG_DONE:
+            self.error = struct.unpack("i", self.raw[0:4])[0]
             self.done = 1
             extack_off = 4
 
@@ -198,6 +220,8 @@ class NlMsg:
                     self.extack['miss-nest'] = extack.as_scalar('u32')
                 elif extack.type == Netlink.NLMSGERR_ATTR_OFFS:
                     self.extack['bad-attr-offs'] = extack.as_scalar('u32')
+                elif extack.type == Netlink.NLMSGERR_ATTR_POLICY:
+                    self.extack['policy'] = self._decode_policy(extack.raw)
                 else:
                     if 'unknown' not in self.extack:
                         self.extack['unknown'] = []
@@ -209,10 +233,33 @@ class NlMsg:
                     miss_type = self.extack['miss-type']
                     if miss_type in attr_space.attrs_by_val:
                         spec = attr_space.attrs_by_val[miss_type]
-                        desc = spec['name']
+                        self.extack['miss-type'] = spec['name']
                         if 'doc' in spec:
-                            desc += f" ({spec['doc']})"
-                        self.extack['miss-type'] = desc
+                            self.extack['miss-type-doc'] = spec['doc']
+
+    def _decode_policy(self, raw):
+        policy = {}
+        for attr in NlAttrs(raw):
+            if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE:
+                type = attr.as_scalar('u32')
+                policy['type'] = Netlink.AttrType(type).name
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S:
+                policy['min-value'] = attr.as_scalar('s64')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S:
+                policy['max-value'] = attr.as_scalar('s64')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_U:
+                policy['min-value'] = attr.as_scalar('u64')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_U:
+                policy['max-value'] = attr.as_scalar('u64')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_LENGTH:
+                policy['min-length'] = attr.as_scalar('u32')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_LENGTH:
+                policy['max-length'] = attr.as_scalar('u32')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_BITFIELD32_MASK:
+                policy['bitfield32-mask'] = attr.as_scalar('u32')
+            elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MASK:
+                policy['mask'] = attr.as_scalar('u64')
+        return policy
 
     def cmd(self):
         return self.nl_type
@@ -339,12 +386,9 @@ class NetlinkProtocol:
     def _decode(self, nl_msg):
         return nl_msg
 
-    def decode(self, ynl, nl_msg):
+    def decode(self, ynl, nl_msg, op):
         msg = self._decode(nl_msg)
-        fixed_header_size = 0
-        if ynl:
-            op = ynl.rsp_by_value[msg.cmd()]
-            fixed_header_size = ynl._struct_size(op.fixed_header)
+        fixed_header_size = ynl._struct_size(op.fixed_header)
         msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size)
         return msg
 
@@ -584,15 +628,28 @@ class YnlFamily(SpecFamily):
                 decoded = self._formatted_string(decoded, attr_spec.display_hint)
         return decoded
 
-    def _decode_array_nest(self, attr, attr_spec):
+    def _decode_array_attr(self, attr, attr_spec):
         decoded = []
         offset = 0
         while offset < len(attr.raw):
             item = NlAttr(attr.raw, offset)
             offset += item.full_len
 
-            subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes'])
-            decoded.append({ item.type: subattrs })
+            if attr_spec["sub-type"] == 'nest':
+                subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes'])
+                decoded.append({ item.type: subattrs })
+            elif attr_spec["sub-type"] == 'binary':
+                subattrs = item.as_bin()
+                if attr_spec.display_hint:
+                    subattrs = self._formatted_string(subattrs, attr_spec.display_hint)
+                decoded.append(subattrs)
+            elif attr_spec["sub-type"] in NlAttr.type_formats:
+                subattrs = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order)
+                if attr_spec.display_hint:
+                    subattrs = self._formatted_string(subattrs, attr_spec.display_hint)
+                decoded.append(subattrs)
+            else:
+                raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}')
         return decoded
 
     def _decode_nest_type_value(self, attr, attr_spec):
@@ -686,8 +743,8 @@ class YnlFamily(SpecFamily):
                 decoded = attr.as_scalar(attr_spec['type'], attr_spec.byte_order)
                 if 'enum' in attr_spec:
                     decoded = self._decode_enum(decoded, attr_spec)
-            elif attr_spec["type"] == 'array-nest':
-                decoded = self._decode_array_nest(attr, attr_spec)
+            elif attr_spec["type"] == 'indexed-array':
+                decoded = self._decode_array_attr(attr, attr_spec)
             elif attr_spec["type"] == 'bitfield32':
                 value, selector = struct.unpack("II", attr.raw)
                 if 'enum' in attr_spec:
@@ -737,7 +794,7 @@ class YnlFamily(SpecFamily):
         if 'bad-attr-offs' not in extack:
             return
 
-        msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set))
+        msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op)
         offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header)
         path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset,
                                         extack['bad-attr-offs'])
@@ -819,7 +876,10 @@ class YnlFamily(SpecFamily):
         if display_hint == 'mac':
             formatted = ':'.join('%02x' % b for b in raw)
         elif display_hint == 'hex':
-            formatted = bytes.hex(raw, ' ')
+            if isinstance(raw, int):
+                formatted = hex(raw)
+            else:
+                formatted = bytes.hex(raw, ' ')
         elif display_hint in [ 'ipv4', 'ipv6' ]:
             formatted = format(ipaddress.ip_address(raw))
         elif display_hint == 'uuid':
@@ -859,7 +919,8 @@ class YnlFamily(SpecFamily):
                     print("Netlink done while checking for ntf!?")
                     continue
 
-                decoded = self.nlproto.decode(self, nl_msg)
+                op = self.rsp_by_value[nl_msg.cmd()]
+                decoded = self.nlproto.decode(self, nl_msg, op)
                 if decoded.cmd() not in self.async_msg_ids:
                     print("Unexpected msg id done while checking for ntf", decoded)
                     continue
@@ -877,16 +938,11 @@ class YnlFamily(SpecFamily):
 
       return op['do']['request']['attributes'].copy()
 
-    def _op(self, method, vals, flags=None, dump=False):
-        op = self.ops[method]
-
+    def _encode_message(self, op, vals, flags, req_seq):
         nl_flags = Netlink.NLM_F_REQUEST | Netlink.NLM_F_ACK
         for flag in flags or []:
             nl_flags |= flag
-        if dump:
-            nl_flags |= Netlink.NLM_F_DUMP
 
-        req_seq = random.randint(1024, 65535)
         msg = self.nlproto.message(nl_flags, op.req_value, 1, req_seq)
         if op.fixed_header:
             msg += self._encode_struct(op.fixed_header, vals)
@@ -894,18 +950,36 @@ class YnlFamily(SpecFamily):
         for name, value in vals.items():
             msg += self._add_attr(op.attr_set.name, name, value, search_attrs)
         msg = _genl_msg_finalize(msg)
+        return msg
+
+    def _ops(self, ops):
+        reqs_by_seq = {}
+        req_seq = random.randint(1024, 65535)
+        payload = b''
+        for (method, vals, flags) in ops:
+            op = self.ops[method]
+            msg = self._encode_message(op, vals, flags, req_seq)
+            reqs_by_seq[req_seq] = (op, msg, flags)
+            payload += msg
+            req_seq += 1
 
-        self.sock.send(msg, 0)
+        self.sock.send(payload, 0)
 
         done = False
         rsp = []
+        op_rsp = []
         while not done:
             reply = self.sock.recv(self._recv_size)
             nms = NlMsgs(reply, attr_space=op.attr_set)
             self._recv_dbg_print(reply, nms)
             for nl_msg in nms:
-                if nl_msg.extack:
-                    self._decode_extack(msg, op, nl_msg.extack)
+                if nl_msg.nl_seq in reqs_by_seq:
+                    (op, req_msg, req_flags) = reqs_by_seq[nl_msg.nl_seq]
+                    if nl_msg.extack:
+                        self._decode_extack(req_msg, op, nl_msg.extack)
+                else:
+                    op = self.rsp_by_value[nl_msg.cmd()]
+                    req_flags = []
 
                 if nl_msg.error:
                     raise NlError(nl_msg)
@@ -913,13 +987,25 @@ class YnlFamily(SpecFamily):
                     if nl_msg.extack:
                         print("Netlink warning:")
                         print(nl_msg)
-                    done = True
+
+                    if Netlink.NLM_F_DUMP in req_flags:
+                        rsp.append(op_rsp)
+                    elif not op_rsp:
+                        rsp.append(None)
+                    elif len(op_rsp) == 1:
+                        rsp.append(op_rsp[0])
+                    else:
+                        rsp.append(op_rsp)
+                    op_rsp = []
+
+                    del reqs_by_seq[nl_msg.nl_seq]
+                    done = len(reqs_by_seq) == 0
                     break
 
-                decoded = self.nlproto.decode(self, nl_msg)
+                decoded = self.nlproto.decode(self, nl_msg, op)
 
                 # Check if this is a reply to our request
-                if nl_msg.nl_seq != req_seq or decoded.cmd() != op.rsp_value:
+                if nl_msg.nl_seq not in reqs_by_seq or decoded.cmd() != op.rsp_value:
                     if decoded.cmd() in self.async_msg_ids:
                         self.handle_ntf(decoded)
                         continue
@@ -930,16 +1016,23 @@ class YnlFamily(SpecFamily):
                 rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name)
                 if op.fixed_header:
                     rsp_msg.update(self._decode_struct(decoded.raw, op.fixed_header))
-                rsp.append(rsp_msg)
+                op_rsp.append(rsp_msg)
 
-        if not rsp:
-            return None
-        if not dump and len(rsp) == 1:
-            return rsp[0]
         return rsp
 
+    def _op(self, method, vals, flags=None, dump=False):
+        req_flags = flags or []
+        if dump:
+            req_flags.append(Netlink.NLM_F_DUMP)
+
+        ops = [(method, vals, req_flags)]
+        return self._ops(ops)[0]
+
     def do(self, method, vals, flags=None):
         return self._op(method, vals, flags)
 
     def dump(self, method, vals):
-        return self._op(method, vals, [], dump=True)
+        return self._op(method, vals, dump=True)
+
+    def do_multi(self, ops):
+        return self._ops(ops)
diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c
index 591b90e21890..3e7b29bd55d5 100644
--- a/tools/net/ynl/samples/netdev.c
+++ b/tools/net/ynl/samples/netdev.c
@@ -100,6 +100,8 @@ int main(int argc, char **argv)
 		if (!devs)
 			goto err_close;
 
+		if (ynl_dump_empty(devs))
+			fprintf(stderr, "Error: no devices reported\n");
 		ynl_dump_foreach(devs, d)
 			netdev_print_device(d, 0);
 		netdev_dev_get_list_free(devs);
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index 6b7eb2d2aaf1..a42d62b23ee0 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -228,8 +228,11 @@ class Type(SpecAttr):
         presence = ''
         for i in range(0, len(ref)):
             presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}"
-            if self.presence_type() == 'bit':
-                code.append(presence + ' = 1;')
+            # Every layer below last is a nest, so we know it uses bit presence
+            # last layer is "self" and may be a complex type
+            if i == len(ref) - 1 and self.presence_type() != 'bit':
+                continue
+            code.append(presence + ' = 1;')
         code += self._setter_lines(ri, member, presence)
 
         func_name = f"{op_prefix(ri, direction, deref=deref)}_set_{'_'.join(ref)}"
@@ -410,7 +413,7 @@ class TypeString(Type):
 
     def _attr_policy(self, policy):
         if 'exact-len' in self.checks:
-            mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')'
+            mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')'
         else:
             mem = '{ .type = ' + policy
             if 'max-len' in self.checks:
@@ -462,7 +465,7 @@ class TypeBinary(Type):
 
     def _attr_policy(self, policy):
         if 'exact-len' in self.checks:
-            mem = 'NLA_POLICY_EXACT_LEN(' + str(self.checks['exact-len']) + ')'
+            mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')'
         else:
             mem = '{ '
             if len(self.checks) == 1 and 'min-len' in self.checks:
@@ -838,8 +841,11 @@ class AttrSet(SpecAttrSet):
             t = TypeBitfield32(self.family, self, elem, value)
         elif elem['type'] == 'nest':
             t = TypeNest(self.family, self, elem, value)
-        elif elem['type'] == 'array-nest':
-            t = TypeArrayNest(self.family, self, elem, value)
+        elif elem['type'] == 'indexed-array' and 'sub-type' in elem:
+            if elem["sub-type"] == 'nest':
+                t = TypeArrayNest(self.family, self, elem, value)
+            else:
+                raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}')
         elif elem['type'] == 'nest-type-value':
             t = TypeNestTypeValue(self.family, self, elem, value)
         else:
@@ -1052,7 +1058,7 @@ class Family(SpecFamily):
                     if nested in self.root_sets:
                         raise Exception("Inheriting members to a space used as root not supported")
                     inherit.update(set(spec['type-value']))
-                elif spec['type'] == 'array-nest':
+                elif spec['type'] == 'indexed-array':
                     inherit.add('idx')
                 self.pure_nested_structs[nested].set_inherited(inherit)
 
@@ -1616,9 +1622,12 @@ def _multi_parse(ri, struct, init_lines, local_vars):
     multi_attrs = set()
     needs_parg = False
     for arg, aspec in struct.member_list():
-        if aspec['type'] == 'array-nest':
-            local_vars.append(f'const struct nlattr *attr_{aspec.c_name};')
-            array_nests.add(arg)
+        if aspec['type'] == 'indexed-array' and 'sub-type' in aspec:
+            if aspec["sub-type"] == 'nest':
+                local_vars.append(f'const struct nlattr *attr_{aspec.c_name};')
+                array_nests.add(arg)
+            else:
+                raise Exception(f'Not supported sub-type {aspec["sub-type"]}')
         if 'multi-attr' in aspec:
             multi_attrs.add(arg)
         needs_parg |= 'nested-attributes' in aspec
diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py
index 927407b3efb3..657e881d2ea4 100755
--- a/tools/net/ynl/ynl-gen-rst.py
+++ b/tools/net/ynl/ynl-gen-rst.py
@@ -82,9 +82,9 @@ def rst_subsubsection(title: str) -> str:
     return f"{title}\n" + "~" * len(title)
 
 
-def rst_section(title: str) -> str:
+def rst_section(namespace: str, prefix: str, title: str) -> str:
     """Add a section to the document"""
-    return f"\n{title}\n" + "=" * len(title)
+    return f".. _{namespace}-{prefix}-{title}:\n\n{title}\n" + "=" * len(title)
 
 
 def rst_subtitle(title: str) -> str:
@@ -102,6 +102,17 @@ def rst_list_inline(list_: List[str], level: int = 0) -> str:
     return headroom(level) + "[" + ", ".join(inline(i) for i in list_) + "]"
 
 
+def rst_ref(namespace: str, prefix: str, name: str) -> str:
+    """Add a hyperlink to the document"""
+    mappings = {'enum': 'definition',
+                'fixed-header': 'definition',
+                'nested-attributes': 'attribute-set',
+                'struct': 'definition'}
+    if prefix in mappings:
+        prefix = mappings[prefix]
+    return f":ref:`{namespace}-{prefix}-{name}`"
+
+
 def rst_header() -> str:
     """The headers for all the auto generated RST files"""
     lines = []
@@ -159,20 +170,24 @@ def parse_do_attributes(attrs: Dict[str, Any], level: int = 0) -> str:
     return "\n".join(lines)
 
 
-def parse_operations(operations: List[Dict[str, Any]]) -> str:
+def parse_operations(operations: List[Dict[str, Any]], namespace: str) -> str:
     """Parse operations block"""
     preprocessed = ["name", "doc", "title", "do", "dump"]
+    linkable = ["fixed-header", "attribute-set"]
     lines = []
 
     for operation in operations:
-        lines.append(rst_section(operation["name"]))
+        lines.append(rst_section(namespace, 'operation', operation["name"]))
         lines.append(rst_paragraph(sanitize(operation["doc"])) + "\n")
 
         for key in operation.keys():
             if key in preprocessed:
                 # Skip the special fields
                 continue
-            lines.append(rst_fields(key, operation[key], 0))
+            value = operation[key]
+            if key in linkable:
+                value = rst_ref(namespace, key, value)
+            lines.append(rst_fields(key, value, 0))
 
         if "do" in operation:
             lines.append(rst_paragraph(":do:", 0))
@@ -212,14 +227,14 @@ def parse_entries(entries: List[Dict[str, Any]], level: int) -> str:
     return "\n".join(lines)
 
 
-def parse_definitions(defs: Dict[str, Any]) -> str:
+def parse_definitions(defs: Dict[str, Any], namespace: str) -> str:
     """Parse definitions section"""
     preprocessed = ["name", "entries", "members"]
     ignored = ["render-max"]  # This is not printed
     lines = []
 
     for definition in defs:
-        lines.append(rst_section(definition["name"]))
+        lines.append(rst_section(namespace, 'definition', definition["name"]))
         for k in definition.keys():
             if k in preprocessed + ignored:
                 continue
@@ -237,14 +252,15 @@ def parse_definitions(defs: Dict[str, Any]) -> str:
     return "\n".join(lines)
 
 
-def parse_attr_sets(entries: List[Dict[str, Any]]) -> str:
+def parse_attr_sets(entries: List[Dict[str, Any]], namespace: str) -> str:
     """Parse attribute from attribute-set"""
     preprocessed = ["name", "type"]
+    linkable = ["enum", "nested-attributes", "struct", "sub-message"]
     ignored = ["checks"]
     lines = []
 
     for entry in entries:
-        lines.append(rst_section(entry["name"]))
+        lines.append(rst_section(namespace, 'attribute-set', entry["name"]))
         for attr in entry["attributes"]:
             type_ = attr.get("type")
             attr_line = attr["name"]
@@ -257,25 +273,31 @@ def parse_attr_sets(entries: List[Dict[str, Any]]) -> str:
             for k in attr.keys():
                 if k in preprocessed + ignored:
                     continue
-                lines.append(rst_fields(k, sanitize(attr[k]), 0))
+                if k in linkable:
+                    value = rst_ref(namespace, k, attr[k])
+                else:
+                    value = sanitize(attr[k])
+                lines.append(rst_fields(k, value, 0))
             lines.append("\n")
 
     return "\n".join(lines)
 
 
-def parse_sub_messages(entries: List[Dict[str, Any]]) -> str:
+def parse_sub_messages(entries: List[Dict[str, Any]], namespace: str) -> str:
     """Parse sub-message definitions"""
     lines = []
 
     for entry in entries:
-        lines.append(rst_section(entry["name"]))
+        lines.append(rst_section(namespace, 'sub-message', entry["name"]))
         for fmt in entry["formats"]:
             value = fmt["value"]
 
             lines.append(rst_bullet(bold(value)))
             for attr in ['fixed-header', 'attribute-set']:
                 if attr in fmt:
-                    lines.append(rst_fields(attr, fmt[attr], 1))
+                    lines.append(rst_fields(attr,
+                                            rst_ref(namespace, attr, fmt[attr]),
+                                            1))
             lines.append("\n")
 
     return "\n".join(lines)
@@ -289,9 +311,11 @@ def parse_yaml(obj: Dict[str, Any]) -> str:
 
     lines.append(rst_header())
 
-    title = f"Family ``{obj['name']}`` netlink specification"
+    family = obj['name']
+
+    title = f"Family ``{family}`` netlink specification"
     lines.append(rst_title(title))
-    lines.append(rst_paragraph(".. contents::\n"))
+    lines.append(rst_paragraph(".. contents:: :depth: 3\n"))
 
     if "doc" in obj:
         lines.append(rst_subtitle("Summary"))
@@ -300,7 +324,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str:
     # Operations
     if "operations" in obj:
         lines.append(rst_subtitle("Operations"))
-        lines.append(parse_operations(obj["operations"]["list"]))
+        lines.append(parse_operations(obj["operations"]["list"], family))
 
     # Multicast groups
     if "mcast-groups" in obj:
@@ -310,17 +334,17 @@ def parse_yaml(obj: Dict[str, Any]) -> str:
     # Definitions
     if "definitions" in obj:
         lines.append(rst_subtitle("Definitions"))
-        lines.append(parse_definitions(obj["definitions"]))
+        lines.append(parse_definitions(obj["definitions"], family))
 
     # Attributes set
     if "attribute-sets" in obj:
         lines.append(rst_subtitle("Attribute sets"))
-        lines.append(parse_attr_sets(obj["attribute-sets"]))
+        lines.append(parse_attr_sets(obj["attribute-sets"], family))
 
     # Sub-messages
     if "sub-messages" in obj:
         lines.append(rst_subtitle("Sub-messages"))
-        lines.append(parse_sub_messages(obj["sub-messages"]))
+        lines.append(parse_sub_messages(obj["sub-messages"], family))
 
     return "\n".join(lines)
 
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 83b100c1e7f6..bf7f7f84ac62 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -57,6 +57,10 @@ ifeq ($(SRCARCH),x86)
 	BUILD_ORC := y
 endif
 
+ifeq ($(SRCARCH),loongarch)
+	BUILD_ORC := y
+endif
+
 export BUILD_ORC
 export srctree OUTPUT CFLAGS SRCARCH AWK
 include $(srctree)/tools/build/Makefile.include
diff --git a/tools/objtool/arch/loongarch/Build b/tools/objtool/arch/loongarch/Build
new file mode 100644
index 000000000000..1d4b784b6887
--- /dev/null
+++ b/tools/objtool/arch/loongarch/Build
@@ -0,0 +1,3 @@
+objtool-y += decode.o
+objtool-y += special.o
+objtool-y += orc.o
diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c
new file mode 100644
index 000000000000..aee479d2191c
--- /dev/null
+++ b/tools/objtool/arch/loongarch/decode.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <string.h>
+#include <objtool/check.h>
+#include <objtool/warn.h>
+#include <asm/inst.h>
+#include <asm/orc_types.h>
+#include <linux/objtool_types.h>
+
+#ifndef EM_LOONGARCH
+#define EM_LOONGARCH	258
+#endif
+
+int arch_ftrace_match(char *name)
+{
+	return !strcmp(name, "_mcount");
+}
+
+unsigned long arch_jump_destination(struct instruction *insn)
+{
+	return insn->offset + (insn->immediate << 2);
+}
+
+unsigned long arch_dest_reloc_offset(int addend)
+{
+	return addend;
+}
+
+bool arch_pc_relative_reloc(struct reloc *reloc)
+{
+	return false;
+}
+
+bool arch_callee_saved_reg(unsigned char reg)
+{
+	switch (reg) {
+	case CFI_RA:
+	case CFI_FP:
+	case CFI_S0 ... CFI_S8:
+		return true;
+	default:
+		return false;
+	}
+}
+
+int arch_decode_hint_reg(u8 sp_reg, int *base)
+{
+	switch (sp_reg) {
+	case ORC_REG_UNDEFINED:
+		*base = CFI_UNDEFINED;
+		break;
+	case ORC_REG_SP:
+		*base = CFI_SP;
+		break;
+	case ORC_REG_FP:
+		*base = CFI_FP;
+		break;
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static bool is_loongarch(const struct elf *elf)
+{
+	if (elf->ehdr.e_machine == EM_LOONGARCH)
+		return true;
+
+	WARN("unexpected ELF machine type %d", elf->ehdr.e_machine);
+	return false;
+}
+
+#define ADD_OP(op) \
+	if (!(op = calloc(1, sizeof(*op)))) \
+		return -1; \
+	else for (*ops_list = op, ops_list = &op->next; op; op = NULL)
+
+static bool decode_insn_reg0i26_fomat(union loongarch_instruction inst,
+				      struct instruction *insn)
+{
+	switch (inst.reg0i26_format.opcode) {
+	case b_op:
+		insn->type = INSN_JUMP_UNCONDITIONAL;
+		insn->immediate = sign_extend64(inst.reg0i26_format.immediate_h << 16 |
+						inst.reg0i26_format.immediate_l, 25);
+		break;
+	case bl_op:
+		insn->type = INSN_CALL;
+		insn->immediate = sign_extend64(inst.reg0i26_format.immediate_h << 16 |
+						inst.reg0i26_format.immediate_l, 25);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool decode_insn_reg1i21_fomat(union loongarch_instruction inst,
+				      struct instruction *insn)
+{
+	switch (inst.reg1i21_format.opcode) {
+	case beqz_op:
+	case bnez_op:
+	case bceqz_op:
+		insn->type = INSN_JUMP_CONDITIONAL;
+		insn->immediate = sign_extend64(inst.reg1i21_format.immediate_h << 16 |
+						inst.reg1i21_format.immediate_l, 20);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool decode_insn_reg2i12_fomat(union loongarch_instruction inst,
+				      struct instruction *insn,
+				      struct stack_op **ops_list,
+				      struct stack_op *op)
+{
+	switch (inst.reg2i12_format.opcode) {
+	case addid_op:
+		if ((inst.reg2i12_format.rd == CFI_SP) || (inst.reg2i12_format.rj == CFI_SP)) {
+			/* addi.d sp,sp,si12 or addi.d fp,sp,si12 */
+			insn->immediate = sign_extend64(inst.reg2i12_format.immediate, 11);
+			ADD_OP(op) {
+				op->src.type = OP_SRC_ADD;
+				op->src.reg = inst.reg2i12_format.rj;
+				op->src.offset = insn->immediate;
+				op->dest.type = OP_DEST_REG;
+				op->dest.reg = inst.reg2i12_format.rd;
+			}
+		}
+		break;
+	case ldd_op:
+		if (inst.reg2i12_format.rj == CFI_SP) {
+			/* ld.d rd,sp,si12 */
+			insn->immediate = sign_extend64(inst.reg2i12_format.immediate, 11);
+			ADD_OP(op) {
+				op->src.type = OP_SRC_REG_INDIRECT;
+				op->src.reg = CFI_SP;
+				op->src.offset = insn->immediate;
+				op->dest.type = OP_DEST_REG;
+				op->dest.reg = inst.reg2i12_format.rd;
+			}
+		}
+		break;
+	case std_op:
+		if (inst.reg2i12_format.rj == CFI_SP) {
+			/* st.d rd,sp,si12 */
+			insn->immediate = sign_extend64(inst.reg2i12_format.immediate, 11);
+			ADD_OP(op) {
+				op->src.type = OP_SRC_REG;
+				op->src.reg = inst.reg2i12_format.rd;
+				op->dest.type = OP_DEST_REG_INDIRECT;
+				op->dest.reg = CFI_SP;
+				op->dest.offset = insn->immediate;
+			}
+		}
+		break;
+	case andi_op:
+		if (inst.reg2i12_format.rd == 0 &&
+		    inst.reg2i12_format.rj == 0 &&
+		    inst.reg2i12_format.immediate == 0)
+			/* andi r0,r0,0 */
+			insn->type = INSN_NOP;
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool decode_insn_reg2i14_fomat(union loongarch_instruction inst,
+				      struct instruction *insn,
+				      struct stack_op **ops_list,
+				      struct stack_op *op)
+{
+	switch (inst.reg2i14_format.opcode) {
+	case ldptrd_op:
+		if (inst.reg2i14_format.rj == CFI_SP) {
+			/* ldptr.d rd,sp,si14 */
+			insn->immediate = sign_extend64(inst.reg2i14_format.immediate, 13);
+			ADD_OP(op) {
+				op->src.type = OP_SRC_REG_INDIRECT;
+				op->src.reg = CFI_SP;
+				op->src.offset = insn->immediate;
+				op->dest.type = OP_DEST_REG;
+				op->dest.reg = inst.reg2i14_format.rd;
+			}
+		}
+		break;
+	case stptrd_op:
+		if (inst.reg2i14_format.rj == CFI_SP) {
+			/* stptr.d ra,sp,0 */
+			if (inst.reg2i14_format.rd == LOONGARCH_GPR_RA &&
+			    inst.reg2i14_format.immediate == 0)
+				break;
+
+			/* stptr.d rd,sp,si14 */
+			insn->immediate = sign_extend64(inst.reg2i14_format.immediate, 13);
+			ADD_OP(op) {
+				op->src.type = OP_SRC_REG;
+				op->src.reg = inst.reg2i14_format.rd;
+				op->dest.type = OP_DEST_REG_INDIRECT;
+				op->dest.reg = CFI_SP;
+				op->dest.offset = insn->immediate;
+			}
+		}
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst,
+				      struct instruction *insn)
+{
+	switch (inst.reg2i16_format.opcode) {
+	case jirl_op:
+		if (inst.reg2i16_format.rd == 0 &&
+		    inst.reg2i16_format.rj == CFI_RA &&
+		    inst.reg2i16_format.immediate == 0) {
+			/* jirl r0,ra,0 */
+			insn->type = INSN_RETURN;
+		} else if (inst.reg2i16_format.rd == CFI_RA) {
+			/* jirl ra,rj,offs16 */
+			insn->type = INSN_CALL_DYNAMIC;
+		} else if (inst.reg2i16_format.rd == CFI_A0 &&
+			   inst.reg2i16_format.immediate == 0) {
+			/*
+			 * jirl a0,t0,0
+			 * this is a special case in loongarch_suspend_enter,
+			 * just treat it as a call instruction.
+			 */
+			insn->type = INSN_CALL_DYNAMIC;
+		} else if (inst.reg2i16_format.rd == 0 &&
+			   inst.reg2i16_format.immediate == 0) {
+			/* jirl r0,rj,0 */
+			insn->type = INSN_JUMP_DYNAMIC;
+		} else if (inst.reg2i16_format.rd == 0 &&
+			   inst.reg2i16_format.immediate != 0) {
+			/*
+			 * jirl r0,t0,12
+			 * this is a rare case in JUMP_VIRT_ADDR,
+			 * just ignore it due to it is harmless for tracing.
+			 */
+			break;
+		} else {
+			/* jirl rd,rj,offs16 */
+			insn->type = INSN_JUMP_UNCONDITIONAL;
+			insn->immediate = sign_extend64(inst.reg2i16_format.immediate, 15);
+		}
+		break;
+	case beq_op:
+	case bne_op:
+	case blt_op:
+	case bge_op:
+	case bltu_op:
+	case bgeu_op:
+		insn->type = INSN_JUMP_CONDITIONAL;
+		insn->immediate = sign_extend64(inst.reg2i16_format.immediate, 15);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+int arch_decode_instruction(struct objtool_file *file, const struct section *sec,
+			    unsigned long offset, unsigned int maxlen,
+			    struct instruction *insn)
+{
+	struct stack_op **ops_list = &insn->stack_ops;
+	const struct elf *elf = file->elf;
+	struct stack_op *op = NULL;
+	union loongarch_instruction inst;
+
+	if (!is_loongarch(elf))
+		return -1;
+
+	if (maxlen < LOONGARCH_INSN_SIZE)
+		return 0;
+
+	insn->len = LOONGARCH_INSN_SIZE;
+	insn->type = INSN_OTHER;
+	insn->immediate = 0;
+
+	inst = *(union loongarch_instruction *)(sec->data->d_buf + offset);
+
+	if (decode_insn_reg0i26_fomat(inst, insn))
+		return 0;
+	if (decode_insn_reg1i21_fomat(inst, insn))
+		return 0;
+	if (decode_insn_reg2i12_fomat(inst, insn, ops_list, op))
+		return 0;
+	if (decode_insn_reg2i14_fomat(inst, insn, ops_list, op))
+		return 0;
+	if (decode_insn_reg2i16_fomat(inst, insn))
+		return 0;
+
+	if (inst.word == 0)
+		insn->type = INSN_NOP;
+	else if (inst.reg0i15_format.opcode == break_op) {
+		/* break */
+		insn->type = INSN_BUG;
+	} else if (inst.reg2_format.opcode == ertn_op) {
+		/* ertn */
+		insn->type = INSN_RETURN;
+	}
+
+	return 0;
+}
+
+const char *arch_nop_insn(int len)
+{
+	static u32 nop;
+
+	if (len != LOONGARCH_INSN_SIZE)
+		WARN("invalid NOP size: %d\n", len);
+
+	nop = LOONGARCH_INSN_NOP;
+
+	return (const char *)&nop;
+}
+
+const char *arch_ret_insn(int len)
+{
+	static u32 ret;
+
+	if (len != LOONGARCH_INSN_SIZE)
+		WARN("invalid RET size: %d\n", len);
+
+	emit_jirl((union loongarch_instruction *)&ret, LOONGARCH_GPR_RA, LOONGARCH_GPR_ZERO, 0);
+
+	return (const char *)&ret;
+}
+
+void arch_initial_func_cfi_state(struct cfi_init_state *state)
+{
+	int i;
+
+	for (i = 0; i < CFI_NUM_REGS; i++) {
+		state->regs[i].base = CFI_UNDEFINED;
+		state->regs[i].offset = 0;
+	}
+
+	/* initial CFA (call frame address) */
+	state->cfa.base = CFI_SP;
+	state->cfa.offset = 0;
+}
diff --git a/tools/objtool/arch/loongarch/include/arch/cfi_regs.h b/tools/objtool/arch/loongarch/include/arch/cfi_regs.h
new file mode 100644
index 000000000000..d183cc8f43bf
--- /dev/null
+++ b/tools/objtool/arch/loongarch/include/arch/cfi_regs.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_ARCH_CFI_REGS_H
+#define _OBJTOOL_ARCH_CFI_REGS_H
+
+#define CFI_RA		1
+#define CFI_SP		3
+#define CFI_A0		4
+#define CFI_FP		22
+#define CFI_S0		23
+#define CFI_S1		24
+#define CFI_S2		25
+#define CFI_S3		26
+#define CFI_S4		27
+#define CFI_S5		28
+#define CFI_S6		29
+#define CFI_S7		30
+#define CFI_S8		31
+#define CFI_NUM_REGS	32
+
+#define CFI_BP		CFI_FP
+
+#endif /* _OBJTOOL_ARCH_CFI_REGS_H */
diff --git a/tools/objtool/arch/loongarch/include/arch/elf.h b/tools/objtool/arch/loongarch/include/arch/elf.h
new file mode 100644
index 000000000000..9623d663220e
--- /dev/null
+++ b/tools/objtool/arch/loongarch/include/arch/elf.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_ARCH_ELF_H
+#define _OBJTOOL_ARCH_ELF_H
+
+/*
+ * See the following link for more info about ELF Relocation types:
+ * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html#_relocations
+ */
+#ifndef R_LARCH_NONE
+#define R_LARCH_NONE		0
+#endif
+#ifndef R_LARCH_32
+#define R_LARCH_32		1
+#endif
+#ifndef R_LARCH_64
+#define R_LARCH_64		2
+#endif
+#ifndef R_LARCH_32_PCREL
+#define R_LARCH_32_PCREL	99
+#endif
+
+#define R_NONE			R_LARCH_NONE
+#define R_ABS32			R_LARCH_32
+#define R_ABS64			R_LARCH_64
+#define R_DATA32		R_LARCH_32_PCREL
+#define R_DATA64		R_LARCH_32_PCREL
+#define R_TEXT32		R_LARCH_32_PCREL
+#define R_TEXT64		R_LARCH_32_PCREL
+
+#endif /* _OBJTOOL_ARCH_ELF_H */
diff --git a/tools/objtool/arch/loongarch/include/arch/special.h b/tools/objtool/arch/loongarch/include/arch/special.h
new file mode 100644
index 000000000000..35fc979b550a
--- /dev/null
+++ b/tools/objtool/arch/loongarch/include/arch/special.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_ARCH_SPECIAL_H
+#define _OBJTOOL_ARCH_SPECIAL_H
+
+/*
+ * See more info about struct exception_table_entry
+ * in arch/loongarch/include/asm/extable.h
+ */
+#define EX_ENTRY_SIZE		12
+#define EX_ORIG_OFFSET		0
+#define EX_NEW_OFFSET		4
+
+/*
+ * See more info about struct jump_entry
+ * in include/linux/jump_label.h
+ */
+#define JUMP_ENTRY_SIZE		16
+#define JUMP_ORIG_OFFSET	0
+#define JUMP_NEW_OFFSET		4
+#define JUMP_KEY_OFFSET		8
+
+/*
+ * See more info about struct alt_instr
+ * in arch/loongarch/include/asm/alternative.h
+ */
+#define ALT_ENTRY_SIZE		12
+#define ALT_ORIG_OFFSET		0
+#define ALT_NEW_OFFSET		4
+#define ALT_FEATURE_OFFSET	8
+#define ALT_ORIG_LEN_OFFSET	10
+#define ALT_NEW_LEN_OFFSET	11
+
+#endif /* _OBJTOOL_ARCH_SPECIAL_H */
diff --git a/tools/objtool/arch/loongarch/orc.c b/tools/objtool/arch/loongarch/orc.c
new file mode 100644
index 000000000000..873536d009d9
--- /dev/null
+++ b/tools/objtool/arch/loongarch/orc.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/objtool_types.h>
+#include <asm/orc_types.h>
+
+#include <objtool/check.h>
+#include <objtool/orc.h>
+#include <objtool/warn.h>
+#include <objtool/endianness.h>
+
+int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn)
+{
+	struct cfi_reg *fp = &cfi->regs[CFI_FP];
+	struct cfi_reg *ra = &cfi->regs[CFI_RA];
+
+	memset(orc, 0, sizeof(*orc));
+
+	if (!cfi) {
+		/*
+		 * This is usually either unreachable nops/traps (which don't
+		 * trigger unreachable instruction warnings), or
+		 * STACK_FRAME_NON_STANDARD functions.
+		 */
+		orc->type = ORC_TYPE_UNDEFINED;
+		return 0;
+	}
+
+	switch (cfi->type) {
+	case UNWIND_HINT_TYPE_UNDEFINED:
+		orc->type = ORC_TYPE_UNDEFINED;
+		return 0;
+	case UNWIND_HINT_TYPE_END_OF_STACK:
+		orc->type = ORC_TYPE_END_OF_STACK;
+		return 0;
+	case UNWIND_HINT_TYPE_CALL:
+		orc->type = ORC_TYPE_CALL;
+		break;
+	case UNWIND_HINT_TYPE_REGS:
+		orc->type = ORC_TYPE_REGS;
+		break;
+	case UNWIND_HINT_TYPE_REGS_PARTIAL:
+		orc->type = ORC_TYPE_REGS_PARTIAL;
+		break;
+	default:
+		WARN_INSN(insn, "unknown unwind hint type %d", cfi->type);
+		return -1;
+	}
+
+	orc->signal = cfi->signal;
+
+	switch (cfi->cfa.base) {
+	case CFI_SP:
+		orc->sp_reg = ORC_REG_SP;
+		break;
+	case CFI_FP:
+		orc->sp_reg = ORC_REG_FP;
+		break;
+	default:
+		WARN_INSN(insn, "unknown CFA base reg %d", cfi->cfa.base);
+		return -1;
+	}
+
+	switch (fp->base) {
+	case CFI_UNDEFINED:
+		orc->fp_reg = ORC_REG_UNDEFINED;
+		orc->fp_offset = 0;
+		break;
+	case CFI_CFA:
+		orc->fp_reg = ORC_REG_PREV_SP;
+		orc->fp_offset = fp->offset;
+		break;
+	case CFI_FP:
+		orc->fp_reg = ORC_REG_FP;
+		break;
+	default:
+		WARN_INSN(insn, "unknown FP base reg %d", fp->base);
+		return -1;
+	}
+
+	switch (ra->base) {
+	case CFI_UNDEFINED:
+		orc->ra_reg = ORC_REG_UNDEFINED;
+		orc->ra_offset = 0;
+		break;
+	case CFI_CFA:
+		orc->ra_reg = ORC_REG_PREV_SP;
+		orc->ra_offset = ra->offset;
+		break;
+	case CFI_FP:
+		orc->ra_reg = ORC_REG_FP;
+		break;
+	default:
+		WARN_INSN(insn, "unknown RA base reg %d", ra->base);
+		return -1;
+	}
+
+	orc->sp_offset = cfi->cfa.offset;
+
+	return 0;
+}
+
+int write_orc_entry(struct elf *elf, struct section *orc_sec,
+		    struct section *ip_sec, unsigned int idx,
+		    struct section *insn_sec, unsigned long insn_off,
+		    struct orc_entry *o)
+{
+	struct orc_entry *orc;
+
+	/* populate ORC data */
+	orc = (struct orc_entry *)orc_sec->data->d_buf + idx;
+	memcpy(orc, o, sizeof(*orc));
+
+	/* populate reloc for ip */
+	if (!elf_init_reloc_text_sym(elf, ip_sec, idx * sizeof(int), idx,
+				     insn_sec, insn_off))
+		return -1;
+
+	return 0;
+}
+
+static const char *reg_name(unsigned int reg)
+{
+	switch (reg) {
+	case ORC_REG_SP:
+		return "sp";
+	case ORC_REG_FP:
+		return "fp";
+	case ORC_REG_PREV_SP:
+		return "prevsp";
+	default:
+		return "?";
+	}
+}
+
+static const char *orc_type_name(unsigned int type)
+{
+	switch (type) {
+	case UNWIND_HINT_TYPE_CALL:
+		return "call";
+	case UNWIND_HINT_TYPE_REGS:
+		return "regs";
+	case UNWIND_HINT_TYPE_REGS_PARTIAL:
+		return "regs (partial)";
+	default:
+		return "?";
+	}
+}
+
+static void print_reg(unsigned int reg, int offset)
+{
+	if (reg == ORC_REG_UNDEFINED)
+		printf(" (und) ");
+	else
+		printf("%s + %3d", reg_name(reg), offset);
+
+}
+
+void orc_print_dump(struct elf *dummy_elf, struct orc_entry *orc, int i)
+{
+	printf("type:%s", orc_type_name(orc[i].type));
+
+	printf(" sp:");
+	print_reg(orc[i].sp_reg, orc[i].sp_offset);
+
+	printf(" fp:");
+	print_reg(orc[i].fp_reg, orc[i].fp_offset);
+
+	printf(" ra:");
+	print_reg(orc[i].ra_reg, orc[i].ra_offset);
+
+	printf(" signal:%d\n", orc[i].signal);
+}
diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c
new file mode 100644
index 000000000000..9bba1e9318e0
--- /dev/null
+++ b/tools/objtool/arch/loongarch/special.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <objtool/special.h>
+
+bool arch_support_alt_relocation(struct special_alt *special_alt,
+				 struct instruction *insn,
+				 struct reloc *reloc)
+{
+	return false;
+}
+
+struct reloc *arch_find_switch_table(struct objtool_file *file,
+				     struct instruction *insn)
+{
+	return NULL;
+}
diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build
index 9f7869b5c5e0..3dedb2fd8f3a 100644
--- a/tools/objtool/arch/x86/Build
+++ b/tools/objtool/arch/x86/Build
@@ -1,5 +1,6 @@
 objtool-y += special.o
 objtool-y += decode.o
+objtool-y += orc.o
 
 inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = ../arch/x86/lib/x86-opcode-map.txt
diff --git a/tools/objtool/arch/x86/orc.c b/tools/objtool/arch/x86/orc.c
new file mode 100644
index 000000000000..b6cd943e87f9
--- /dev/null
+++ b/tools/objtool/arch/x86/orc.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/objtool_types.h>
+#include <asm/orc_types.h>
+
+#include <objtool/check.h>
+#include <objtool/orc.h>
+#include <objtool/warn.h>
+#include <objtool/endianness.h>
+
+int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn)
+{
+	struct cfi_reg *bp = &cfi->regs[CFI_BP];
+
+	memset(orc, 0, sizeof(*orc));
+
+	if (!cfi) {
+		/*
+		 * This is usually either unreachable nops/traps (which don't
+		 * trigger unreachable instruction warnings), or
+		 * STACK_FRAME_NON_STANDARD functions.
+		 */
+		orc->type = ORC_TYPE_UNDEFINED;
+		return 0;
+	}
+
+	switch (cfi->type) {
+	case UNWIND_HINT_TYPE_UNDEFINED:
+		orc->type = ORC_TYPE_UNDEFINED;
+		return 0;
+	case UNWIND_HINT_TYPE_END_OF_STACK:
+		orc->type = ORC_TYPE_END_OF_STACK;
+		return 0;
+	case UNWIND_HINT_TYPE_CALL:
+		orc->type = ORC_TYPE_CALL;
+		break;
+	case UNWIND_HINT_TYPE_REGS:
+		orc->type = ORC_TYPE_REGS;
+		break;
+	case UNWIND_HINT_TYPE_REGS_PARTIAL:
+		orc->type = ORC_TYPE_REGS_PARTIAL;
+		break;
+	default:
+		WARN_INSN(insn, "unknown unwind hint type %d", cfi->type);
+		return -1;
+	}
+
+	orc->signal = cfi->signal;
+
+	switch (cfi->cfa.base) {
+	case CFI_SP:
+		orc->sp_reg = ORC_REG_SP;
+		break;
+	case CFI_SP_INDIRECT:
+		orc->sp_reg = ORC_REG_SP_INDIRECT;
+		break;
+	case CFI_BP:
+		orc->sp_reg = ORC_REG_BP;
+		break;
+	case CFI_BP_INDIRECT:
+		orc->sp_reg = ORC_REG_BP_INDIRECT;
+		break;
+	case CFI_R10:
+		orc->sp_reg = ORC_REG_R10;
+		break;
+	case CFI_R13:
+		orc->sp_reg = ORC_REG_R13;
+		break;
+	case CFI_DI:
+		orc->sp_reg = ORC_REG_DI;
+		break;
+	case CFI_DX:
+		orc->sp_reg = ORC_REG_DX;
+		break;
+	default:
+		WARN_INSN(insn, "unknown CFA base reg %d", cfi->cfa.base);
+		return -1;
+	}
+
+	switch (bp->base) {
+	case CFI_UNDEFINED:
+		orc->bp_reg = ORC_REG_UNDEFINED;
+		break;
+	case CFI_CFA:
+		orc->bp_reg = ORC_REG_PREV_SP;
+		break;
+	case CFI_BP:
+		orc->bp_reg = ORC_REG_BP;
+		break;
+	default:
+		WARN_INSN(insn, "unknown BP base reg %d", bp->base);
+		return -1;
+	}
+
+	orc->sp_offset = cfi->cfa.offset;
+	orc->bp_offset = bp->offset;
+
+	return 0;
+}
+
+int write_orc_entry(struct elf *elf, struct section *orc_sec,
+		    struct section *ip_sec, unsigned int idx,
+		    struct section *insn_sec, unsigned long insn_off,
+		    struct orc_entry *o)
+{
+	struct orc_entry *orc;
+
+	/* populate ORC data */
+	orc = (struct orc_entry *)orc_sec->data->d_buf + idx;
+	memcpy(orc, o, sizeof(*orc));
+	orc->sp_offset = bswap_if_needed(elf, orc->sp_offset);
+	orc->bp_offset = bswap_if_needed(elf, orc->bp_offset);
+
+	/* populate reloc for ip */
+	if (!elf_init_reloc_text_sym(elf, ip_sec, idx * sizeof(int), idx,
+				     insn_sec, insn_off))
+		return -1;
+
+	return 0;
+}
+
+static const char *reg_name(unsigned int reg)
+{
+	switch (reg) {
+	case ORC_REG_PREV_SP:
+		return "prevsp";
+	case ORC_REG_DX:
+		return "dx";
+	case ORC_REG_DI:
+		return "di";
+	case ORC_REG_BP:
+		return "bp";
+	case ORC_REG_SP:
+		return "sp";
+	case ORC_REG_R10:
+		return "r10";
+	case ORC_REG_R13:
+		return "r13";
+	case ORC_REG_BP_INDIRECT:
+		return "bp(ind)";
+	case ORC_REG_SP_INDIRECT:
+		return "sp(ind)";
+	default:
+		return "?";
+	}
+}
+
+static const char *orc_type_name(unsigned int type)
+{
+	switch (type) {
+	case ORC_TYPE_UNDEFINED:
+		return "(und)";
+	case ORC_TYPE_END_OF_STACK:
+		return "end";
+	case ORC_TYPE_CALL:
+		return "call";
+	case ORC_TYPE_REGS:
+		return "regs";
+	case ORC_TYPE_REGS_PARTIAL:
+		return "regs (partial)";
+	default:
+		return "?";
+	}
+}
+
+static void print_reg(unsigned int reg, int offset)
+{
+	if (reg == ORC_REG_BP_INDIRECT)
+		printf("(bp%+d)", offset);
+	else if (reg == ORC_REG_SP_INDIRECT)
+		printf("(sp)%+d", offset);
+	else if (reg == ORC_REG_UNDEFINED)
+		printf("(und)");
+	else
+		printf("%s%+d", reg_name(reg), offset);
+}
+
+void orc_print_dump(struct elf *dummy_elf, struct orc_entry *orc, int i)
+{
+	printf("type:%s", orc_type_name(orc[i].type));
+
+	printf(" sp:");
+	print_reg(orc[i].sp_reg, bswap_if_needed(dummy_elf, orc[i].sp_offset));
+
+	printf(" bp:");
+	print_reg(orc[i].bp_reg, bswap_if_needed(dummy_elf, orc[i].bp_offset));
+
+	printf(" signal:%d\n", orc[i].signal);
+}
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index eb7e12ebc1d0..0a33d9195b7a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -20,6 +20,7 @@
 #include <linux/hashtable.h>
 #include <linux/kernel.h>
 #include <linux/static_call_types.h>
+#include <linux/string.h>
 
 struct alternative {
 	struct alternative *next;
@@ -584,7 +585,7 @@ static int add_dead_ends(struct objtool_file *file)
 	struct section *rsec;
 	struct reloc *reloc;
 	struct instruction *insn;
-	s64 addend;
+	uint64_t offset;
 
 	/*
 	 * Check for manually annotated dead ends.
@@ -594,27 +595,28 @@ static int add_dead_ends(struct objtool_file *file)
 		goto reachable;
 
 	for_each_reloc(rsec, reloc) {
-
-		if (reloc->sym->type != STT_SECTION) {
+		if (reloc->sym->type == STT_SECTION) {
+			offset = reloc_addend(reloc);
+		} else if (reloc->sym->local_label) {
+			offset = reloc->sym->offset;
+		} else {
 			WARN("unexpected relocation symbol type in %s", rsec->name);
 			return -1;
 		}
 
-		addend = reloc_addend(reloc);
-
-		insn = find_insn(file, reloc->sym->sec, addend);
+		insn = find_insn(file, reloc->sym->sec, offset);
 		if (insn)
 			insn = prev_insn_same_sec(file, insn);
-		else if (addend == reloc->sym->sec->sh.sh_size) {
+		else if (offset == reloc->sym->sec->sh.sh_size) {
 			insn = find_last_insn(file, reloc->sym->sec);
 			if (!insn) {
 				WARN("can't find unreachable insn at %s+0x%" PRIx64,
-				     reloc->sym->sec->name, addend);
+				     reloc->sym->sec->name, offset);
 				return -1;
 			}
 		} else {
 			WARN("can't find unreachable insn at %s+0x%" PRIx64,
-			     reloc->sym->sec->name, addend);
+			     reloc->sym->sec->name, offset);
 			return -1;
 		}
 
@@ -633,27 +635,28 @@ reachable:
 		return 0;
 
 	for_each_reloc(rsec, reloc) {
-
-		if (reloc->sym->type != STT_SECTION) {
+		if (reloc->sym->type == STT_SECTION) {
+			offset = reloc_addend(reloc);
+		} else if (reloc->sym->local_label) {
+			offset = reloc->sym->offset;
+		} else {
 			WARN("unexpected relocation symbol type in %s", rsec->name);
 			return -1;
 		}
 
-		addend = reloc_addend(reloc);
-
-		insn = find_insn(file, reloc->sym->sec, addend);
+		insn = find_insn(file, reloc->sym->sec, offset);
 		if (insn)
 			insn = prev_insn_same_sec(file, insn);
-		else if (addend == reloc->sym->sec->sh.sh_size) {
+		else if (offset == reloc->sym->sec->sh.sh_size) {
 			insn = find_last_insn(file, reloc->sym->sec);
 			if (!insn) {
 				WARN("can't find reachable insn at %s+0x%" PRIx64,
-				     reloc->sym->sec->name, addend);
+				     reloc->sym->sec->name, offset);
 				return -1;
 			}
 		} else {
 			WARN("can't find reachable insn at %s+0x%" PRIx64,
-			     reloc->sym->sec->name, addend);
+			     reloc->sym->sec->name, offset);
 			return -1;
 		}
 
@@ -2224,6 +2227,7 @@ static int read_unwind_hints(struct objtool_file *file)
 	struct unwind_hint *hint;
 	struct instruction *insn;
 	struct reloc *reloc;
+	unsigned long offset;
 	int i;
 
 	sec = find_section_by_name(file->elf, ".discard.unwind_hints");
@@ -2251,7 +2255,16 @@ static int read_unwind_hints(struct objtool_file *file)
 			return -1;
 		}
 
-		insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc));
+		if (reloc->sym->type == STT_SECTION) {
+			offset = reloc_addend(reloc);
+		} else if (reloc->sym->local_label) {
+			offset = reloc->sym->offset;
+		} else {
+			WARN("unexpected relocation symbol type in %s", sec->rsec->name);
+			return -1;
+		}
+
+		insn = find_insn(file, reloc->sym->sec, offset);
 		if (!insn) {
 			WARN("can't find insn for unwind_hints[%d]", i);
 			return -1;
@@ -2522,6 +2535,9 @@ static int classify_symbols(struct objtool_file *file)
 	struct symbol *func;
 
 	for_each_sym(file, func) {
+		if (func->type == STT_NOTYPE && strstarts(func->name, ".L"))
+			func->local_label = true;
+
 		if (func->bind != STB_GLOBAL)
 			continue;
 
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 9f71e988eca4..2b8a69de4db8 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -67,6 +67,7 @@ struct symbol {
 	u8 profiling_func    : 1;
 	u8 warned	     : 1;
 	u8 embedded_insn     : 1;
+	u8 local_label       : 1;
 	struct list_head pv_target;
 	struct reloc *relocs;
 };
diff --git a/tools/objtool/include/objtool/orc.h b/tools/objtool/include/objtool/orc.h
new file mode 100644
index 000000000000..15a32def1071
--- /dev/null
+++ b/tools/objtool/include/objtool/orc.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _OBJTOOL_ORC_H
+#define _OBJTOOL_ORC_H
+
+#include <objtool/check.h>
+
+int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn);
+void orc_print_dump(struct elf *dummy_elf, struct orc_entry *orc, int i);
+int write_orc_entry(struct elf *elf, struct section *orc_sec,
+		    struct section *ip_sec, unsigned int idx,
+		    struct section *insn_sec, unsigned long insn_off,
+		    struct orc_entry *o);
+
+#endif /* _OBJTOOL_ORC_H */
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 3a301696f005..7ebf29c91184 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -13,7 +13,6 @@ NORETURN(__reiserfs_panic)
 NORETURN(__stack_chk_fail)
 NORETURN(__tdx_hypercall_failed)
 NORETURN(__ubsan_handle_builtin_unreachable)
-NORETURN(arch_call_rest_init)
 NORETURN(arch_cpu_idle_dead)
 NORETURN(bch2_trans_in_restart_error)
 NORETURN(bch2_trans_restart_error)
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 0e183bb1c720..a62247efb64f 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -6,65 +6,10 @@
 #include <unistd.h>
 #include <asm/orc_types.h>
 #include <objtool/objtool.h>
+#include <objtool/orc.h>
 #include <objtool/warn.h>
 #include <objtool/endianness.h>
 
-static const char *reg_name(unsigned int reg)
-{
-	switch (reg) {
-	case ORC_REG_PREV_SP:
-		return "prevsp";
-	case ORC_REG_DX:
-		return "dx";
-	case ORC_REG_DI:
-		return "di";
-	case ORC_REG_BP:
-		return "bp";
-	case ORC_REG_SP:
-		return "sp";
-	case ORC_REG_R10:
-		return "r10";
-	case ORC_REG_R13:
-		return "r13";
-	case ORC_REG_BP_INDIRECT:
-		return "bp(ind)";
-	case ORC_REG_SP_INDIRECT:
-		return "sp(ind)";
-	default:
-		return "?";
-	}
-}
-
-static const char *orc_type_name(unsigned int type)
-{
-	switch (type) {
-	case ORC_TYPE_UNDEFINED:
-		return "(und)";
-	case ORC_TYPE_END_OF_STACK:
-		return "end";
-	case ORC_TYPE_CALL:
-		return "call";
-	case ORC_TYPE_REGS:
-		return "regs";
-	case ORC_TYPE_REGS_PARTIAL:
-		return "regs (partial)";
-	default:
-		return "?";
-	}
-}
-
-static void print_reg(unsigned int reg, int offset)
-{
-	if (reg == ORC_REG_BP_INDIRECT)
-		printf("(bp%+d)", offset);
-	else if (reg == ORC_REG_SP_INDIRECT)
-		printf("(sp)%+d", offset);
-	else if (reg == ORC_REG_UNDEFINED)
-		printf("(und)");
-	else
-		printf("%s%+d", reg_name(reg), offset);
-}
-
 int orc_dump(const char *_objname)
 {
 	int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0;
@@ -205,17 +150,7 @@ int orc_dump(const char *_objname)
 			printf("%llx:", (unsigned long long)(orc_ip_addr + (i * sizeof(int)) + orc_ip[i]));
 		}
 
-		printf("type:%s", orc_type_name(orc[i].type));
-
-		printf(" sp:");
-
-		print_reg(orc[i].sp_reg, bswap_if_needed(&dummy_elf, orc[i].sp_offset));
-
-		printf(" bp:");
-
-		print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset));
-
-		printf(" signal:%d\n", orc[i].signal);
+		orc_print_dump(&dummy_elf, orc, i);
 	}
 
 	elf_end(elf);
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index bae343908867..922e6aac7cea 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -10,121 +10,10 @@
 #include <asm/orc_types.h>
 
 #include <objtool/check.h>
+#include <objtool/orc.h>
 #include <objtool/warn.h>
 #include <objtool/endianness.h>
 
-static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
-			  struct instruction *insn)
-{
-	struct cfi_reg *bp = &cfi->regs[CFI_BP];
-
-	memset(orc, 0, sizeof(*orc));
-
-	if (!cfi) {
-		/*
-		 * This is usually either unreachable nops/traps (which don't
-		 * trigger unreachable instruction warnings), or
-		 * STACK_FRAME_NON_STANDARD functions.
-		 */
-		orc->type = ORC_TYPE_UNDEFINED;
-		return 0;
-	}
-
-	switch (cfi->type) {
-	case UNWIND_HINT_TYPE_UNDEFINED:
-		orc->type = ORC_TYPE_UNDEFINED;
-		return 0;
-	case UNWIND_HINT_TYPE_END_OF_STACK:
-		orc->type = ORC_TYPE_END_OF_STACK;
-		return 0;
-	case UNWIND_HINT_TYPE_CALL:
-		orc->type = ORC_TYPE_CALL;
-		break;
-	case UNWIND_HINT_TYPE_REGS:
-		orc->type = ORC_TYPE_REGS;
-		break;
-	case UNWIND_HINT_TYPE_REGS_PARTIAL:
-		orc->type = ORC_TYPE_REGS_PARTIAL;
-		break;
-	default:
-		WARN_INSN(insn, "unknown unwind hint type %d", cfi->type);
-		return -1;
-	}
-
-	orc->signal = cfi->signal;
-
-	switch (cfi->cfa.base) {
-	case CFI_SP:
-		orc->sp_reg = ORC_REG_SP;
-		break;
-	case CFI_SP_INDIRECT:
-		orc->sp_reg = ORC_REG_SP_INDIRECT;
-		break;
-	case CFI_BP:
-		orc->sp_reg = ORC_REG_BP;
-		break;
-	case CFI_BP_INDIRECT:
-		orc->sp_reg = ORC_REG_BP_INDIRECT;
-		break;
-	case CFI_R10:
-		orc->sp_reg = ORC_REG_R10;
-		break;
-	case CFI_R13:
-		orc->sp_reg = ORC_REG_R13;
-		break;
-	case CFI_DI:
-		orc->sp_reg = ORC_REG_DI;
-		break;
-	case CFI_DX:
-		orc->sp_reg = ORC_REG_DX;
-		break;
-	default:
-		WARN_INSN(insn, "unknown CFA base reg %d", cfi->cfa.base);
-		return -1;
-	}
-
-	switch (bp->base) {
-	case CFI_UNDEFINED:
-		orc->bp_reg = ORC_REG_UNDEFINED;
-		break;
-	case CFI_CFA:
-		orc->bp_reg = ORC_REG_PREV_SP;
-		break;
-	case CFI_BP:
-		orc->bp_reg = ORC_REG_BP;
-		break;
-	default:
-		WARN_INSN(insn, "unknown BP base reg %d", bp->base);
-		return -1;
-	}
-
-	orc->sp_offset = cfi->cfa.offset;
-	orc->bp_offset = bp->offset;
-
-	return 0;
-}
-
-static int write_orc_entry(struct elf *elf, struct section *orc_sec,
-			   struct section *ip_sec, unsigned int idx,
-			   struct section *insn_sec, unsigned long insn_off,
-			   struct orc_entry *o)
-{
-	struct orc_entry *orc;
-
-	/* populate ORC data */
-	orc = (struct orc_entry *)orc_sec->data->d_buf + idx;
-	memcpy(orc, o, sizeof(*orc));
-	orc->sp_offset = bswap_if_needed(elf, orc->sp_offset);
-	orc->bp_offset = bswap_if_needed(elf, orc->bp_offset);
-
-	/* populate reloc for ip */
-	if (!elf_init_reloc_text_sym(elf, ip_sec, idx * sizeof(int), idx,
-				     insn_sec, insn_off))
-		return -1;
-
-	return 0;
-}
-
 struct orc_list_entry {
 	struct list_head list;
 	struct orc_entry orc;
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index 2109690b0d5f..59ab1ff9d75f 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -115,9 +115,13 @@ toggle respectively.
 
 perf script also supports higher level ways to dump instruction traces:
 
+	perf script --insn-trace=disasm
+
+or to use the xed disassembler, which requires installing the xed tool
+(see XED below):
+
 	perf script --insn-trace --xed
 
-Dump all instructions. This requires installing the xed tool (see XED below)
 Dumping all instructions in a long trace can be fairly slow. It is usually better
 to start with higher level decoding, like
 
@@ -130,12 +134,12 @@ or
 and then select a time range of interest. The time range can then be examined
 in detail with
 
-	perf script --time starttime,stoptime --insn-trace --xed
+	perf script --time starttime,stoptime --insn-trace=disasm
 
 While examining the trace it's also useful to filter on specific CPUs using
 the -C option
 
-	perf script --time starttime,stoptime --insn-trace --xed -C 1
+	perf script --time starttime,stoptime --insn-trace=disasm -C 1
 
 Dump all instructions in time range on CPU 1.
 
@@ -1306,7 +1310,7 @@ Without timestamps, --per-thread must be specified to distinguish threads.
 
 perf script can be used to provide an instruction trace
 
- $ perf script --guestkallsyms $KALLSYMS --insn-trace --xed -F+ipc | grep -C10 vmresume | head -21
+ $ perf script --guestkallsyms $KALLSYMS --insn-trace=disasm -F+ipc | grep -C10 vmresume | head -21
        CPU 0/KVM  1440  ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms])                movq  0x48(%rax), %r9
        CPU 0/KVM  1440  ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms])                movq  0x50(%rax), %r10
        CPU 0/KVM  1440  ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms])                movq  0x58(%rax), %r11
@@ -1407,7 +1411,7 @@ There were none.
 
 'perf script' can be used to provide an instruction trace showing timestamps
 
- $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --insn-trace --xed -F+ipc | grep -C10 vmresume | head -21
+ $ perf script -i perf.data.kvm --guestkallsyms $KALLSYMS --insn-trace=disasm -F+ipc | grep -C10 vmresume | head -21
        CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133cdd __vmx_vcpu_run+0x3d ([kernel.kallsyms])                 movq  0x48(%rax), %r9
        CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133ce1 __vmx_vcpu_run+0x41 ([kernel.kallsyms])                 movq  0x50(%rax), %r10
        CPU 1/KVM 17006 [001] 11500.262865593:  ffffffff82133ce5 __vmx_vcpu_run+0x45 ([kernel.kallsyms])                 movq  0x58(%rax), %r11
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 38f59ac064f7..d8b863e01fe0 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -531,8 +531,35 @@ include::itrace.txt[]
 --raw-trace::
 	When displaying traceevent output, do not use print fmt or plugins.
 
+-H::
 --hierarchy::
-	Enable hierarchical output.
+	Enable hierarchical output.  In the hierarchy mode, each sort key groups
+	samples based on the criteria and then sub-divide it using the lower
+	level sort key.
+
+	For example:
+	In normal output:
+
+	  perf report -s dso,sym
+	  # Overhead  Shared Object      Symbol
+	      50.00%  [kernel.kallsyms]  [k] kfunc1
+	      20.00%  perf               [.] foo
+	      15.00%  [kernel.kallsyms]  [k] kfunc2
+	      10.00%  perf               [.] bar
+	       5.00%  libc.so            [.] libcall
+
+	In hierarchy output:
+
+	  perf report -s dso,sym --hierarchy
+	  #   Overhead  Shared Object / Symbol
+	      65.00%    [kernel.kallsyms]
+	        50.00%    [k] kfunc1
+	        15.00%    [k] kfunc2
+	      30.00%    perf
+	        20.00%    [.] foo
+	        10.00%    [.] bar
+	       5.00%    libc.so
+	         5.00%    [.] libcall
 
 --inline::
 	If a callgraph address belongs to an inlined function, the inline stack
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt
index 6a8581012e16..13e37e9385ee 100644
--- a/tools/perf/Documentation/perf-script-python.txt
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -642,8 +642,8 @@ SUPPORTED FIELDS
 
 Currently supported fields:
 
-ev_name, comm, pid, tid, cpu, ip, time, period, phys_addr, addr,
-symbol, symoff, dso, time_enabled, time_running, values, callchain,
+ev_name, comm, id, stream_id, pid, tid, cpu, ip, time, period, phys_addr,
+addr, symbol, symoff, dso, time_enabled, time_running, values, callchain,
 brstack, brstacksym, datasrc, datasrc_decode, iregs, uregs,
 weight, transaction, raw_buf, attr, cpumode.
 
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index ff9a52e44688..005e51df855e 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -132,9 +132,10 @@ OPTIONS
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, dsoff, addr, symoff,
         srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output,
-        brstackinsn, brstackinsnlen, brstackoff, callindent, insn, insnlen, synth,
-        phys_addr, metric, misc, srccode, ipc, data_page_size, code_page_size, ins_lat,
-        machine_pid, vcpu, cgroup, retire_lat.
+        brstackinsn, brstackinsnlen, brstackoff, callindent, insn, disasm,
+        insnlen, synth, phys_addr, metric, misc, srccode, ipc, data_page_size,
+        code_page_size, ins_lat, machine_pid, vcpu, cgroup, retire_lat.
+
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
@@ -217,9 +218,9 @@ OPTIONS
 	Instruction Trace decoding. For calls and returns, it will display the
 	name of the symbol indented with spaces to reflect the stack depth.
 
-	When doing instruction trace decoding insn and insnlen give the
-	instruction bytes and the instruction length of the current
-	instruction.
+	When doing instruction trace decoding, insn, disasm and insnlen give the
+	instruction bytes, disassembled instructions (requires libcapstone support)
+	and the instruction length of the current instruction respectively.
 
 	The synth field is used by synthesized events which may be created when
 	Instruction Trace decoding.
@@ -441,9 +442,10 @@ include::itrace.txt[]
 	will be printed. Each entry has function name and file/line. Enabled by
 	default, disable with --no-inline.
 
---insn-trace::
-	Show instruction stream for intel_pt traces. Combine with --xed to
-	show disassembly.
+--insn-trace[=<raw|disasm>]::
+	Show instruction stream in bytes (raw) or disassembled (disasm)
+	for intel_pt traces. The default is 'raw'. To use xed, combine
+	'raw' with --xed to show disassembly done by xed.
 
 --xed::
 	Run xed disassembler on output. Requires installing the xed disassembler.
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 5af2e432b54f..29756a87ab6f 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide).  The output includes the
 die number and the number of online processors on that die. This is
 useful to gauge the amount of aggregation.
 
+--per-cluster::
+Aggregate counts per processor cluster for system-wide mode measurement.  This
+is a useful mode to detect imbalance between clusters.  To enable this mode,
+use --per-cluster in addition to -a. (system-wide).  The output includes the
+cluster number and the number of online processors on that cluster. This is
+useful to gauge the amount of aggregation. The information of cluster ID and
+related CPUs can be gotten from /sys/devices/system/cpu/cpuX/topology/cluster_{id, cpus}.
+
 --per-cache::
 Aggregate counts per cache instance for system-wide mode measurements.  By
 default, the aggregation happens for the cache level at the highest index
@@ -396,6 +404,9 @@ Aggregate counts per processor socket for system-wide mode measurements.
 --per-die::
 Aggregate counts per processor die for system-wide mode measurements.
 
+--per-cluster::
+Aggregate counts perf processor cluster for system-wide mode measurements.
+
 --per-cache::
 Aggregate counts per cache instance for system-wide mode measurements.  By
 default, the aggregation happens for the cache level at the highest index
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 3c202ec080ba..a754875fa5bb 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -261,8 +261,38 @@ Default is to monitor all CPUS.
 --raw-trace::
 	When displaying traceevent output, do not use print fmt or plugins.
 
+-H::
 --hierarchy::
-	Enable hierarchy output.
+	Enable hierarchical output.  In the hierarchy mode, each sort key groups
+	samples based on the criteria and then sub-divide it using the lower
+	level sort key.
+
+	For example, in normal output:
+
+	  perf report -s dso,sym
+	  #
+	  # Overhead  Shared Object      Symbol
+	  # ........  .................  ...........
+	      50.00%  [kernel.kallsyms]  [k] kfunc1
+	      20.00%  perf               [.] foo
+	      15.00%  [kernel.kallsyms]  [k] kfunc2
+	      10.00%  perf               [.] bar
+	       5.00%  libc.so            [.] libcall
+
+	In hierarchy output:
+
+	  perf report -s dso,sym --hierarchy
+	  #
+	  #   Overhead  Shared Object / Symbol
+	  # ..........  ......................
+	      65.00%    [kernel.kallsyms]
+	        50.00%    [k] kfunc1
+	        15.00%    [k] kfunc2
+	      30.00%    perf
+	        20.00%    [.] foo
+	        10.00%    [.] bar
+	       5.00%    libc.so
+	         5.00%    [.] libcall
 
 --overwrite::
 	Enable this to use just the most recent records, which helps in high core count
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
index a7cf7bc2f968..09f516f3fdfb 100644
--- a/tools/perf/Documentation/perf.txt
+++ b/tools/perf/Documentation/perf.txt
@@ -63,6 +63,8 @@ OPTIONS
                              in browser mode
           perf-event-open  - Print perf_event_open() arguments and
                              return value
+          kmaps            - Print kernel and module maps (perf script
+                             and perf report without browser)
 
 --debug-file::
 	Write debug output to a specified file.
diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt
index 825745a645c1..67b326ba0040 100644
--- a/tools/perf/Documentation/tips.txt
+++ b/tools/perf/Documentation/tips.txt
@@ -2,6 +2,7 @@ For a higher level overview, try: perf report --sort comm,dso
 Sample related events with: perf record -e '{cycles,instructions}:S'
 Compare performance results with: perf diff [<old file> <new file>]
 Boolean options have negative forms, e.g.: perf report --no-children
+To not accumulate CPU time of children symbols add --no-children
 Customize output of perf script with: perf script -F event,ip,sym
 Generate a script for your data: perf script -g <lang>
 Save output of perf stat using: perf stat record <target workload>
@@ -12,32 +13,52 @@ List events using substring match: perf list <keyword>
 To see list of saved events and attributes: perf evlist -v
 Use --symfs <dir> if your symbol files are in non-standard locations
 To see callchains in a more compact form: perf report -g folded
+To see call chains by final symbol taking CPU time (bottom up) use perf report -G
 Show individual samples with: perf script
 Limit to show entries above 5% only: perf report --percent-limit 5
 Profiling branch (mis)predictions with: perf record -b / perf report
-To show assembler sample contexts use perf record -b / perf script -F +brstackinsn --xed
-Treat branches as callchains: perf report --branch-history
-To count events in every 1000 msec: perf stat -I 1000
-Print event counts in CSV format with: perf stat -x,
+To show assembler sample context control flow use perf record -b / perf report --samples 10 and then browse context
+To adjust path to source files to local file system use perf report --prefix=... --prefix-strip=...
+Treat branches as callchains: perf record -b ... ; perf report --branch-history
+Show estimate cycles per function and IPC in annotate use perf record -b ... ; perf report --total-cycles
+To count events every 1000 msec: perf stat -I 1000
+Print event counts in machine readable CSV format with: perf stat -x\;
 If you have debuginfo enabled, try: perf report -s sym,srcline
 For memory address profiling, try: perf mem record / perf mem report
 For tracepoint events, try: perf report -s trace_fields
 To record callchains for each sample: perf record -g
+If call chains don't work try perf record --call-graph dwarf or --call-graph lbr
 To record every process run by a user: perf record -u <user>
+To show inline functions in call traces add --inline to perf report
+To not record events from perf itself add --exclude-perf
 Skip collecting build-id when recording: perf record -B
 To change sampling frequency to 100 Hz: perf record -F 100
+To show information about system the samples were collected on use perf report --header
+To only collect call graph on one event use perf record -e cpu/cpu-cycles,callgraph=1/,branches ; perf report --show-ref-call-graph
+To set sampling period of individual events use perf record -e cpu/cpu-cycles,period=100001/,cpu/branches,period=10001/ ...
+To group events which need to be collected together for accuracy use {}: perf record -e {cycles,branches}' ...
+To compute metrics for samples use perf record -e '{cycles,instructions}' ... ; perf script -F +metric
 See assembly instructions with percentage: perf annotate <symbol>
 If you prefer Intel style assembly, try: perf annotate -M intel
+When collecting LBR backtraces use --stitch-lbr to handle more than 32 deep entries: perf record --call-graph lbr ; perf report --stitch-lbr
 For hierarchical output, try: perf report --hierarchy
 Order by the overhead of source file name and line number: perf report -s srcline
 System-wide collection from all CPUs: perf record -a
 Show current config key-value pairs: perf config --list
+To collect Processor Trace with samples use perf record -e '{intel_pt//,cycles}' ; perf script --call-trace or --insn-trace --xed -F +ipc (remove --xed if no xed)
+To trace calls using Processor Trace use perf record -e intel_pt// ... ; perf script --call-trace. Then use perf script --time A-B --insn-trace to look at region of interest.
+To measure approximate function latency with Processor Trace use perf record -e intel_pt// ... ; perf script --call-ret-trace
+To trace only single function with Processor Trace use perf record --filter 'filter func @ program' -e intel_pt//u ./program ; perf script --insn-trace
 Show user configuration overrides: perf config --user --list
 To add Node.js USDT(User-Level Statically Defined Tracing): perf buildid-cache --add `which node`
-To report cacheline events from previous recording: perf c2c report
+To analyze cache line scalability issues use perf c2c record ... ; perf c2c report
 To browse sample contexts use perf report --sample 10 and select in context menu
 To separate samples by time use perf report --sort time,overhead,sym
+To filter subset of samples with report or script add --time X-Y or --cpu A,B,C or --socket-filter ...
 To set sample time separation other than 100ms with --sort time use --time-quantum
 Add -I to perf record to sample register values, which will be visible in perf report sample context.
 To show IPC for sampling periods use perf record -e '{cycles,instructions}:S' and then browse context
 To show context switches in perf report sample context add --switch-events to perf record.
+To show time in nanoseconds in record/report add --ns
+To compare hot regions in two workloads use perf record -b -o file ... ; perf diff --stream file1 file2
+To compare scalability of two workload samples use perf diff -c ratio file1 file2
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index aa55850fbc21..1fe8df97fe88 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -28,8 +28,6 @@ include $(srctree)/tools/scripts/Makefile.arch
 
 $(call detected_var,SRCARCH)
 
-NO_PERF_REGS := 1
-
 ifneq ($(NO_SYSCALL_TABLE),1)
   NO_SYSCALL_TABLE := 1
 
@@ -50,7 +48,6 @@ endif
 
 # Additional ARCH settings for ppc
 ifeq ($(SRCARCH),powerpc)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/powerpc/include/generated
   LIBUNWIND_LIBS := -lunwind -lunwind-ppc64
 endif
@@ -66,49 +63,31 @@ ifeq ($(SRCARCH),x86)
   else
     LIBUNWIND_LIBS = -lunwind-x86 -llzma -lunwind
   endif
-  NO_PERF_REGS := 0
 endif
 
 ifeq ($(SRCARCH),arm)
-  NO_PERF_REGS := 0
   LIBUNWIND_LIBS = -lunwind -lunwind-arm
 endif
 
 ifeq ($(SRCARCH),arm64)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/arm64/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
 endif
 
 ifeq ($(SRCARCH),loongarch)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/loongarch/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-loongarch64
 endif
 
-ifeq ($(SRCARCH),riscv)
-  NO_PERF_REGS := 0
-endif
-
-ifeq ($(SRCARCH),csky)
-  NO_PERF_REGS := 0
-endif
-
 ifeq ($(ARCH),s390)
-  NO_PERF_REGS := 0
   CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated
 endif
 
 ifeq ($(ARCH),mips)
-  NO_PERF_REGS := 0
   CFLAGS += -I$(OUTPUT)arch/mips/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-mips
 endif
 
-ifeq ($(NO_PERF_REGS),0)
-  $(call detected,CONFIG_PERF_REGS)
-endif
-
 # So far there's only x86 and arm libdw unwind support merged in perf.
 # Disable it on all other architectures in case libdw unwind
 # support is detected in system. Add supported architectures
@@ -165,10 +144,6 @@ endif
 FEATURE_CHECK_CFLAGS-libopencsd := $(LIBOPENCSD_CFLAGS)
 FEATURE_CHECK_LDFLAGS-libopencsd := $(LIBOPENCSD_LDFLAGS) $(OPENCSDLIBS)
 
-ifeq ($(NO_PERF_REGS),0)
-  CFLAGS += -DHAVE_PERF_REGS_SUPPORT
-endif
-
 # for linking with debug library, run like:
 # make DEBUG=1 LIBDW_DIR=/opt/libdw/
 ifdef LIBDW_DIR
@@ -191,6 +166,15 @@ endif
 FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS)
 FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf
 
+# for linking with debug library, run like:
+# make DEBUG=1 LIBCAPSTONE_DIR=/opt/capstone/
+ifdef LIBCAPSTONE_DIR
+  LIBCAPSTONE_CFLAGS  := -I$(LIBCAPSTONE_DIR)/include
+  LIBCAPSTONE_LDFLAGS := -L$(LIBCAPSTONE_DIR)/
+endif
+FEATURE_CHECK_CFLAGS-libcapstone := $(LIBCAPSTONE_CFLAGS)
+FEATURE_CHECK_LDFLAGS-libcapstone := $(LIBCAPSTONE_LDFLAGS) -lcapstone
+
 ifdef LIBZSTD_DIR
   LIBZSTD_CFLAGS  := -I$(LIBZSTD_DIR)/lib
   LIBZSTD_LDFLAGS := -L$(LIBZSTD_DIR)/lib
@@ -209,11 +193,17 @@ endif
 include $(srctree)/tools/scripts/utilities.mak
 
 ifeq ($(call get-executable,$(FLEX)),)
-  dummy := $(error Error: $(FLEX) is missing on this system, please install it)
+  $(error Error: $(FLEX) is missing on this system, please install it)
 endif
 
 ifeq ($(call get-executable,$(BISON)),)
-  dummy := $(error Error: $(BISON) is missing on this system, please install it)
+  $(error Error: $(BISON) is missing on this system, please install it)
+endif
+
+ifneq ($(NO_LIBTRACEEVENT),1)
+  ifeq ($(call get-executable,$(PKG_CONFIG)),)
+  dummy := $(error Error: $(PKG_CONFIG) needed by libtraceevent is missing on this system, please install it)
+  endif
 endif
 
 ifneq ($(OUTPUT),)
@@ -438,46 +428,46 @@ else
       LIBC_SUPPORT := 1
     endif
     ifeq ($(LIBC_SUPPORT),1)
-      msg := $(error ERROR: No libelf found. Disables 'probe' tool, jvmti and BPF support. Please install libelf-dev, libelf-devel, elfutils-libelf-devel or build with NO_LIBELF=1.)
+      $(error ERROR: No libelf found. Disables 'probe' tool, jvmti and BPF support. Please install libelf-dev, libelf-devel, elfutils-libelf-devel or build with NO_LIBELF=1.)
     else
       ifneq ($(filter s% -fsanitize=address%,$(EXTRA_CFLAGS),),)
         ifneq ($(shell ldconfig -p | grep libasan >/dev/null 2>&1; echo $$?), 0)
-          msg := $(error No libasan found, please install libasan);
+          $(error No libasan found, please install libasan)
         endif
       endif
 
       ifneq ($(filter s% -fsanitize=undefined%,$(EXTRA_CFLAGS),),)
         ifneq ($(shell ldconfig -p | grep libubsan >/dev/null 2>&1; echo $$?), 0)
-          msg := $(error No libubsan found, please install libubsan);
+          $(error No libubsan found, please install libubsan)
         endif
       endif
 
       ifneq ($(filter s% -static%,$(LDFLAGS),),)
-        msg := $(error No static glibc found, please install glibc-static);
+        $(error No static glibc found, please install glibc-static)
       else
-        msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]);
+        $(error No gnu/libc-version.h found, please install glibc-dev[el])
       endif
     endif
   else
     ifndef NO_LIBDW_DWARF_UNWIND
       ifneq ($(feature-libdw-dwarf-unwind),1)
         NO_LIBDW_DWARF_UNWIND := 1
-        msg := $(warning No libdw DWARF unwind found, Please install elfutils-devel/libdw-dev >= 0.158 and/or set LIBDW_DIR);
+        $(warning No libdw DWARF unwind found, Please install elfutils-devel/libdw-dev >= 0.158 and/or set LIBDW_DIR)
       endif
     endif
     ifneq ($(feature-dwarf), 1)
       ifndef NO_DWARF
-        msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
+        $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev)
         NO_DWARF := 1
       endif
     else
       ifneq ($(feature-dwarf_getlocations), 1)
-        msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157);
+        $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157)
       else
         CFLAGS += -DHAVE_DWARF_GETLOCATIONS_SUPPORT
       endif # dwarf_getlocations
       ifneq ($(feature-dwarf_getcfi), 1)
-        msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.142);
+        $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.142)
       else
         CFLAGS += -DHAVE_DWARF_CFI_SUPPORT
       endif # dwarf_getcfi
@@ -525,7 +515,7 @@ ifdef CORESIGHT
       endif
     endif
   else
-    dummy := $(error Error: No libopencsd library found or the version is not up-to-date. Please install recent libopencsd to build with CORESIGHT=1)
+    $(error Error: No libopencsd library found or the version is not up-to-date. Please install recent libopencsd to build with CORESIGHT=1)
   endif
 endif
 
@@ -551,7 +541,7 @@ ifndef NO_LIBELF
   ifeq ($(feature-libelf-gelf_getnote), 1)
     CFLAGS += -DHAVE_GELF_GETNOTE_SUPPORT
   else
-    msg := $(warning gelf_getnote() not found on libelf, SDT support disabled);
+    $(warning gelf_getnote() not found on libelf, SDT support disabled)
   endif
 
   ifeq ($(feature-libelf-getshdrstrndx), 1)
@@ -568,7 +558,7 @@ ifndef NO_LIBELF
 
   ifndef NO_DWARF
     ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
-      msg := $(warning DWARF register mappings have not been defined for architecture $(SRCARCH), DWARF support disabled);
+      $(warning DWARF register mappings have not been defined for architecture $(SRCARCH), DWARF support disabled)
       NO_DWARF := 1
     else
       CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS)
@@ -590,11 +580,11 @@ ifndef NO_LIBELF
           $(call detected,CONFIG_LIBBPF)
           $(call detected,CONFIG_LIBBPF_DYNAMIC)
         else
-          dummy := $(error Error: No libbpf devel library found or older than v1.0, please install/update libbpf-devel);
+          $(error Error: No libbpf devel library found or older than v1.0, please install/update libbpf-devel)
         endif
       else
         ifeq ($(NO_ZLIB), 1)
-          dummy := $(warning Warning: Statically building libbpf not possible as zlib is missing)
+          $(warning Warning: Statically building libbpf not possible as zlib is missing)
           NO_LIBBPF := 1
         else
           # Libbpf will be built as a static library from tools/lib/bpf.
@@ -609,7 +599,7 @@ endif # NO_LIBELF
 
 ifndef NO_SDT
   ifneq ($(feature-sdt), 1)
-    msg := $(warning No sys/sdt.h found, no SDT events are defined, please install systemtap-sdt-devel or systemtap-sdt-dev);
+    $(warning No sys/sdt.h found, no SDT events are defined, please install systemtap-sdt-devel or systemtap-sdt-dev)
     NO_SDT := 1;
   else
     CFLAGS += -DHAVE_SDT_EVENT
@@ -651,13 +641,13 @@ ifndef NO_LIBUNWIND
     have_libunwind = 1
     $(call feature_check,libunwind-debug-frame-aarch64)
     ifneq ($(feature-libunwind-debug-frame-aarch64), 1)
-      msg := $(warning No debug_frame support found in libunwind-aarch64);
+      $(warning No debug_frame support found in libunwind-aarch64)
       CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME_AARCH64
     endif
   endif
 
   ifneq ($(feature-libunwind), 1)
-    msg := $(warning No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR);
+    $(warning No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR)
     NO_LOCAL_LIBUNWIND := 1
   else
     have_libunwind := 1
@@ -673,7 +663,7 @@ endif
 
 ifndef NO_LIBBPF
   ifneq ($(feature-bpf), 1)
-    msg := $(warning BPF API too old. Please install recent kernel headers. BPF support in 'perf record' is disabled.)
+    $(warning BPF API too old. Please install recent kernel headers. BPF support in 'perf record' is disabled.)
     NO_LIBBPF := 1
   endif
 endif
@@ -686,28 +676,28 @@ endif
 
 ifeq ($(BUILD_BPF_SKEL),1)
   ifeq ($(filter -DHAVE_LIBELF_SUPPORT, $(CFLAGS)),)
-    dummy := $(warning Warning: Disabled BPF skeletons as libelf is required by bpftool)
+    $(warning Warning: Disabled BPF skeletons as libelf is required by bpftool)
     BUILD_BPF_SKEL := 0
   else ifeq ($(filter -DHAVE_ZLIB_SUPPORT, $(CFLAGS)),)
-    dummy := $(warning Warning: Disabled BPF skeletons as zlib is required by bpftool)
+    $(warning Warning: Disabled BPF skeletons as zlib is required by bpftool)
     BUILD_BPF_SKEL := 0
   else ifeq ($(filter -DHAVE_LIBBPF_SUPPORT, $(CFLAGS)),)
-    dummy := $(warning Warning: Disabled BPF skeletons as libbpf is required)
+    $(warning Warning: Disabled BPF skeletons as libbpf is required)
     BUILD_BPF_SKEL := 0
   else ifeq ($(call get-executable,$(CLANG)),)
-    dummy := $(warning Warning: Disabled BPF skeletons as clang ($(CLANG)) is missing)
+    $(warning Warning: Disabled BPF skeletons as clang ($(CLANG)) is missing)
     BUILD_BPF_SKEL := 0
   else
     CLANG_VERSION := $(shell $(CLANG) --version | head -1 | sed 's/.*clang version \([[:digit:]]\+.[[:digit:]]\+.[[:digit:]]\+\).*/\1/g')
     ifeq ($(call version-lt3,$(CLANG_VERSION),12.0.1),1)
-      dummy := $(warning Warning: Disabled BPF skeletons as reliable BTF generation needs at least $(CLANG) version 12.0.1)
+      $(warning Warning: Disabled BPF skeletons as reliable BTF generation needs at least $(CLANG) version 12.0.1)
       BUILD_BPF_SKEL := 0
     endif
   endif
   ifeq ($(BUILD_BPF_SKEL),1)
     $(call feature_check,clang-bpf-co-re)
     ifeq ($(feature-clang-bpf-co-re), 0)
-      dummy := $(warning Warning: Disabled BPF skeletons as clang is too old)
+      $(warning Warning: Disabled BPF skeletons as clang is too old)
       BUILD_BPF_SKEL := 0
     endif
   endif
@@ -727,7 +717,7 @@ dwarf-post-unwind-text := BUG
 # setup DWARF post unwinder
 ifdef NO_LIBUNWIND
   ifdef NO_LIBDW_DWARF_UNWIND
-    msg := $(warning Disabling post unwind, no support found.);
+    $(warning Disabling post unwind, no support found.)
     dwarf-post-unwind := 0
   else
     dwarf-post-unwind-text := libdw
@@ -753,7 +743,7 @@ ifndef NO_LOCAL_LIBUNWIND
   ifeq ($(SRCARCH),$(filter $(SRCARCH),arm arm64))
     $(call feature_check,libunwind-debug-frame)
     ifneq ($(feature-libunwind-debug-frame), 1)
-      msg := $(warning No debug_frame support found in libunwind);
+      $(warning No debug_frame support found in libunwind)
       CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
     endif
   else
@@ -782,7 +772,7 @@ ifneq ($(NO_LIBTRACEEVENT),1)
     ifndef NO_LIBAUDIT
       $(call feature_check,libaudit)
       ifneq ($(feature-libaudit), 1)
-        msg := $(warning No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev);
+        $(warning No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev)
         NO_LIBAUDIT := 1
       else
         CFLAGS += -DHAVE_LIBAUDIT_SUPPORT
@@ -795,7 +785,7 @@ endif
 
 ifndef NO_LIBCRYPTO
   ifneq ($(feature-libcrypto), 1)
-    msg := $(warning No libcrypto.h found, disables jitted code injection, please install openssl-devel or libssl-dev);
+    $(warning No libcrypto.h found, disables jitted code injection, please install openssl-devel or libssl-dev)
     NO_LIBCRYPTO := 1
   else
     CFLAGS += -DHAVE_LIBCRYPTO_SUPPORT
@@ -807,7 +797,7 @@ endif
 ifndef NO_SLANG
   ifneq ($(feature-libslang), 1)
     ifneq ($(feature-libslang-include-subdir), 1)
-      msg := $(warning slang not found, disables TUI support. Please install slang-devel, libslang-dev or libslang2-dev);
+      $(warning slang not found, disables TUI support. Please install slang-devel, libslang-dev or libslang2-dev)
       NO_SLANG := 1
     else
       CFLAGS += -DHAVE_SLANG_INCLUDE_SUBDIR
@@ -825,7 +815,7 @@ ifdef GTK2
   FLAGS_GTK2=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
   $(call feature_check,gtk2)
   ifneq ($(feature-gtk2), 1)
-    msg := $(warning GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev);
+    $(warning GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev)
     NO_GTK2 := 1
   else
     $(call feature_check,gtk2-infobar)
@@ -854,7 +844,7 @@ else
   ifneq ($(feature-libperl), 1)
     CFLAGS += -DNO_LIBPERL
     NO_LIBPERL := 1
-    msg := $(warning Missing perl devel files. Disabling perl scripting support, please install perl-ExtUtils-Embed/libperl-dev);
+    $(warning Missing perl devel files. Disabling perl scripting support, please install perl-ExtUtils-Embed/libperl-dev)
   else
     LDFLAGS += $(PERL_EMBED_LDFLAGS)
     EXTLIBS += $(PERL_EMBED_LIBADD)
@@ -869,7 +859,7 @@ endif
 ifeq ($(feature-timerfd), 1)
   CFLAGS += -DHAVE_TIMERFD_SUPPORT
 else
-  msg := $(warning No timerfd support. Disables 'perf kvm stat live');
+  $(warning No timerfd support. Disables 'perf kvm stat live')
 endif
 
 disable-python = $(eval $(disable-python_code))
@@ -903,7 +893,7 @@ else
            PYTHON_EXTENSION_SUFFIX := $(shell $(PYTHON) -c 'from importlib import machinery; print(machinery.EXTENSION_SUFFIXES[0])')
            LANG_BINDINGS += $(obj-perf)python/perf$(PYTHON_EXTENSION_SUFFIX)
 	 else
-           msg := $(warning Missing python setuptools, the python binding won't be built, please install python3-setuptools or equivalent);
+           $(warning Missing python setuptools, the python binding won't be built, please install python3-setuptools or equivalent)
          endif
          CFLAGS += -DHAVE_LIBPYTHON_SUPPORT
          $(call detected,CONFIG_LIBPYTHON)
@@ -962,7 +952,7 @@ ifdef BUILD_NONDISTRO
   ifeq ($(feature-libbfd-buildid), 1)
     CFLAGS += -DHAVE_LIBBFD_BUILDID_SUPPORT
   else
-    msg := $(warning Old version of libbfd/binutils things like PE executable profiling will not be available);
+    $(warning Old version of libbfd/binutils things like PE executable profiling will not be available)
   endif
 endif
 
@@ -994,7 +984,7 @@ ifndef NO_LZMA
     EXTLIBS += -llzma
     $(call detected,CONFIG_LZMA)
   else
-    msg := $(warning No liblzma found, disables xz kernel module decompression, please install xz-devel/liblzma-dev);
+    $(warning No liblzma found, disables xz kernel module decompression, please install xz-devel/liblzma-dev)
     NO_LZMA := 1
   endif
 endif
@@ -1007,7 +997,7 @@ ifndef NO_LIBZSTD
     EXTLIBS += -lzstd
     $(call detected,CONFIG_ZSTD)
   else
-    msg := $(warning No libzstd found, disables trace compression, please install libzstd-dev[el] and/or set LIBZSTD_DIR);
+    $(warning No libzstd found, disables trace compression, please install libzstd-dev[el] and/or set LIBZSTD_DIR)
     NO_LIBZSTD := 1
   endif
 endif
@@ -1018,7 +1008,7 @@ ifndef NO_LIBCAP
     EXTLIBS += -lcap
     $(call detected,CONFIG_LIBCAP)
   else
-    msg := $(warning No libcap found, disables capability support, please install libcap-devel/libcap-dev);
+    $(warning No libcap found, disables capability support, please install libcap-devel/libcap-dev)
     NO_LIBCAP := 1
   endif
 endif
@@ -1031,11 +1021,11 @@ endif
 
 ifndef NO_LIBNUMA
   ifeq ($(feature-libnuma), 0)
-    msg := $(warning No numa.h found, disables 'perf bench numa mem' benchmark, please install numactl-devel/libnuma-devel/libnuma-dev);
+    $(warning No numa.h found, disables 'perf bench numa mem' benchmark, please install numactl-devel/libnuma-devel/libnuma-dev)
     NO_LIBNUMA := 1
   else
     ifeq ($(feature-numa_num_possible_cpus), 0)
-      msg := $(warning Old numa library found, disables 'perf bench numa mem' benchmark, please install numactl-devel/libnuma-devel/libnuma-dev >= 2.0.8);
+      $(warning Old numa library found, disables 'perf bench numa mem' benchmark, please install numactl-devel/libnuma-devel/libnuma-dev >= 2.0.8)
       NO_LIBNUMA := 1
     else
       CFLAGS += -DHAVE_LIBNUMA_SUPPORT
@@ -1090,14 +1080,26 @@ ifndef NO_LIBBABELTRACE
     EXTLIBS += -lbabeltrace-ctf
     $(call detected,CONFIG_LIBBABELTRACE)
   else
-    msg := $(warning No libbabeltrace found, disables 'perf data' CTF format support, please install libbabeltrace-dev[el]/libbabeltrace-ctf-dev);
+    $(warning No libbabeltrace found, disables 'perf data' CTF format support, please install libbabeltrace-dev[el]/libbabeltrace-ctf-dev)
+  endif
+endif
+
+ifndef NO_CAPSTONE
+  $(call feature_check,libcapstone)
+  ifeq ($(feature-libcapstone), 1)
+    CFLAGS += -DHAVE_LIBCAPSTONE_SUPPORT $(LIBCAPSTONE_CFLAGS)
+    LDFLAGS += $(LICAPSTONE_LDFLAGS)
+    EXTLIBS += -lcapstone
+    $(call detected,CONFIG_LIBCAPSTONE)
+  else
+    msg := $(warning No libcapstone found, disables disasm engine support for 'perf script', please install libcapstone-dev/capstone-devel);
   endif
 endif
 
 ifndef NO_AUXTRACE
   ifeq ($(SRCARCH),x86)
     ifeq ($(feature-get_cpuid), 0)
-      msg := $(warning Your gcc lacks the __get_cpuid() builtin, disables support for auxtrace/Intel PT, please install a newer gcc);
+      $(warning Your gcc lacks the __get_cpuid() builtin, disables support for auxtrace/Intel PT, please install a newer gcc)
       NO_AUXTRACE := 1
     endif
   endif
@@ -1155,7 +1157,7 @@ ifndef NO_LIBPFM4
     ASCIIDOC_EXTRA = -aHAVE_LIBPFM=1
     $(call detected,CONFIG_LIBPFM4)
   else
-    msg := $(warning libpfm4 not found, disables libpfm4 support. Please install libpfm4-dev);
+    $(warning libpfm4 not found, disables libpfm4 support. Please install libpfm4-dev)
   endif
 endif
 
@@ -1173,7 +1175,7 @@ ifneq ($(NO_LIBTRACEEVENT),1)
     CFLAGS += -DLIBTRACEEVENT_VERSION=$(LIBTRACEEVENT_VERSION_CPP)
     $(call detected,CONFIG_LIBTRACEEVENT)
   else
-    dummy := $(error ERROR: libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel or build with NO_LIBTRACEEVENT=1)
+    $(error ERROR: libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel or build with NO_LIBTRACEEVENT=1)
   endif
 
   $(call feature_check,libtracefs)
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f8774a9b1377..04d89d2ed209 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -84,6 +84,9 @@ include ../scripts/utilities.mak
 # Define NO_LIBBABELTRACE if you do not want libbabeltrace support
 # for CTF data format.
 #
+# Define NO_CAPSTONE if you do not want libcapstone support
+# for disasm engine.
+#
 # Define NO_LZMA if you do not want to support compressed (xz) kernel modules
 #
 # Define NO_AUXTRACE if you do not want AUX area tracing support
@@ -380,7 +383,7 @@ python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT
 ifeq ($(CONFIG_LIBTRACEEVENT),y)
   PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
 else
-  PYTHON_EXT_SRCS := $(shell grep -v ^\#\\\|util/trace-event.c util/python-ext-sources)
+  PYTHON_EXT_SRCS := $(shell grep -v ^\#\\\|util/trace-event.c\\\|util/trace-event-parse.c util/python-ext-sources)
 endif
 
 PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBAPI)
@@ -482,7 +485,7 @@ drm_hdr_dir := $(srctree)/tools/include/uapi/drm
 drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)')
+$(shell [ -d '$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)')
 
 $(drm_ioctl_array): $(drm_hdr_dir)/drm.h $(drm_hdr_dir)/i915_drm.h $(drm_ioctl_tbl)
 	$(Q)$(SHELL) '$(drm_ioctl_tbl)' $(drm_hdr_dir) > $@
@@ -672,7 +675,7 @@ tests-coresight-targets-clean:
 all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) tests-coresight-targets
 
 # Create python binding output directory if not already present
-_dummy := $(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python')
+$(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python')
 
 $(OUTPUT)python/perf$(PYTHON_EXTENSION_SUFFIX): $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBPERF) $(LIBSUBCMD)
 	$(QUIET_GEN)LDSHARED="$(CC) -pthread -shared" \
@@ -1048,6 +1051,11 @@ install-tests: all install-gtk
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
 		$(INSTALL) tests/shell/lib/*.sh -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
 		$(INSTALL) tests/shell/lib/*.py -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/common'; \
+		$(INSTALL) tests/shell/common/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/common'; \
+		$(INSTALL) tests/shell/common/*.pl '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/common'; \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_probe'; \
+		$(INSTALL) tests/shell/base_probe/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/base_probe'; \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' ; \
 		$(INSTALL) tests/shell/coresight/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight'
 	$(Q)$(MAKE) -C tests/shell/coresight install-tests
@@ -1142,7 +1150,7 @@ ifeq ($(VMLINUX_H),)
   endif
 endif
 
-$(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
+$(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) $(VMLINUX_H)
 ifeq ($(VMLINUX_H),)
 	$(QUIET_GEN)$(BPFTOOL) btf dump file $< format c > $@
 else
@@ -1167,7 +1175,7 @@ bpf-skel:
 endif # CONFIG_PERF_BPF_SKEL
 
 bpf-skel-clean:
-	$(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS)
+	$(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) $(SKEL_OUT)/vmlinux.h
 
 clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $(LIBPERF)-clean arm64-sysreg-defs-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean
 	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS)
diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c
index 2c56e8b56ddf..f94a0210c7b7 100644
--- a/tools/perf/arch/arm/util/perf_regs.c
+++ b/tools/perf/arch/arm/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c
index 7f3af3b97f3b..8b7cb68ba1a8 100644
--- a/tools/perf/arch/arm/util/pmu.c
+++ b/tools/perf/arch/arm/util/pmu.c
@@ -13,6 +13,7 @@
 #include "hisi-ptt.h"
 #include "../../../util/pmu.h"
 #include "../../../util/cs-etm.h"
+#include "../../arm64/util/mem-events.h"
 
 void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
 {
@@ -26,6 +27,8 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
 		pmu->selectable = true;
 		pmu->is_uncore = false;
 		pmu->perf_event_attr_init_default = arm_spe_pmu_default_config;
+		if (!strcmp(pmu->name, "arm_spe_0"))
+			pmu->mem_events = perf_mem_events_arm;
 	} else if (strstarts(pmu->name, HISI_PTT_PMU_NAME)) {
 		pmu->selectable = true;
 #endif
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
index fab3095fb5d0..5735ed4479bb 100644
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -18,7 +18,7 @@ sysprf := $(srctree)/tools/perf/arch/arm64/entry/syscalls/
 systbl := $(sysprf)/mksyscalltbl
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+$(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sysdef) $(systbl)
 	$(Q)$(SHELL) '$(systbl)' '$(CC)' '$(HOSTCC)' $(incpath) $(sysdef) > $@
diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index ba1144366e85..aab1cc2bc283 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -12,5 +12,7 @@
 
 void arch__add_leaf_frame_record_opts(struct record_opts *opts)
 {
+	const struct sample_reg *sample_reg_masks = arch__sample_reg_masks();
+
 	opts->sample_user_regs |= sample_reg_masks[PERF_REG_ARM64_LR].mask;
 }
diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
index 3bcc5c7035c2..9f8da7937255 100644
--- a/tools/perf/arch/arm64/util/mem-events.c
+++ b/tools/perf/arch/arm64/util/mem-events.c
@@ -1,37 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "map_symbol.h"
+#include "util/map_symbol.h"
+#include "util/mem-events.h"
 #include "mem-events.h"
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
-static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
-	E("spe-load",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	"arm_spe_0"),
-	E("spe-store",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			"arm_spe_0"),
-	E("spe-ldst",	"arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	"arm_spe_0"),
+struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
+	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	NULL,	true,	0),
+	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			NULL,	false,	0),
+	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	NULL,	true,	0),
 };
-
-static char mem_ev_name[100];
-
-struct perf_mem_event *perf_mem_events__ptr(int i)
-{
-	if (i >= PERF_MEM_EVENTS__MAX)
-		return NULL;
-
-	return &perf_mem_events[i];
-}
-
-const char *perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
-{
-	struct perf_mem_event *e = perf_mem_events__ptr(i);
-
-	if (i >= PERF_MEM_EVENTS__MAX)
-		return NULL;
-
-	if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE)
-		scnprintf(mem_ev_name, sizeof(mem_ev_name),
-			  e->name, perf_mem_events__loads_ldlat);
-	else /* PERF_MEM_EVENTS__STORE */
-		scnprintf(mem_ev_name, sizeof(mem_ev_name), e->name);
-
-	return mem_ev_name;
-}
diff --git a/tools/perf/arch/arm64/util/mem-events.h b/tools/perf/arch/arm64/util/mem-events.h
new file mode 100644
index 000000000000..5fc50be4be38
--- /dev/null
+++ b/tools/perf/arch/arm64/util/mem-events.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARM64_MEM_EVENTS_H
+#define _ARM64_MEM_EVENTS_H
+
+extern struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX];
+
+#endif /* _ARM64_MEM_EVENTS_H */
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
index 1b79d8eab22f..09308665e28a 100644
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ b/tools/perf/arch/arm64/util/perf_regs.c
@@ -16,7 +16,7 @@
 #define HWCAP_SVE	(1 << 22)
 #endif
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG(x0, PERF_REG_ARM64_X0),
 	SMPL_REG(x1, PERF_REG_ARM64_X1),
 	SMPL_REG(x2, PERF_REG_ARM64_X2),
@@ -175,3 +175,8 @@ uint64_t arch__user_reg_mask(void)
 	}
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/csky/util/perf_regs.c b/tools/perf/arch/csky/util/perf_regs.c
index c0877c264d49..6b1665f41180 100644
--- a/tools/perf/arch/csky/util/perf_regs.c
+++ b/tools/perf/arch/csky/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/loongarch/Makefile b/tools/perf/arch/loongarch/Makefile
index c392e7af4743..3992a67a87d9 100644
--- a/tools/perf/arch/loongarch/Makefile
+++ b/tools/perf/arch/loongarch/Makefile
@@ -17,7 +17,7 @@ sysprf := $(srctree)/tools/perf/arch/loongarch/entry/syscalls/
 systbl := $(sysprf)/mksyscalltbl
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+$(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sysdef) $(systbl)
 	$(Q)$(SHELL) '$(systbl)' '$(CC)' '$(HOSTCC)' $(incpath) $(sysdef) > $@
diff --git a/tools/perf/arch/loongarch/util/perf_regs.c b/tools/perf/arch/loongarch/util/perf_regs.c
index 2c56e8b56ddf..f94a0210c7b7 100644
--- a/tools/perf/arch/loongarch/util/perf_regs.c
+++ b/tools/perf/arch/loongarch/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/mips/Makefile b/tools/perf/arch/mips/Makefile
index 8bc09072e3d6..cd0b011b3be5 100644
--- a/tools/perf/arch/mips/Makefile
+++ b/tools/perf/arch/mips/Makefile
@@ -11,7 +11,7 @@ sysdef := $(sysprf)/syscall_n64.tbl
 systbl := $(sysprf)/mksyscalltbl
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+$(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sysdef) $(systbl)
 	$(Q)$(SHELL) '$(systbl)' $(sysdef) > $@
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
index c0877c264d49..6b1665f41180 100644
--- a/tools/perf/arch/mips/util/perf_regs.c
+++ b/tools/perf/arch/mips/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index 840ea0e59287..bf6d323574f6 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -19,7 +19,7 @@ sysdef := $(sysprf)/syscall.tbl
 systbl := $(sysprf)/mksyscalltbl
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+$(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header64): $(sysdef) $(systbl)
 	$(Q)$(SHELL) '$(systbl)' '64' $(sysdef) > $@
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 9889245c555c..1d323f3a3322 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -2,6 +2,7 @@ perf-y += header.o
 perf-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += mem-events.o
+perf-y += pmu.o
 perf-y += sym-handling.o
 perf-y += evsel.o
 perf-y += event.o
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index d9a0ac1cdf30..c8357b571ccf 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -114,7 +114,7 @@ static int is_tracepoint_available(const char *str, struct evlist *evlist)
 
 	parse_events_error__init(&err);
 	ret = parse_events(evlist, str, &err);
-	if (err.str)
+	if (ret)
 		parse_events_error__print(&err, "tracepoint");
 	parse_events_error__exit(&err);
 	return ret;
diff --git a/tools/perf/arch/powerpc/util/mem-events.c b/tools/perf/arch/powerpc/util/mem-events.c
index 78b986e5268d..765d4a054b0a 100644
--- a/tools/perf/arch/powerpc/util/mem-events.c
+++ b/tools/perf/arch/powerpc/util/mem-events.c
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "map_symbol.h"
+#include "util/map_symbol.h"
+#include "util/mem-events.h"
 #include "mem-events.h"
 
-/* PowerPC does not support 'ldlat' parameter. */
-const char *perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
-{
-	if (i == PERF_MEM_EVENTS__LOAD)
-		return "cpu/mem-loads/";
+#define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
-	return "cpu/mem-stores/";
-}
+struct perf_mem_event perf_mem_events_power[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"%s/mem-loads/",	"mem-loads",	false,	0),
+	E("ldlat-stores",	"%s/mem-stores/",	"mem-stores",	false,	0),
+	E(NULL,			NULL,			NULL,		false,	0),
+};
diff --git a/tools/perf/arch/powerpc/util/mem-events.h b/tools/perf/arch/powerpc/util/mem-events.h
new file mode 100644
index 000000000000..6acc3d1b6873
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/mem-events.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _POWER_MEM_EVENTS_H
+#define _POWER_MEM_EVENTS_H
+
+extern struct perf_mem_event perf_mem_events_power[PERF_MEM_EVENTS__MAX];
+
+#endif /* _POWER_MEM_EVENTS_H */
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index b38aa056eea0..e8e6e6fc6f17 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -17,7 +17,7 @@
 #define PVR_POWER9		0x004E
 #define PVR_POWER10		0x0080
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG(r0, PERF_REG_POWERPC_R0),
 	SMPL_REG(r1, PERF_REG_POWERPC_R1),
 	SMPL_REG(r2, PERF_REG_POWERPC_R2),
@@ -232,3 +232,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/powerpc/util/pmu.c b/tools/perf/arch/powerpc/util/pmu.c
new file mode 100644
index 000000000000..554675deef7b
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/pmu.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <string.h>
+
+#include "../../../util/pmu.h"
+#include "mem-events.h"
+
+void perf_pmu__arch_init(struct perf_pmu *pmu)
+{
+	if (pmu->is_core)
+		pmu->mem_events = perf_mem_events_power;
+}
diff --git a/tools/perf/arch/riscv/util/header.c b/tools/perf/arch/riscv/util/header.c
index 4a41856938a8..1b29030021ee 100644
--- a/tools/perf/arch/riscv/util/header.c
+++ b/tools/perf/arch/riscv/util/header.c
@@ -41,7 +41,7 @@ static char *_get_cpuid(void)
 	char *mimpid = NULL;
 	char *cpuid = NULL;
 	int read;
-	unsigned long line_sz;
+	size_t line_sz;
 	FILE *cpuinfo;
 
 	cpuinfo = fopen(CPUINFO, "r");
diff --git a/tools/perf/arch/riscv/util/perf_regs.c b/tools/perf/arch/riscv/util/perf_regs.c
index c0877c264d49..6b1665f41180 100644
--- a/tools/perf/arch/riscv/util/perf_regs.c
+++ b/tools/perf/arch/riscv/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index 74bffbea03e2..56994e63b43a 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -17,7 +17,7 @@ sysdef := $(sysprf)/syscall.tbl
 systbl := $(sysprf)/mksyscalltbl
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+$(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sysdef) $(systbl)
 	$(Q)$(SHELL) '$(systbl)' $(sysdef) > $@
diff --git a/tools/perf/arch/s390/util/perf_regs.c b/tools/perf/arch/s390/util/perf_regs.c
index c0877c264d49..6b1665f41180 100644
--- a/tools/perf/arch/s390/util/perf_regs.c
+++ b/tools/perf/arch/s390/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG_END
 };
 
@@ -15,3 +15,8 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
+
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index 5a9f9a7bf07d..8952e00f9b60 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -17,7 +17,7 @@ sys       := $(srctree)/tools/perf/arch/x86/entry/syscalls
 systbl    := $(sys)/syscalltbl.sh
 
 # Create output directory if not already present
-_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+$(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sys)/syscall_64.tbl $(systbl)
 	$(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@
diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c b/tools/perf/arch/x86/tests/dwarf-unwind.c
index 5bfec3345d59..c05c0a85dad4 100644
--- a/tools/perf/arch/x86/tests/dwarf-unwind.c
+++ b/tools/perf/arch/x86/tests/dwarf-unwind.c
@@ -34,6 +34,7 @@ static int sample_ustack(struct perf_sample *sample,
 	}
 
 	stack_size = map__end(map) - sp;
+	map__put(map);
 	stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size;
 
 	memcpy(buf, (void *) sp, stack_size);
diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c
index 40f5d17fedab..e221ea104174 100644
--- a/tools/perf/arch/x86/tests/hybrid.c
+++ b/tools/perf/arch/x86/tests/hybrid.c
@@ -259,11 +259,10 @@ static int test_event(const struct evlist_test *e)
 	parse_events_error__init(&err);
 	ret = parse_events(evlist, e->name, &err);
 	if (ret) {
-		pr_debug("failed to parse event '%s', err %d, str '%s'\n",
-			 e->name, ret, err.str);
+		pr_debug("failed to parse event '%s', err %d\n", e->name, ret);
 		parse_events_error__print(&err, e->name);
 		ret = TEST_FAIL;
-		if (strstr(err.str, "can't access trace events"))
+		if (parse_events_error__contains(&err, "can't access trace events"))
 			ret = TEST_SKIP;
 	} else {
 		ret = e->check(evlist);
diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c
index 191b372f9a2d..62df03e91c7e 100644
--- a/tools/perf/arch/x86/util/mem-events.c
+++ b/tools/perf/arch/x86/util/mem-events.c
@@ -1,93 +1,28 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "util/pmu.h"
-#include "util/pmus.h"
-#include "util/env.h"
-#include "map_symbol.h"
-#include "mem-events.h"
 #include "linux/string.h"
-#include "env.h"
+#include "util/map_symbol.h"
+#include "util/mem-events.h"
+#include "mem-events.h"
 
-static char mem_loads_name[100];
-static bool mem_loads_name__init;
-static char mem_stores_name[100];
 
 #define MEM_LOADS_AUX		0x8203
-#define MEM_LOADS_AUX_NAME     "{%s/mem-loads-aux/,%s/mem-loads,ldlat=%u/}:P"
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
-static struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = {
-	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"%s/events/mem-loads"),
-	E("ldlat-stores",	"%s/mem-stores/P",		"%s/events/mem-stores"),
-	E(NULL,			NULL,				NULL),
+struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"mem-loads",	true,	0),
+	E("ldlat-stores",	"%s/mem-stores/P",		"mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,		false,	0),
 };
 
-static struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
-	E(NULL,		NULL,		NULL),
-	E(NULL,		NULL,		NULL),
-	E("mem-ldst",	"ibs_op//",	"ibs_op"),
+struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"{%s/mem-loads-aux/,%s/mem-loads,ldlat=%u/}:P",	"mem-loads",	true,	MEM_LOADS_AUX),
+	E("ldlat-stores",	"%s/mem-stores/P",		"mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,		false,	0),
 };
 
-struct perf_mem_event *perf_mem_events__ptr(int i)
-{
-	if (i >= PERF_MEM_EVENTS__MAX)
-		return NULL;
-
-	if (x86__is_amd_cpu())
-		return &perf_mem_events_amd[i];
-
-	return &perf_mem_events_intel[i];
-}
-
-bool is_mem_loads_aux_event(struct evsel *leader)
-{
-	struct perf_pmu *pmu = perf_pmus__find("cpu");
-
-	if (!pmu)
-		pmu = perf_pmus__find("cpu_core");
-
-	if (pmu && !perf_pmu__have_event(pmu, "mem-loads-aux"))
-		return false;
-
-	return leader->core.attr.config == MEM_LOADS_AUX;
-}
-
-const char *perf_mem_events__name(int i, const char *pmu_name)
-{
-	struct perf_mem_event *e = perf_mem_events__ptr(i);
-
-	if (!e)
-		return NULL;
-
-	if (i == PERF_MEM_EVENTS__LOAD) {
-		if (mem_loads_name__init && !pmu_name)
-			return mem_loads_name;
-
-		if (!pmu_name) {
-			mem_loads_name__init = true;
-			pmu_name = "cpu";
-		}
-
-		if (perf_pmus__have_event(pmu_name, "mem-loads-aux")) {
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  MEM_LOADS_AUX_NAME, pmu_name, pmu_name,
-				  perf_mem_events__loads_ldlat);
-		} else {
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  e->name, pmu_name,
-				  perf_mem_events__loads_ldlat);
-		}
-		return mem_loads_name;
-	}
-
-	if (i == PERF_MEM_EVENTS__STORE) {
-		if (!pmu_name)
-			pmu_name = "cpu";
-
-		scnprintf(mem_stores_name, sizeof(mem_stores_name),
-			  e->name, pmu_name);
-		return mem_stores_name;
-	}
-
-	return e->name;
-}
+struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
+	E(NULL,		NULL,		NULL,	false,	0),
+	E(NULL,		NULL,		NULL,	false,	0),
+	E("mem-ldst",	"%s//",		NULL,	false,	0),
+};
diff --git a/tools/perf/arch/x86/util/mem-events.h b/tools/perf/arch/x86/util/mem-events.h
new file mode 100644
index 000000000000..f55c8d3b7d59
--- /dev/null
+++ b/tools/perf/arch/x86/util/mem-events.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_MEM_EVENTS_H
+#define _X86_MEM_EVENTS_H
+
+extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
+extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
+
+extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
+
+#endif /* _X86_MEM_EVENTS_H */
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index b813502a2727..12fd93f04802 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -13,7 +13,7 @@
 #include "../../../util/pmu.h"
 #include "../../../util/pmus.h"
 
-const struct sample_reg sample_reg_masks[] = {
+static const struct sample_reg sample_reg_masks[] = {
 	SMPL_REG(AX, PERF_REG_X86_AX),
 	SMPL_REG(BX, PERF_REG_X86_BX),
 	SMPL_REG(CX, PERF_REG_X86_CX),
@@ -276,6 +276,11 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 	return SDT_ARG_VALID;
 }
 
+const struct sample_reg *arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
+
 uint64_t arch__intr_reg_mask(void)
 {
 	struct perf_event_attr attr = {
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index 469555ae9b3c..c3d89d6ba1bf 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -15,6 +15,7 @@
 #include "../../../util/pmu.h"
 #include "../../../util/fncache.h"
 #include "../../../util/pmus.h"
+#include "mem-events.h"
 #include "env.h"
 
 void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
@@ -30,14 +31,14 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
 		pmu->selectable = true;
 	}
 #endif
-}
-
-int perf_pmus__num_mem_pmus(void)
-{
-	/* AMD uses IBS OP pmu and not a core PMU for perf mem/c2c */
-	if (x86__is_amd_cpu())
-		return 1;
 
-	/* Intel uses core pmus for perf mem/c2c */
-	return perf_pmus__num_core_pmus();
+	if (x86__is_amd_cpu()) {
+		if (!strcmp(pmu->name, "ibs_op"))
+			pmu->mem_events = perf_mem_events_amd;
+	} else if (pmu->is_core) {
+		if (perf_pmu__have_event(pmu, "mem-loads-aux"))
+			pmu->mem_events = perf_mem_events_intel_aux;
+		else
+			pmu->mem_events = perf_mem_events_intel;
+	}
 }
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
index 9b99f48b923c..e2d6cfe21057 100644
--- a/tools/perf/arch/x86/util/tsc.c
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -33,7 +33,7 @@ static double cpuinfo_tsc_freq(void)
 
 	cpuinfo = fopen("/proc/cpuinfo", "r");
 	if (!cpuinfo) {
-		pr_err("Failed to read /proc/cpuinfo for TSC frequency");
+		pr_err("Failed to read /proc/cpuinfo for TSC frequency\n");
 		return NAN;
 	}
 	while (getline(&line, &len, cpuinfo) > 0) {
@@ -48,7 +48,7 @@ static double cpuinfo_tsc_freq(void)
 	}
 out:
 	if (fpclassify(result) == FP_ZERO)
-		pr_err("Failed to find TSC frequency in /proc/cpuinfo");
+		pr_err("Failed to find TSC frequency in /proc/cpuinfo\n");
 
 	free(line);
 	fclose(cpuinfo);
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index f78eea9e2153..16b40f5d43db 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -3215,12 +3215,19 @@ static int parse_record_events(const struct option *opt,
 			       const char *str, int unset __maybe_unused)
 {
 	bool *event_set = (bool *) opt->value;
+	struct perf_pmu *pmu;
+
+	pmu = perf_mem_events_find_pmu();
+	if (!pmu) {
+		pr_err("failed: there is no PMU that supports perf c2c\n");
+		exit(-1);
+	}
 
 	if (!strcmp(str, "list")) {
-		perf_mem_events__list();
+		perf_pmu__mem_events_list(pmu);
 		exit(0);
 	}
-	if (perf_mem_events__parse(str))
+	if (perf_pmu__mem_events_parse(pmu, str))
 		exit(-1);
 
 	*event_set = true;
@@ -3238,13 +3245,13 @@ static const char * const *record_mem_usage = __usage_record;
 
 static int perf_c2c__record(int argc, const char **argv)
 {
-	int rec_argc, i = 0, j, rec_tmp_nr = 0;
+	int rec_argc, i = 0, j;
 	const char **rec_argv;
-	char **rec_tmp;
 	int ret;
 	bool all_user = false, all_kernel = false;
 	bool event_set = false;
 	struct perf_mem_event *e;
+	struct perf_pmu *pmu;
 	struct option options[] = {
 	OPT_CALLBACK('e', "event", &event_set, "event",
 		     "event selector. Use 'perf c2c record -e list' to list available events",
@@ -3256,7 +3263,13 @@ static int perf_c2c__record(int argc, const char **argv)
 	OPT_END()
 	};
 
-	if (perf_mem_events__init()) {
+	pmu = perf_mem_events_find_pmu();
+	if (!pmu) {
+		pr_err("failed: no PMU supports the memory events\n");
+		return -1;
+	}
+
+	if (perf_pmu__mem_events_init(pmu)) {
 		pr_err("failed: memory events not supported\n");
 		return -1;
 	}
@@ -3265,22 +3278,16 @@ static int perf_c2c__record(int argc, const char **argv)
 			     PARSE_OPT_KEEP_UNKNOWN);
 
 	/* Max number of arguments multiplied by number of PMUs that can support them. */
-	rec_argc = argc + 11 * perf_pmus__num_mem_pmus();
+	rec_argc = argc + 11 * (perf_pmu__mem_events_num_mem_pmus(pmu) + 1);
 
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
 	if (!rec_argv)
 		return -1;
 
-	rec_tmp = calloc(rec_argc + 1, sizeof(char *));
-	if (!rec_tmp) {
-		free(rec_argv);
-		return -1;
-	}
-
 	rec_argv[i++] = "record";
 
 	if (!event_set) {
-		e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD_STORE);
+		e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD_STORE);
 		/*
 		 * The load and store operations are required, use the event
 		 * PERF_MEM_EVENTS__LOAD_STORE if it is supported.
@@ -3289,15 +3296,15 @@ static int perf_c2c__record(int argc, const char **argv)
 			e->record = true;
 			rec_argv[i++] = "-W";
 		} else {
-			e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD);
+			e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD);
 			e->record = true;
 
-			e = perf_mem_events__ptr(PERF_MEM_EVENTS__STORE);
+			e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__STORE);
 			e->record = true;
 		}
 	}
 
-	e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD);
+	e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD);
 	if (e->record)
 		rec_argv[i++] = "-W";
 
@@ -3305,7 +3312,7 @@ static int perf_c2c__record(int argc, const char **argv)
 	rec_argv[i++] = "--phys-data";
 	rec_argv[i++] = "--sample-cpu";
 
-	ret = perf_mem_events__record_args(rec_argv, &i, rec_tmp, &rec_tmp_nr);
+	ret = perf_mem_events__record_args(rec_argv, &i);
 	if (ret)
 		goto out;
 
@@ -3332,10 +3339,6 @@ static int perf_c2c__record(int argc, const char **argv)
 
 	ret = cmd_record(i, rec_argv);
 out:
-	for (i = 0; i < rec_tmp_nr; i++)
-		free(rec_tmp[i]);
-
-	free(rec_tmp);
 	free(rec_argv);
 	return ret;
 }
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index e27a1b1288c2..02bf608d585e 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -208,17 +208,24 @@ static void default_print_metric(void *ps,
 	if (!print_state->last_metricgroups ||
 	    strcmp(print_state->last_metricgroups, group ?: "")) {
 		if (group && print_state->metricgroups) {
-			if (print_state->name_only)
+			if (print_state->name_only) {
 				fprintf(fp, "%s ", group);
-			else if (print_state->metrics) {
-				const char *gdesc = describe_metricgroup(group);
+			} else {
+				const char *gdesc = print_state->desc
+					? describe_metricgroup(group)
+					: NULL;
+				const char *print_colon = "";
+
+				if (print_state->metrics) {
+					print_colon = ":";
+					fputc('\n', fp);
+				}
 
 				if (gdesc)
-					fprintf(fp, "\n%s: [%s]\n", group, gdesc);
+					fprintf(fp, "%s%s [%s]\n", group, print_colon, gdesc);
 				else
-					fprintf(fp, "\n%s:\n", group);
-			} else
-				fprintf(fp, "%s\n", group);
+					fprintf(fp, "%s%s\n", group, print_colon);
+			}
 		}
 		zfree(&print_state->last_metricgroups);
 		print_state->last_metricgroups = strdup(group ?: "");
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 51499c20da01..5b851e64e4a1 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -43,12 +43,19 @@ static int parse_record_events(const struct option *opt,
 			       const char *str, int unset __maybe_unused)
 {
 	struct perf_mem *mem = *(struct perf_mem **)opt->value;
+	struct perf_pmu *pmu;
+
+	pmu = perf_mem_events_find_pmu();
+	if (!pmu) {
+		pr_err("failed: there is no PMU that supports perf mem\n");
+		exit(-1);
+	}
 
 	if (!strcmp(str, "list")) {
-		perf_mem_events__list();
+		perf_pmu__mem_events_list(pmu);
 		exit(0);
 	}
-	if (perf_mem_events__parse(str))
+	if (perf_pmu__mem_events_parse(pmu, str))
 		exit(-1);
 
 	mem->operation = 0;
@@ -65,13 +72,13 @@ static const char * const *record_mem_usage = __usage;
 
 static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 {
-	int rec_argc, i = 0, j, tmp_nr = 0;
+	int rec_argc, i = 0, j;
 	int start, end;
 	const char **rec_argv;
-	char **rec_tmp;
 	int ret;
 	bool all_user = false, all_kernel = false;
 	struct perf_mem_event *e;
+	struct perf_pmu *pmu;
 	struct option options[] = {
 	OPT_CALLBACK('e', "event", &mem, "event",
 		     "event selector. use 'perf mem record -e list' to list available events",
@@ -84,7 +91,13 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 	OPT_END()
 	};
 
-	if (perf_mem_events__init()) {
+	pmu = perf_mem_events_find_pmu();
+	if (!pmu) {
+		pr_err("failed: no PMU supports the memory events\n");
+		return -1;
+	}
+
+	if (perf_pmu__mem_events_init(pmu)) {
 		pr_err("failed: memory events not supported\n");
 		return -1;
 	}
@@ -93,7 +106,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 			     PARSE_OPT_KEEP_UNKNOWN);
 
 	/* Max number of arguments multiplied by number of PMUs that can support them. */
-	rec_argc = argc + 9 * perf_pmus__num_mem_pmus();
+	rec_argc = argc + 9 * (perf_pmu__mem_events_num_mem_pmus(pmu) + 1);
 
 	if (mem->cpu_list)
 		rec_argc += 2;
@@ -102,18 +115,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 	if (!rec_argv)
 		return -1;
 
-	/*
-	 * Save the allocated event name strings.
-	 */
-	rec_tmp = calloc(rec_argc + 1, sizeof(char *));
-	if (!rec_tmp) {
-		free(rec_argv);
-		return -1;
-	}
-
 	rec_argv[i++] = "record";
 
-	e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD_STORE);
+	e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD_STORE);
 
 	/*
 	 * The load and store operations are required, use the event
@@ -126,17 +130,17 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 		rec_argv[i++] = "-W";
 	} else {
 		if (mem->operation & MEM_OPERATION_LOAD) {
-			e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD);
+			e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD);
 			e->record = true;
 		}
 
 		if (mem->operation & MEM_OPERATION_STORE) {
-			e = perf_mem_events__ptr(PERF_MEM_EVENTS__STORE);
+			e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__STORE);
 			e->record = true;
 		}
 	}
 
-	e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD);
+	e = perf_pmu__mem_events_ptr(pmu, PERF_MEM_EVENTS__LOAD);
 	if (e->record)
 		rec_argv[i++] = "-W";
 
@@ -149,7 +153,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 		rec_argv[i++] = "--data-page-size";
 
 	start = i;
-	ret = perf_mem_events__record_args(rec_argv, &i, rec_tmp, &tmp_nr);
+	ret = perf_mem_events__record_args(rec_argv, &i);
 	if (ret)
 		goto out;
 	end = i;
@@ -179,10 +183,6 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 
 	ret = cmd_record(i, rec_argv);
 out:
-	for (i = 0; i < tmp_nr; i++)
-		free(rec_tmp[i]);
-
-	free(rec_tmp);
 	free(rec_argv);
 	return ret;
 }
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 86c910125172..ff7e1d6cfcd2 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1773,8 +1773,11 @@ record__finish_output(struct record *rec)
 	struct perf_data *data = &rec->data;
 	int fd = perf_data__fd(data);
 
-	if (data->is_pipe)
+	if (data->is_pipe) {
+		/* Just to display approx. size */
+		data->file.size = rec->bytes_written;
 		return;
+	}
 
 	rec->session->header.data_size += rec->bytes_written;
 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
@@ -1830,8 +1833,8 @@ static int
 record__switch_output(struct record *rec, bool at_exit)
 {
 	struct perf_data *data = &rec->data;
+	char *new_filename = NULL;
 	int fd, err;
-	char *new_filename;
 
 	/* Same Size:      "2015122520103046"*/
 	char timestamp[] = "InvalidTimestamp";
@@ -1853,16 +1856,17 @@ record__switch_output(struct record *rec, bool at_exit)
 	}
 
 	fd = perf_data__switch(data, timestamp,
-				    rec->session->header.data_offset,
-				    at_exit, &new_filename);
+			       rec->session->header.data_offset,
+			       at_exit, &new_filename);
 	if (fd >= 0 && !at_exit) {
 		rec->bytes_written = 0;
 		rec->session->header.data_size = 0;
 	}
 
-	if (!quiet)
+	if (!quiet) {
 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
 			data->path, timestamp);
+	}
 
 	if (rec->switch_output.num_files) {
 		int n = rec->switch_output.cur_file + 1;
@@ -2472,6 +2476,11 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
 		rec->opts.sample_id = true;
 
+	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
+		rec->timestamp_filename = false;
+		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
+	}
+
 	evlist__uniquify_name(rec->evlist);
 
 	/* Debug message used by test scripts */
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f2ed2b7e80a3..dcd93ee5fc24 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -59,6 +59,7 @@
 #include <linux/ctype.h>
 #include <signal.h>
 #include <linux/bitmap.h>
+#include <linux/list_sort.h>
 #include <linux/string.h>
 #include <linux/stringify.h>
 #include <linux/time64.h>
@@ -828,35 +829,6 @@ static void tasks_setup(struct report *rep)
 	rep->tool.no_warn = true;
 }
 
-struct task {
-	struct thread		*thread;
-	struct list_head	 list;
-	struct list_head	 children;
-};
-
-static struct task *tasks_list(struct task *task, struct machine *machine)
-{
-	struct thread *parent_thread, *thread = task->thread;
-	struct task   *parent_task;
-
-	/* Already listed. */
-	if (!list_empty(&task->list))
-		return NULL;
-
-	/* Last one in the chain. */
-	if (thread__ppid(thread) == -1)
-		return task;
-
-	parent_thread = machine__find_thread(machine, -1, thread__ppid(thread));
-	if (!parent_thread)
-		return ERR_PTR(-ENOENT);
-
-	parent_task = thread__priv(parent_thread);
-	thread__put(parent_thread);
-	list_add_tail(&task->list, &parent_task->children);
-	return tasks_list(parent_task, machine);
-}
-
 struct maps__fprintf_task_args {
 	int indent;
 	FILE *fp;
@@ -900,89 +872,156 @@ static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp)
 	return args.printed;
 }
 
-static void task__print_level(struct task *task, FILE *fp, int level)
+static int thread_level(struct machine *machine, const struct thread *thread)
 {
-	struct thread *thread = task->thread;
-	struct task *child;
-	int comm_indent = fprintf(fp, "  %8d %8d %8d |%*s",
-				  thread__pid(thread), thread__tid(thread),
-				  thread__ppid(thread), level, "");
+	struct thread *parent_thread;
+	int res;
 
-	fprintf(fp, "%s\n", thread__comm_str(thread));
+	if (thread__tid(thread) <= 0)
+		return 0;
 
-	maps__fprintf_task(thread__maps(thread), comm_indent, fp);
+	if (thread__ppid(thread) <= 0)
+		return 1;
 
-	if (!list_empty(&task->children)) {
-		list_for_each_entry(child, &task->children, list)
-			task__print_level(child, fp, level + 1);
+	parent_thread = machine__find_thread(machine, -1, thread__ppid(thread));
+	if (!parent_thread) {
+		pr_err("Missing parent thread of %d\n", thread__tid(thread));
+		return 0;
 	}
+	res = 1 + thread_level(machine, parent_thread);
+	thread__put(parent_thread);
+	return res;
 }
 
-static int tasks_print(struct report *rep, FILE *fp)
+static void task__print_level(struct machine *machine, struct thread *thread, FILE *fp)
 {
-	struct perf_session *session = rep->session;
-	struct machine      *machine = &session->machines.host;
-	struct task *tasks, *task;
-	unsigned int nr = 0, itask = 0, i;
-	struct rb_node *nd;
-	LIST_HEAD(list);
+	int level = thread_level(machine, thread);
+	int comm_indent = fprintf(fp, "  %8d %8d %8d |%*s",
+				  thread__pid(thread), thread__tid(thread),
+				  thread__ppid(thread), level, "");
 
-	/*
-	 * No locking needed while accessing machine->threads,
-	 * because --tasks is single threaded command.
-	 */
+	fprintf(fp, "%s\n", thread__comm_str(thread));
 
-	/* Count all the threads. */
-	for (i = 0; i < THREADS__TABLE_SIZE; i++)
-		nr += machine->threads[i].nr;
+	maps__fprintf_task(thread__maps(thread), comm_indent, fp);
+}
 
-	tasks = malloc(sizeof(*tasks) * nr);
-	if (!tasks)
-		return -ENOMEM;
+/*
+ * Sort two thread list nodes such that they form a tree. The first node is the
+ * root of the tree, its children are ordered numerically after it. If a child
+ * has children itself then they appear immediately after their parent. For
+ * example, the 4 threads in the order they'd appear in the list:
+ * - init with a TID 1 and a parent of 0
+ * - systemd with a TID 3000 and a parent of init/1
+ * - systemd child thread with TID 4000, the parent is 3000
+ * - NetworkManager is a child of init with a TID of 3500.
+ */
+static int task_list_cmp(void *priv, const struct list_head *la, const struct list_head *lb)
+{
+	struct machine *machine = priv;
+	struct thread_list *task_a = list_entry(la, struct thread_list, list);
+	struct thread_list *task_b = list_entry(lb, struct thread_list, list);
+	struct thread *a = task_a->thread;
+	struct thread *b = task_b->thread;
+	int level_a, level_b, res;
+
+	/* Same thread? */
+	if (thread__tid(a) == thread__tid(b))
+		return 0;
 
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		struct threads *threads = &machine->threads[i];
+	/* Compare a and b to root. */
+	if (thread__tid(a) == 0)
+		return -1;
 
-		for (nd = rb_first_cached(&threads->entries); nd;
-		     nd = rb_next(nd)) {
-			task = tasks + itask++;
+	if (thread__tid(b) == 0)
+		return 1;
 
-			task->thread = rb_entry(nd, struct thread_rb_node, rb_node)->thread;
-			INIT_LIST_HEAD(&task->children);
-			INIT_LIST_HEAD(&task->list);
-			thread__set_priv(task->thread, task);
-		}
-	}
+	/* If parents match sort by tid. */
+	if (thread__ppid(a) == thread__ppid(b))
+		return thread__tid(a) < thread__tid(b) ? -1 : 1;
 
 	/*
-	 * Iterate every task down to the unprocessed parent
-	 * and link all in task children list. Task with no
-	 * parent is added into 'list'.
+	 * Find a and b such that if they are a child of each other a and b's
+	 * tid's match, otherwise a and b have a common parent and distinct
+	 * tid's to sort by. First make the depths of the threads match.
 	 */
-	for (itask = 0; itask < nr; itask++) {
-		task = tasks + itask;
-
-		if (!list_empty(&task->list))
-			continue;
-
-		task = tasks_list(task, machine);
-		if (IS_ERR(task)) {
-			pr_err("Error: failed to process tasks\n");
-			free(tasks);
-			return PTR_ERR(task);
+	level_a = thread_level(machine, a);
+	level_b = thread_level(machine, b);
+	a = thread__get(a);
+	b = thread__get(b);
+	for (int i = level_a; i > level_b; i--) {
+		struct thread *parent = machine__find_thread(machine, -1, thread__ppid(a));
+
+		thread__put(a);
+		if (!parent) {
+			pr_err("Missing parent thread of %d\n", thread__tid(a));
+			thread__put(b);
+			return -1;
 		}
+		a = parent;
+	}
+	for (int i = level_b; i > level_a; i--) {
+		struct thread *parent = machine__find_thread(machine, -1, thread__ppid(b));
 
-		if (task)
-			list_add_tail(&task->list, &list);
+		thread__put(b);
+		if (!parent) {
+			pr_err("Missing parent thread of %d\n", thread__tid(b));
+			thread__put(a);
+			return 1;
+		}
+		b = parent;
+	}
+	/* Search up to a common parent. */
+	while (thread__ppid(a) != thread__ppid(b)) {
+		struct thread *parent;
+
+		parent = machine__find_thread(machine, -1, thread__ppid(a));
+		thread__put(a);
+		if (!parent)
+			pr_err("Missing parent thread of %d\n", thread__tid(a));
+		a = parent;
+		parent = machine__find_thread(machine, -1, thread__ppid(b));
+		thread__put(b);
+		if (!parent)
+			pr_err("Missing parent thread of %d\n", thread__tid(b));
+		b = parent;
+		if (!a || !b) {
+			/* Handle missing parent (unexpected) with some sanity. */
+			thread__put(a);
+			thread__put(b);
+			return !a && !b ? 0 : (!a ? -1 : 1);
+		}
+	}
+	if (thread__tid(a) == thread__tid(b)) {
+		/* a is a child of b or vice-versa, deeper levels appear later. */
+		res = level_a < level_b ? -1 : (level_a > level_b ? 1 : 0);
+	} else {
+		/* Sort by tid now the parent is the same. */
+		res = thread__tid(a) < thread__tid(b) ? -1 : 1;
 	}
+	thread__put(a);
+	thread__put(b);
+	return res;
+}
 
-	fprintf(fp, "# %8s %8s %8s  %s\n", "pid", "tid", "ppid", "comm");
+static int tasks_print(struct report *rep, FILE *fp)
+{
+	struct machine *machine = &rep->session->machines.host;
+	LIST_HEAD(tasks);
+	int ret;
 
-	list_for_each_entry(task, &list, list)
-		task__print_level(task, fp, 0);
+	ret = machine__thread_list(machine, &tasks);
+	if (!ret) {
+		struct thread_list *task;
 
-	free(tasks);
-	return 0;
+		list_sort(machine, &tasks, task_list_cmp);
+
+		fprintf(fp, "# %8s %8s %8s  %s\n", "pid", "tid", "ppid", "comm");
+
+		list_for_each_entry(task, &tasks, list)
+			task__print_level(machine, task->thread, fp);
+	}
+	thread_list__delete(&tasks);
+	return ret;
 }
 
 static int __cmd_report(struct report *rep)
@@ -1410,7 +1449,7 @@ int cmd_report(int argc, const char **argv)
 		    "only show processor socket that match with this filter"),
 	OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
 		    "Show raw trace event output (do not use print fmt or plugins)"),
-	OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+	OPT_BOOLEAN('H', "hierarchy", &symbol_conf.report_hierarchy,
 		    "Show entries in a hierarchy"),
 	OPT_CALLBACK_DEFAULT(0, "stdio-color", NULL, "mode",
 			     "'always' (default), 'never' or 'auto' only applicable to --stdio mode",
@@ -1766,6 +1805,8 @@ repeat:
 	} else
 		ret = 0;
 
+	if (!use_browser && (verbose > 2 || debug_kmaps))
+		perf_session__dump_kmaps(session);
 error:
 	if (report.ptime_range) {
 		itrace_synth_opts__clear_time_range(&itrace_synth_opts);
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index dd6065afbbaf..b248c433529a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -92,24 +92,6 @@ struct sched_atom {
 	struct task_desc	*wakee;
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
-
-/* task state bitmask, copied from include/linux/sched.h */
-#define TASK_RUNNING		0
-#define TASK_INTERRUPTIBLE	1
-#define TASK_UNINTERRUPTIBLE	2
-#define __TASK_STOPPED		4
-#define __TASK_TRACED		8
-/* in tsk->exit_state */
-#define EXIT_DEAD		16
-#define EXIT_ZOMBIE		32
-#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
-/* in tsk->state again */
-#define TASK_DEAD		64
-#define TASK_WAKEKILL		128
-#define TASK_WAKING		256
-#define TASK_PARKED		512
-
 enum thread_state {
 	THREAD_SLEEPING = 0,
 	THREAD_WAIT_CPU,
@@ -266,7 +248,7 @@ struct thread_runtime {
 	u64 total_preempt_time;
 	u64 total_delay_time;
 
-	int last_state;
+	char last_state;
 
 	char shortname[3];
 	bool comm_changed;
@@ -436,7 +418,7 @@ static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *t
 }
 
 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
-				  u64 timestamp, u64 task_state __maybe_unused)
+				  u64 timestamp, const char task_state __maybe_unused)
 {
 	struct sched_atom *event = get_new_event(task, timestamp);
 
@@ -860,7 +842,7 @@ static int replay_switch_event(struct perf_sched *sched,
 		   *next_comm  = evsel__strval(evsel, sample, "next_comm");
 	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
 		  next_pid = evsel__intval(evsel, sample, "next_pid");
-	const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
+	const char prev_state = evsel__taskstate(evsel, sample, "prev_state");
 	struct task_desc *prev, __maybe_unused *next;
 	u64 timestamp0, timestamp = sample->time;
 	int cpu = sample->cpu;
@@ -1050,13 +1032,6 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 	return 0;
 }
 
-static char sched_out_state(u64 prev_state)
-{
-	const char *str = TASK_STATE_TO_CHAR_STR;
-
-	return str[prev_state];
-}
-
 static int
 add_sched_out_event(struct work_atoms *atoms,
 		    char run_state,
@@ -1132,7 +1107,7 @@ static int latency_switch_event(struct perf_sched *sched,
 {
 	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
 		  next_pid = evsel__intval(evsel, sample, "next_pid");
-	const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
+	const char prev_state = evsel__taskstate(evsel, sample, "prev_state");
 	struct work_atoms *out_events, *in_events;
 	struct thread *sched_out, *sched_in;
 	u64 timestamp0, timestamp = sample->time;
@@ -1168,7 +1143,7 @@ static int latency_switch_event(struct perf_sched *sched,
 			goto out_put;
 		}
 	}
-	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
+	if (add_sched_out_event(out_events, prev_state, timestamp))
 		return -1;
 
 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
@@ -2033,24 +2008,12 @@ static void timehist_header(struct perf_sched *sched)
 	printf("\n");
 }
 
-static char task_state_char(struct thread *thread, int state)
-{
-	static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-	unsigned bit = state ? ffs(state) : 0;
-
-	/* 'I' for idle */
-	if (thread__tid(thread) == 0)
-		return 'I';
-
-	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
 static void timehist_print_sample(struct perf_sched *sched,
 				  struct evsel *evsel,
 				  struct perf_sample *sample,
 				  struct addr_location *al,
 				  struct thread *thread,
-				  u64 t, int state)
+				  u64 t, const char state)
 {
 	struct thread_runtime *tr = thread__priv(thread);
 	const char *next_comm = evsel__strval(evsel, sample, "next_comm");
@@ -2091,7 +2054,7 @@ static void timehist_print_sample(struct perf_sched *sched,
 	print_sched_time(tr->dt_run, 6);
 
 	if (sched->show_state)
-		printf(" %5c ", task_state_char(thread, state));
+		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
 
 	if (sched->show_next) {
 		snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
@@ -2163,9 +2126,9 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
 		else if (r->last_time) {
 			u64 dt_wait = tprev - r->last_time;
 
-			if (r->last_state == TASK_RUNNING)
+			if (r->last_state == 'R')
 				r->dt_preempt = dt_wait;
-			else if (r->last_state == TASK_UNINTERRUPTIBLE)
+			else if (r->last_state == 'D')
 				r->dt_iowait = dt_wait;
 			else
 				r->dt_sleep = dt_wait;
@@ -2590,7 +2553,7 @@ static int timehist_sched_change_event(struct perf_tool *tool,
 	struct thread_runtime *tr = NULL;
 	u64 tprev, t = sample->time;
 	int rc = 0;
-	int state = evsel__intval(evsel, sample, "prev_state");
+	const char state = evsel__taskstate(evsel, sample, "prev_state");
 
 	addr_location__init(&al);
 	if (machine__resolve(machine, &al, sample) < 0) {
@@ -3204,14 +3167,44 @@ static void perf_sched__merge_lat(struct perf_sched *sched)
 	}
 }
 
+static int setup_cpus_switch_event(struct perf_sched *sched)
+{
+	unsigned int i;
+
+	sched->cpu_last_switched = calloc(MAX_CPUS, sizeof(*(sched->cpu_last_switched)));
+	if (!sched->cpu_last_switched)
+		return -1;
+
+	sched->curr_pid = malloc(MAX_CPUS * sizeof(*(sched->curr_pid)));
+	if (!sched->curr_pid) {
+		zfree(&sched->cpu_last_switched);
+		return -1;
+	}
+
+	for (i = 0; i < MAX_CPUS; i++)
+		sched->curr_pid[i] = -1;
+
+	return 0;
+}
+
+static void free_cpus_switch_event(struct perf_sched *sched)
+{
+	zfree(&sched->curr_pid);
+	zfree(&sched->cpu_last_switched);
+}
+
 static int perf_sched__lat(struct perf_sched *sched)
 {
+	int rc = -1;
 	struct rb_node *next;
 
 	setup_pager();
 
+	if (setup_cpus_switch_event(sched))
+		return rc;
+
 	if (perf_sched__read_events(sched))
-		return -1;
+		goto out_free_cpus_switch_event;
 
 	perf_sched__merge_lat(sched);
 	perf_sched__sort_lat(sched);
@@ -3240,13 +3233,15 @@ static int perf_sched__lat(struct perf_sched *sched)
 	print_bad_events(sched);
 	printf("\n");
 
-	return 0;
+	rc = 0;
+
+out_free_cpus_switch_event:
+	free_cpus_switch_event(sched);
+	return rc;
 }
 
 static int setup_map_cpus(struct perf_sched *sched)
 {
-	struct perf_cpu_map *map;
-
 	sched->max_cpu.cpu  = sysconf(_SC_NPROCESSORS_CONF);
 
 	if (sched->map.comp) {
@@ -3255,16 +3250,15 @@ static int setup_map_cpus(struct perf_sched *sched)
 			return -1;
 	}
 
-	if (!sched->map.cpus_str)
-		return 0;
-
-	map = perf_cpu_map__new(sched->map.cpus_str);
-	if (!map) {
-		pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
-		return -1;
+	if (sched->map.cpus_str) {
+		sched->map.cpus = perf_cpu_map__new(sched->map.cpus_str);
+		if (!sched->map.cpus) {
+			pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
+			zfree(&sched->map.comp_cpus);
+			return -1;
+		}
 	}
 
-	sched->map.cpus = map;
 	return 0;
 }
 
@@ -3304,33 +3298,69 @@ static int setup_color_cpus(struct perf_sched *sched)
 
 static int perf_sched__map(struct perf_sched *sched)
 {
+	int rc = -1;
+
+	sched->curr_thread = calloc(MAX_CPUS, sizeof(*(sched->curr_thread)));
+	if (!sched->curr_thread)
+		return rc;
+
+	if (setup_cpus_switch_event(sched))
+		goto out_free_curr_thread;
+
 	if (setup_map_cpus(sched))
-		return -1;
+		goto out_free_cpus_switch_event;
 
 	if (setup_color_pids(sched))
-		return -1;
+		goto out_put_map_cpus;
 
 	if (setup_color_cpus(sched))
-		return -1;
+		goto out_put_color_pids;
 
 	setup_pager();
 	if (perf_sched__read_events(sched))
-		return -1;
+		goto out_put_color_cpus;
+
+	rc = 0;
 	print_bad_events(sched);
-	return 0;
+
+out_put_color_cpus:
+	perf_cpu_map__put(sched->map.color_cpus);
+
+out_put_color_pids:
+	perf_thread_map__put(sched->map.color_pids);
+
+out_put_map_cpus:
+	zfree(&sched->map.comp_cpus);
+	perf_cpu_map__put(sched->map.cpus);
+
+out_free_cpus_switch_event:
+	free_cpus_switch_event(sched);
+
+out_free_curr_thread:
+	zfree(&sched->curr_thread);
+	return rc;
 }
 
 static int perf_sched__replay(struct perf_sched *sched)
 {
+	int ret;
 	unsigned long i;
 
+	mutex_init(&sched->start_work_mutex);
+	mutex_init(&sched->work_done_wait_mutex);
+
+	ret = setup_cpus_switch_event(sched);
+	if (ret)
+		goto out_mutex_destroy;
+
 	calibrate_run_measurement_overhead(sched);
 	calibrate_sleep_measurement_overhead(sched);
 
 	test_calibrations(sched);
 
-	if (perf_sched__read_events(sched))
-		return -1;
+	ret = perf_sched__read_events(sched);
+	if (ret)
+		goto out_free_cpus_switch_event;
 
 	printf("nr_run_events:        %ld\n", sched->nr_run_events);
 	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
@@ -3355,7 +3385,14 @@ static int perf_sched__replay(struct perf_sched *sched)
 
 	sched->thread_funcs_exit = true;
 	destroy_tasks(sched);
-	return 0;
+
+out_free_cpus_switch_event:
+	free_cpus_switch_event(sched);
+
+out_mutex_destroy:
+	mutex_destroy(&sched->start_work_mutex);
+	mutex_destroy(&sched->work_done_wait_mutex);
+	return ret;
 }
 
 static void setup_sorting(struct perf_sched *sched, const struct option *options,
@@ -3590,28 +3627,7 @@ int cmd_sched(int argc, const char **argv)
 		.switch_event	    = replay_switch_event,
 		.fork_event	    = replay_fork_event,
 	};
-	unsigned int i;
-	int ret = 0;
-
-	mutex_init(&sched.start_work_mutex);
-	mutex_init(&sched.work_done_wait_mutex);
-	sched.curr_thread = calloc(MAX_CPUS, sizeof(*sched.curr_thread));
-	if (!sched.curr_thread) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	sched.cpu_last_switched = calloc(MAX_CPUS, sizeof(*sched.cpu_last_switched));
-	if (!sched.cpu_last_switched) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	sched.curr_pid = malloc(MAX_CPUS * sizeof(*sched.curr_pid));
-	if (!sched.curr_pid) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	for (i = 0; i < MAX_CPUS; i++)
-		sched.curr_pid[i] = -1;
+	int ret;
 
 	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
 					sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
@@ -3622,9 +3638,9 @@ int cmd_sched(int argc, const char **argv)
 	 * Aliased to 'perf script' for now:
 	 */
 	if (!strcmp(argv[0], "script")) {
-		ret = cmd_script(argc, argv);
+		return cmd_script(argc, argv);
 	} else if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
-		ret = __cmd_record(argc, argv);
+		return __cmd_record(argc, argv);
 	} else if (strlen(argv[0]) > 2 && strstarts("latency", argv[0])) {
 		sched.tp_handler = &lat_ops;
 		if (argc > 1) {
@@ -3633,7 +3649,7 @@ int cmd_sched(int argc, const char **argv)
 				usage_with_options(latency_usage, latency_options);
 		}
 		setup_sorting(&sched, latency_options, latency_usage);
-		ret = perf_sched__lat(&sched);
+		return perf_sched__lat(&sched);
 	} else if (!strcmp(argv[0], "map")) {
 		if (argc) {
 			argc = parse_options(argc, argv, map_options, map_usage, 0);
@@ -3642,7 +3658,7 @@ int cmd_sched(int argc, const char **argv)
 		}
 		sched.tp_handler = &map_ops;
 		setup_sorting(&sched, latency_options, latency_usage);
-		ret = perf_sched__map(&sched);
+		return perf_sched__map(&sched);
 	} else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) {
 		sched.tp_handler = &replay_ops;
 		if (argc) {
@@ -3650,7 +3666,7 @@ int cmd_sched(int argc, const char **argv)
 			if (argc)
 				usage_with_options(replay_usage, replay_options);
 		}
-		ret = perf_sched__replay(&sched);
+		return perf_sched__replay(&sched);
 	} else if (!strcmp(argv[0], "timehist")) {
 		if (argc) {
 			argc = parse_options(argc, argv, timehist_options,
@@ -3666,24 +3682,16 @@ int cmd_sched(int argc, const char **argv)
 				parse_options_usage(NULL, timehist_options, "w", true);
 			if (sched.show_next)
 				parse_options_usage(NULL, timehist_options, "n", true);
-			ret = -EINVAL;
-			goto out;
+			return -EINVAL;
 		}
 		ret = symbol__validate_sym_arguments();
 		if (ret)
-			goto out;
+			return ret;
 
-		ret = perf_sched__timehist(&sched);
+		return perf_sched__timehist(&sched);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
 
-out:
-	free(sched.curr_pid);
-	free(sched.cpu_last_switched);
-	free(sched.curr_thread);
-	mutex_destroy(&sched.start_work_mutex);
-	mutex_destroy(&sched.work_done_wait_mutex);
-
-	return ret;
+	return 0;
 }
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b1f57401ff23..37088cc0ff1b 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -34,6 +34,7 @@
 #include "util/event.h"
 #include "ui/ui.h"
 #include "print_binary.h"
+#include "print_insn.h"
 #include "archinsn.h"
 #include <linux/bitmap.h>
 #include <linux/kernel.h>
@@ -134,6 +135,7 @@ enum perf_output_field {
 	PERF_OUTPUT_CGROUP          = 1ULL << 39,
 	PERF_OUTPUT_RETIRE_LAT      = 1ULL << 40,
 	PERF_OUTPUT_DSOFF           = 1ULL << 41,
+	PERF_OUTPUT_DISASM          = 1ULL << 42,
 };
 
 struct perf_script {
@@ -189,6 +191,7 @@ struct output_option {
 	{.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
 	{.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
 	{.str = "insn", .field = PERF_OUTPUT_INSN},
+	{.str = "disasm", .field = PERF_OUTPUT_DISASM},
 	{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
 	{.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
 	{.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF},
@@ -1162,7 +1165,8 @@ out:
 static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en,
 			    struct perf_insn *x, u8 *inbuf, int len,
 			    int insn, FILE *fp, int *total_cycles,
-			    struct perf_event_attr *attr)
+			    struct perf_event_attr *attr,
+			    struct thread *thread)
 {
 	int ilen = 0;
 	int printed = fprintf(fp, "\t%016" PRIx64 "\t%-30s\t", ip,
@@ -1171,6 +1175,16 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en,
 	if (PRINT_FIELD(BRSTACKINSNLEN))
 		printed += fprintf(fp, "ilen: %d\t", ilen);
 
+	if (PRINT_FIELD(SRCLINE)) {
+		struct addr_location al;
+
+		addr_location__init(&al);
+		thread__find_map(thread, x->cpumode, ip, &al);
+		printed += map__fprintf_srcline(al.map, al.addr, " srcline: ", fp);
+		printed += fprintf(fp, "\t");
+		addr_location__exit(&al);
+	}
+
 	printed += fprintf(fp, "#%s%s%s%s",
 			      en->flags.predicted ? " PRED" : "",
 			      en->flags.mispred ? " MISPRED" : "",
@@ -1182,6 +1196,7 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en,
 		if (insn)
 			printed += fprintf(fp, " %.2f IPC", (float)insn / en->flags.cycles);
 	}
+
 	return printed + fprintf(fp, "\n");
 }
 
@@ -1260,7 +1275,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 					   x.cpumode, x.cpu, &lastsym, attr, fp);
 		printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1],
 					    &x, buffer, len, 0, fp, &total_cycles,
-					    attr);
+					    attr, thread);
 		if (PRINT_FIELD(SRCCODE))
 			printed += print_srccode(thread, x.cpumode, entries[nr - 1].from);
 	}
@@ -1291,7 +1306,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 			printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp);
 			if (ip == end) {
 				printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp,
-							    &total_cycles, attr);
+							    &total_cycles, attr, thread);
 				if (PRINT_FIELD(SRCCODE))
 					printed += print_srccode(thread, x.cpumode, ip);
 				break;
@@ -1511,11 +1526,12 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample,
 	if (PRINT_FIELD(INSNLEN))
 		printed += fprintf(fp, " ilen: %d", sample->insn_len);
 	if (PRINT_FIELD(INSN) && sample->insn_len) {
-		int i;
-
-		printed += fprintf(fp, " insn:");
-		for (i = 0; i < sample->insn_len; i++)
-			printed += fprintf(fp, " %02x", (unsigned char)sample->insn[i]);
+		printed += fprintf(fp, " insn: ");
+		printed += sample__fprintf_insn_raw(sample, fp);
+	}
+	if (PRINT_FIELD(DISASM) && sample->insn_len) {
+		printed += fprintf(fp, "\t\t");
+		printed += sample__fprintf_insn_asm(sample, thread, machine, fp);
 	}
 	if (PRINT_FIELD(BRSTACKINSN) || PRINT_FIELD(BRSTACKINSNLEN))
 		printed += perf_sample__fprintf_brstackinsn(sample, thread, attr, machine, fp);
@@ -3108,6 +3124,13 @@ parse:
 			rc = -EINVAL;
 			goto out;
 		}
+#ifndef HAVE_LIBCAPSTONE_SUPPORT
+		if (change != REMOVE && strcmp(tok, "disasm") == 0) {
+			fprintf(stderr, "Field \"disasm\" requires perf to be built with libcapstone support.\n");
+			rc = -EINVAL;
+			goto out;
+		}
+#endif
 
 		if (type == -1) {
 			/* add user option to all events types for
@@ -3765,10 +3788,24 @@ static int perf_script__process_auxtrace_info(struct perf_session *session,
 #endif
 
 static int parse_insn_trace(const struct option *opt __maybe_unused,
-			    const char *str __maybe_unused,
-			    int unset __maybe_unused)
+			    const char *str, int unset __maybe_unused)
 {
-	parse_output_fields(NULL, "+insn,-event,-period", 0);
+	const char *fields = "+insn,-event,-period";
+	int ret;
+
+	if (str) {
+		if (strcmp(str, "disasm") == 0)
+			fields = "+disasm,-event,-period";
+		else if (strlen(str) != 0 && strcmp(str, "raw") != 0) {
+			fprintf(stderr, "Only accept raw|disasm\n");
+			return -EINVAL;
+		}
+	}
+
+	ret = parse_output_fields(NULL, fields, 0);
+	if (ret < 0)
+		return ret;
+
 	itrace_parse_synth_opts(opt, "i0ns", 0);
 	symbol_conf.nanosecs = true;
 	return 0;
@@ -3902,7 +3939,7 @@ int cmd_script(int argc, const char **argv)
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,dsoff,"
 		     "addr,symoff,srcline,period,iregs,uregs,brstack,"
 		     "brstacksym,flags,data_src,weight,bpf-output,brstackinsn,"
-		     "brstackinsnlen,brstackoff,callindent,insn,insnlen,synth,"
+		     "brstackinsnlen,brstackoff,callindent,insn,disasm,insnlen,synth,"
 		     "phys_addr,metric,misc,srccode,ipc,tod,data_page_size,"
 		     "code_page_size,ins_lat,machine_pid,vcpu,cgroup,retire_lat",
 		     parse_output_fields),
@@ -3914,7 +3951,7 @@ int cmd_script(int argc, const char **argv)
 		   "only consider these symbols"),
 	OPT_INTEGER(0, "addr-range", &symbol_conf.addr_range,
 		    "Use with -S to list traced records within address range"),
-	OPT_CALLBACK_OPTARG(0, "insn-trace", &itrace_synth_opts, NULL, NULL,
+	OPT_CALLBACK_OPTARG(0, "insn-trace", &itrace_synth_opts, NULL, "raw|disasm",
 			"Decode instructions from itrace", parse_insn_trace),
 	OPT_CALLBACK_OPTARG(0, "xed", NULL, NULL, NULL,
 			"Run xed disassembler on output", parse_xed),
@@ -4366,6 +4403,9 @@ script_found:
 
 	flush_scripting();
 
+	if (verbose > 2 || debug_kmaps)
+		perf_session__dump_kmaps(session);
+
 out_delete:
 	if (script.ptime_range) {
 		itrace_synth_opts__clear_time_range(&itrace_synth_opts);
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 5fe9abc6a524..6bba1a89d030 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1238,6 +1238,8 @@ static struct option stat_options[] = {
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
 		     "aggregate counts per processor die", AGGR_DIE),
+	OPT_SET_UINT(0, "per-cluster", &stat_config.aggr_mode,
+		     "aggregate counts per processor cluster", AGGR_CLUSTER),
 	OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level,
 			    "cache level", "aggregate count at this cache level (Default: LLC)",
 			    parse_cache_level),
@@ -1428,6 +1430,7 @@ static struct aggr_cpu_id aggr_cpu_id__cache(struct perf_cpu cpu, void *data)
 static const char *const aggr_mode__string[] = {
 	[AGGR_CORE] = "core",
 	[AGGR_CACHE] = "cache",
+	[AGGR_CLUSTER] = "cluster",
 	[AGGR_DIE] = "die",
 	[AGGR_GLOBAL] = "global",
 	[AGGR_NODE] = "node",
@@ -1455,6 +1458,12 @@ static struct aggr_cpu_id perf_stat__get_cache_id(struct perf_stat_config *confi
 	return aggr_cpu_id__cache(cpu, /*data=*/NULL);
 }
 
+static struct aggr_cpu_id perf_stat__get_cluster(struct perf_stat_config *config __maybe_unused,
+						 struct perf_cpu cpu)
+{
+	return aggr_cpu_id__cluster(cpu, /*data=*/NULL);
+}
+
 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
 					      struct perf_cpu cpu)
 {
@@ -1507,6 +1516,12 @@ static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *con
 	return perf_stat__get_aggr(config, perf_stat__get_die, cpu);
 }
 
+static struct aggr_cpu_id perf_stat__get_cluster_cached(struct perf_stat_config *config,
+							struct perf_cpu cpu)
+{
+	return perf_stat__get_aggr(config, perf_stat__get_cluster, cpu);
+}
+
 static struct aggr_cpu_id perf_stat__get_cache_id_cached(struct perf_stat_config *config,
 							 struct perf_cpu cpu)
 {
@@ -1544,6 +1559,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode)
 		return aggr_cpu_id__socket;
 	case AGGR_DIE:
 		return aggr_cpu_id__die;
+	case AGGR_CLUSTER:
+		return aggr_cpu_id__cluster;
 	case AGGR_CACHE:
 		return aggr_cpu_id__cache;
 	case AGGR_CORE:
@@ -1569,6 +1586,8 @@ static aggr_get_id_t aggr_mode__get_id(enum aggr_mode aggr_mode)
 		return perf_stat__get_socket_cached;
 	case AGGR_DIE:
 		return perf_stat__get_die_cached;
+	case AGGR_CLUSTER:
+		return perf_stat__get_cluster_cached;
 	case AGGR_CACHE:
 		return perf_stat__get_cache_id_cached;
 	case AGGR_CORE:
@@ -1737,6 +1756,21 @@ static struct aggr_cpu_id perf_env__get_cache_aggr_by_cpu(struct perf_cpu cpu,
 	return id;
 }
 
+static struct aggr_cpu_id perf_env__get_cluster_aggr_by_cpu(struct perf_cpu cpu,
+							    void *data)
+{
+	struct perf_env *env = data;
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+	if (cpu.cpu != -1) {
+		id.socket = env->cpu[cpu.cpu].socket_id;
+		id.die = env->cpu[cpu.cpu].die_id;
+		id.cluster = env->cpu[cpu.cpu].cluster_id;
+	}
+
+	return id;
+}
+
 static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct perf_env *env = data;
@@ -1744,12 +1778,12 @@ static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, vo
 
 	if (cpu.cpu != -1) {
 		/*
-		 * core_id is relative to socket and die,
-		 * we need a global id. So we set
-		 * socket, die id and core id
+		 * core_id is relative to socket, die and cluster, we need a
+		 * global id. So we set socket, die id, cluster id and core id.
 		 */
 		id.socket = env->cpu[cpu.cpu].socket_id;
 		id.die = env->cpu[cpu.cpu].die_id;
+		id.cluster = env->cpu[cpu.cpu].cluster_id;
 		id.core = env->cpu[cpu.cpu].core_id;
 	}
 
@@ -1805,6 +1839,12 @@ static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *confi
 	return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
+static struct aggr_cpu_id perf_stat__get_cluster_file(struct perf_stat_config *config __maybe_unused,
+						      struct perf_cpu cpu)
+{
+	return perf_env__get_cluster_aggr_by_cpu(cpu, &perf_stat.session->header.env);
+}
+
 static struct aggr_cpu_id perf_stat__get_cache_file(struct perf_stat_config *config __maybe_unused,
 						    struct perf_cpu cpu)
 {
@@ -1842,6 +1882,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode)
 		return perf_env__get_socket_aggr_by_cpu;
 	case AGGR_DIE:
 		return perf_env__get_die_aggr_by_cpu;
+	case AGGR_CLUSTER:
+		return perf_env__get_cluster_aggr_by_cpu;
 	case AGGR_CACHE:
 		return perf_env__get_cache_aggr_by_cpu;
 	case AGGR_CORE:
@@ -1867,6 +1909,8 @@ static aggr_get_id_t aggr_mode__get_id_file(enum aggr_mode aggr_mode)
 		return perf_stat__get_socket_file;
 	case AGGR_DIE:
 		return perf_stat__get_die_file;
+	case AGGR_CLUSTER:
+		return perf_stat__get_cluster_file;
 	case AGGR_CACHE:
 		return perf_stat__get_cache_file;
 	case AGGR_CORE:
@@ -2398,6 +2442,8 @@ static int __cmd_report(int argc, const char **argv)
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
 		     "aggregate counts per processor die", AGGR_DIE),
+	OPT_SET_UINT(0, "per-cluster", &perf_stat.aggr_mode,
+		     "aggregate counts perf processor cluster", AGGR_CLUSTER),
 	OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
 			    "cache level",
 			    "aggregate count at this cache level (Default: LLC)",
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 5301d1badd43..5ac6dcc64cef 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1573,7 +1573,7 @@ int cmd_top(int argc, const char **argv)
 		    "add last branch records to call history"),
 	OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace,
 		    "Show raw trace event output (do not use print fmt or plugins)"),
-	OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
+	OPT_BOOLEAN('H', "hierarchy", &symbol_conf.report_hierarchy,
 		    "Show entries in a hierarchy"),
 	OPT_BOOLEAN(0, "overwrite", &top.record_opts.overwrite,
 		    "Use a backward ring buffer, default: no"),
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 109b8e64fe69..90eaff8c0f6e 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -74,6 +74,7 @@
 #include <linux/err.h>
 #include <linux/filter.h>
 #include <linux/kernel.h>
+#include <linux/list_sort.h>
 #include <linux/random.h>
 #include <linux/stringify.h>
 #include <linux/time64.h>
@@ -4312,34 +4313,38 @@ static unsigned long thread__nr_events(struct thread_trace *ttrace)
 	return ttrace ? ttrace->nr_events : 0;
 }
 
-DEFINE_RESORT_RB(threads,
-		(thread__nr_events(thread__priv(a->thread)) <
-		 thread__nr_events(thread__priv(b->thread))),
-	struct thread *thread;
-)
+static int trace_nr_events_cmp(void *priv __maybe_unused,
+			       const struct list_head *la,
+			       const struct list_head *lb)
 {
-	entry->thread = rb_entry(nd, struct thread_rb_node, rb_node)->thread;
+	struct thread_list *a = list_entry(la, struct thread_list, list);
+	struct thread_list *b = list_entry(lb, struct thread_list, list);
+	unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread));
+	unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread));
+
+	if (a_nr_events != b_nr_events)
+		return a_nr_events < b_nr_events ? -1 : 1;
+
+	/* Identical number of threads, place smaller tids first. */
+	return thread__tid(a->thread) < thread__tid(b->thread)
+		? -1
+		: (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0);
 }
 
 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
 {
 	size_t printed = trace__fprintf_threads_header(fp);
-	struct rb_node *nd;
-	int i;
-
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
+	LIST_HEAD(threads);
 
-		if (threads == NULL) {
-			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
-			return 0;
-		}
+	if (machine__thread_list(trace->host, &threads) == 0) {
+		struct thread_list *pos;
 
-		resort_rb__for_each_entry(nd, threads)
-			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
+		list_sort(NULL, &threads, trace_nr_events_cmp);
 
-		resort_rb__delete(threads);
+		list_for_each_entry(pos, &threads, list)
+			printed += trace__fprintf_thread(fp, pos->thread, trace);
 	}
+	thread_list__delete(&threads);
 	return printed;
 }
 
diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c
index ac20c2b9bbc2..398aa53e9e2e 100644
--- a/tools/perf/builtin-version.c
+++ b/tools/perf/builtin-version.c
@@ -73,6 +73,7 @@ static void library_status(void)
 	STATUS(HAVE_LIBCRYPTO_SUPPORT, libcrypto);
 	STATUS(HAVE_LIBUNWIND_SUPPORT, libunwind);
 	STATUS(HAVE_DWARF_SUPPORT, libdw-dwarf-unwind);
+	STATUS(HAVE_LIBCAPSTONE_SUPPORT, libcapstone);
 	STATUS(HAVE_ZLIB_SUPPORT, zlib);
 	STATUS(HAVE_LZMA_SUPPORT, lzma);
 	STATUS(HAVE_AUXTRACE_SUPPORT, get_cpuid);
@@ -82,6 +83,8 @@ static void library_status(void)
 	STATUS(HAVE_LIBPFM, libpfm4);
 	STATUS(HAVE_LIBTRACEEVENT, libtraceevent);
 	STATUS(HAVE_BPF_SKEL, bpf_skeletons);
+	STATUS(HAVE_DWARF_UNWIND_SUPPORT, dwarf-unwind-support);
+	STATUS(HAVE_CSTRACE_SUPPORT, libopencsd);
 }
 
 int cmd_version(int argc, const char **argv)
diff --git a/tools/perf/pmu-events/arch/powerpc/mapfile.csv b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
index 599a588dbeb4..4d5e9138d4cc 100644
--- a/tools/perf/pmu-events/arch/powerpc/mapfile.csv
+++ b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
@@ -15,3 +15,4 @@
 0x0066[[:xdigit:]]{4},1,power8,core
 0x004e[[:xdigit:]]{4},1,power9,core
 0x0080[[:xdigit:]]{4},1,power10,core
+0x0082[[:xdigit:]]{4},1,power10,core
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json
new file mode 100644
index 000000000000..9b4a032186a7
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json
@@ -0,0 +1,68 @@
+[
+  {
+    "ArchStdEvent": "FW_MISALIGNED_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_MISALIGNED_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ILLEGAL_INSN"
+  },
+  {
+    "ArchStdEvent": "FW_SET_TIMER"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json
new file mode 100644
index 000000000000..713a08c1a40f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json
@@ -0,0 +1,127 @@
+[
+	{
+		"EventCode": "0x10",
+		"EventName": "cycle_count",
+		"BriefDescription": "Cycle count"
+	},
+	{
+		"EventCode": "0x20",
+		"EventName": "inst_count",
+		"BriefDescription": "Retired instruction count"
+	},
+	{
+		"EventCode": "0x30",
+		"EventName": "int_load_inst",
+		"BriefDescription": "Integer load instruction count"
+	},
+	{
+		"EventCode": "0x40",
+		"EventName": "int_store_inst",
+		"BriefDescription": "Integer store instruction count"
+	},
+	{
+		"EventCode": "0x50",
+		"EventName": "atomic_inst",
+		"BriefDescription": "Atomic instruction count"
+	},
+	{
+		"EventCode": "0x60",
+		"EventName": "sys_inst",
+		"BriefDescription": "System instruction count"
+	},
+	{
+		"EventCode": "0x70",
+		"EventName": "int_compute_inst",
+		"BriefDescription": "Integer computational instruction count"
+	},
+	{
+		"EventCode": "0x80",
+		"EventName": "condition_br",
+		"BriefDescription": "Conditional branch instruction count"
+	},
+	{
+		"EventCode": "0x90",
+		"EventName": "taken_condition_br",
+		"BriefDescription": "Taken conditional branch instruction count"
+	},
+	{
+		"EventCode": "0xA0",
+		"EventName": "jal_inst",
+		"BriefDescription": "JAL instruction count"
+	},
+	{
+		"EventCode": "0xB0",
+		"EventName": "jalr_inst",
+		"BriefDescription": "JALR instruction count"
+	},
+	{
+		"EventCode": "0xC0",
+		"EventName": "ret_inst",
+		"BriefDescription": "Return instruction count"
+	},
+	{
+		"EventCode": "0xD0",
+		"EventName": "control_trans_inst",
+		"BriefDescription": "Control transfer instruction count"
+	},
+	{
+		"EventCode": "0xE0",
+		"EventName": "ex9_inst",
+		"BriefDescription": "EXEC.IT instruction count"
+	},
+	{
+		"EventCode": "0xF0",
+		"EventName": "int_mul_inst",
+		"BriefDescription": "Integer multiplication instruction count"
+	},
+	{
+		"EventCode": "0x100",
+		"EventName": "int_div_rem_inst",
+		"BriefDescription": "Integer division/remainder instruction count"
+	},
+	{
+		"EventCode": "0x110",
+		"EventName": "float_load_inst",
+		"BriefDescription": "Floating-point load instruction count"
+	},
+	{
+		"EventCode": "0x120",
+		"EventName": "float_store_inst",
+		"BriefDescription": "Floating-point store instruction count"
+	},
+	{
+		"EventCode": "0x130",
+		"EventName": "float_add_sub_inst",
+		"BriefDescription": "Floating-point addition/subtraction instruction count"
+	},
+	{
+		"EventCode": "0x140",
+		"EventName": "float_mul_inst",
+		"BriefDescription": "Floating-point multiplication instruction count"
+	},
+	{
+		"EventCode": "0x150",
+		"EventName": "float_fused_muladd_inst",
+		"BriefDescription": "Floating-point fused multiply-add instruction count"
+	},
+	{
+		"EventCode": "0x160",
+		"EventName": "float_div_sqrt_inst",
+		"BriefDescription": "Floating-point division or square-root instruction count"
+	},
+	{
+		"EventCode": "0x170",
+		"EventName": "other_float_inst",
+		"BriefDescription": "Other floating-point instruction count"
+	},
+	{
+		"EventCode": "0x180",
+		"EventName": "int_mul_add_sub_inst",
+		"BriefDescription": "Integer multiplication and add/sub instruction count"
+	},
+	{
+		"EventCode": "0x190",
+		"EventName": "retired_ops",
+		"BriefDescription": "Retired operation count"
+	}
+]
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json
new file mode 100644
index 000000000000..c7401b526c77
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json
@@ -0,0 +1,57 @@
+[
+	{
+		"EventCode": "0x01",
+		"EventName": "ilm_access",
+		"BriefDescription": "ILM access"
+	},
+	{
+		"EventCode": "0x11",
+		"EventName": "dlm_access",
+		"BriefDescription": "DLM access"
+	},
+	{
+		"EventCode": "0x21",
+		"EventName": "icache_access",
+		"BriefDescription": "ICACHE access"
+	},
+	{
+		"EventCode": "0x31",
+		"EventName": "icache_miss",
+		"BriefDescription": "ICACHE miss"
+	},
+	{
+		"EventCode": "0x41",
+		"EventName": "dcache_access",
+		"BriefDescription": "DCACHE access"
+	},
+	{
+		"EventCode": "0x51",
+		"EventName": "dcache_miss",
+		"BriefDescription": "DCACHE miss"
+	},
+	{
+		"EventCode": "0x61",
+		"EventName": "dcache_load_access",
+		"BriefDescription": "DCACHE load access"
+	},
+	{
+		"EventCode": "0x71",
+		"EventName": "dcache_load_miss",
+		"BriefDescription": "DCACHE load miss"
+	},
+	{
+		"EventCode": "0x81",
+		"EventName": "dcache_store_access",
+		"BriefDescription": "DCACHE store access"
+	},
+	{
+		"EventCode": "0x91",
+		"EventName": "dcache_store_miss",
+		"BriefDescription": "DCACHE store miss"
+	},
+	{
+		"EventCode": "0xA1",
+		"EventName": "dcache_wb",
+		"BriefDescription": "DCACHE writeback"
+	}
+]
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json
new file mode 100644
index 000000000000..a6d378cbaa74
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json
@@ -0,0 +1,77 @@
+[
+	{
+		"EventCode": "0xB1",
+		"EventName": "cycle_wait_icache_fill",
+		"BriefDescription": "Cycles waiting for ICACHE fill data"
+	},
+	{
+		"EventCode": "0xC1",
+		"EventName": "cycle_wait_dcache_fill",
+		"BriefDescription": "Cycles waiting for DCACHE fill data"
+	},
+	{
+		"EventCode": "0xD1",
+		"EventName": "uncached_ifetch_from_bus",
+		"BriefDescription": "Uncached ifetch data access from bus"
+	},
+	{
+		"EventCode": "0xE1",
+		"EventName": "uncached_load_from_bus",
+		"BriefDescription": "Uncached load data access from bus"
+	},
+	{
+		"EventCode": "0xF1",
+		"EventName": "cycle_wait_uncached_ifetch",
+		"BriefDescription": "Cycles waiting for uncached ifetch data from bus"
+	},
+	{
+		"EventCode": "0x101",
+		"EventName": "cycle_wait_uncached_load",
+		"BriefDescription": "Cycles waiting for uncached load data from bus"
+	},
+	{
+		"EventCode": "0x111",
+		"EventName": "main_itlb_access",
+		"BriefDescription": "Main ITLB access"
+	},
+	{
+		"EventCode": "0x121",
+		"EventName": "main_itlb_miss",
+		"BriefDescription": "Main ITLB miss"
+	},
+	{
+		"EventCode": "0x131",
+		"EventName": "main_dtlb_access",
+		"BriefDescription": "Main DTLB access"
+	},
+	{
+		"EventCode": "0x141",
+		"EventName": "main_dtlb_miss",
+		"BriefDescription": "Main DTLB miss"
+	},
+	{
+		"EventCode": "0x151",
+		"EventName": "cycle_wait_itlb_fill",
+		"BriefDescription": "Cycles waiting for Main ITLB fill data"
+	},
+	{
+		"EventCode": "0x161",
+		"EventName": "pipe_stall_cycle_dtlb_miss",
+		"BriefDescription": "Pipeline stall cycles caused by Main DTLB miss"
+	},
+	{
+		"EventCode": "0x02",
+		"EventName": "mispredict_condition_br",
+		"BriefDescription": "Misprediction of conditional branches"
+	},
+	{
+		"EventCode": "0x12",
+		"EventName": "mispredict_take_condition_br",
+		"BriefDescription": "Misprediction of taken conditional branches"
+	},
+	{
+		"EventCode": "0x22",
+		"EventName": "mispredict_target_ret_inst",
+		"BriefDescription": "Misprediction of targets of Return instructions"
+	}
+]
diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv
index cfc449b19810..3d3a809a5446 100644
--- a/tools/perf/pmu-events/arch/riscv/mapfile.csv
+++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv
@@ -17,3 +17,4 @@
 0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core
 0x5b7-0x0-0x0,v1,thead/c900-legacy,core
 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core
+0x31e-0x8000000000008a45-0x[[:xdigit:]]+,v1,andes/ax45,core
diff --git a/tools/perf/pmu-events/arch/s390/cf_z16/extended.json b/tools/perf/pmu-events/arch/s390/cf_z16/extended.json
index c2b10ec1c6e0..02cce3a629cb 100644
--- a/tools/perf/pmu-events/arch/s390/cf_z16/extended.json
+++ b/tools/perf/pmu-events/arch/s390/cf_z16/extended.json
@@ -94,77 +94,77 @@
 		"Unit": "CPU-M-CF",
 		"EventCode": "145",
 		"EventName": "DCW_REQ",
-		"BriefDescription": "Directory Write Level 1 Data Cache from Cache",
+		"BriefDescription": "Directory Write Level 1 Data Cache from L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from the requestors Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "146",
 		"EventName": "DCW_REQ_IV",
-		"BriefDescription": "Directory Write Level 1 Data Cache from Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Data Cache from L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from the requestors Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "147",
 		"EventName": "DCW_REQ_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Data Cache from Cache with Chip HP Hit",
+		"BriefDescription": "Directory Write Level 1 Data Cache from L2-Cache with Chip HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from the requestors Level-2 cache after using chip level horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "148",
 		"EventName": "DCW_REQ_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Data Cache from Cache with Drawer HP Hit",
+		"BriefDescription": "Directory Write Level 1 Data Cache from L2-Cache with Drawer HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from the requestors Level-2 cache after using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "149",
 		"EventName": "DCW_ON_CHIP",
-		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip Cache",
+		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Chip Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "150",
 		"EventName": "DCW_ON_CHIP_IV",
-		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Chip Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "151",
 		"EventName": "DCW_ON_CHIP_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip Cache with Chip HP Hit",
+		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip L2-Cache with Chip HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Chip Level-2 cache after using chip level horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "152",
 		"EventName": "DCW_ON_CHIP_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip Cache with Drawer HP Hit",
+		"BriefDescription": "Directory Write Level 1 Data Cache from On-Chip L2-Cache with Drawer HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Chip Level-2 cache using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "153",
 		"EventName": "DCW_ON_MODULE",
-		"BriefDescription": "Directory Write Level 1 Data Cache from On-Module Cache",
+		"BriefDescription": "Directory Write Level 1 Data Cache from On-Module L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Module Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "154",
 		"EventName": "DCW_ON_DRAWER",
-		"BriefDescription": "Directory Write Level 1 Data Cache from On-Drawer Cache",
+		"BriefDescription": "Directory Write Level 1 Data Cache from On-Drawer L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an On-Drawer Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "155",
 		"EventName": "DCW_OFF_DRAWER",
-		"BriefDescription": "Directory Write Level 1 Data Cache from Off-Drawer Cache",
+		"BriefDescription": "Directory Write Level 1 Data Cache from Off-Drawer L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Data cache directory where the returned cache line was sourced from an Off-Drawer Level-2 cache."
 	},
 	{
@@ -199,140 +199,140 @@
 		"Unit": "CPU-M-CF",
 		"EventCode": "160",
 		"EventName": "IDCW_ON_MODULE_IV",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Module Memory Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Module Memory L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 Instruction cache directory where the returned cache line was sourced from an On-Module Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "161",
 		"EventName": "IDCW_ON_MODULE_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Module Memory Cache with Chip Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Module Memory L2-Cache with Chip Hit",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 Instruction cache directory where the returned cache line was sourced from an On-Module Level-2 cache using chip horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "162",
 		"EventName": "IDCW_ON_MODULE_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Module Memory Cache with Drawer Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Module Memory L2-Cache with Drawer Hit",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 Instruction cache directory where the returned cache line was sourced from an On-Module Level-2 cache using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "163",
 		"EventName": "IDCW_ON_DRAWER_IV",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Drawer Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Drawer L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 Instruction cache directory where the returned cache line was sourced from an On-Drawer Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "164",
 		"EventName": "IDCW_ON_DRAWER_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Drawer Cache with Chip Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Drawer L2-Cache with Chip Hit",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 instruction cache directory where the returned cache line was sourced from an On-Drawer Level-2 cache using chip level horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "165",
 		"EventName": "IDCW_ON_DRAWER_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Drawer Cache with Drawer Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from On-Drawer L2-Cache with Drawer Hit",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 instruction cache directory where the returned cache line was sourced from an On-Drawer Level-2 cache using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "166",
 		"EventName": "IDCW_OFF_DRAWER_IV",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from Off-Drawer Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from Off-Drawer L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 instruction cache directory where the returned cache line was sourced from an Off-Drawer Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "167",
 		"EventName": "IDCW_OFF_DRAWER_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from Off-Drawer Cache with Chip Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from Off-Drawer L2-Cache with Chip Hit",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 instruction cache directory where the returned cache line was sourced from an Off-Drawer Level-2 cache using chip level horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "168",
 		"EventName": "IDCW_OFF_DRAWER_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from Off-Drawer Cache with Drawer Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction and Data Cache from Off-Drawer L2-Cache with Drawer Hit",
 		"PublicDescription": "A directory write to the Level-1 Data or Level-1 Instruction cache directory where the returned cache line was sourced from an Off-Drawer Level-2 cache using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "169",
 		"EventName": "ICW_REQ",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from Cache",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced the requestors Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "170",
 		"EventName": "ICW_REQ_IV",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from the requestors Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "171",
 		"EventName": "ICW_REQ_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from Cache with Chip HP Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from L2-Cache with Chip HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from the requestors Level-2 cache using chip level horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "172",
 		"EventName": "ICW_REQ_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from Cache with Drawer HP Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from L2-Cache with Drawer HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from the requestors Level-2 cache using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "173",
 		"EventName": "ICW_ON_CHIP",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip Cache",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an On-Chip Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "174",
 		"EventName": "ICW_ON_CHIP_IV",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip Cache with Intervention",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip L2-Cache with Intervention",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced an On-Chip Level-2 cache with intervention."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "175",
 		"EventName": "ICW_ON_CHIP_CHIP_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip Cache with Chip HP Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip L2-Cache with Chip HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an On-Chip Level-2 cache using chip level horizontal persistence, Chip-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "176",
 		"EventName": "ICW_ON_CHIP_DRAWER_HIT",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip Cache with Drawer HP Hit",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Chip L2-Cache with Drawer HP Hit",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an On-Chip level 2 cache using drawer level horizontal persistence, Drawer-HP hit."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "177",
 		"EventName": "ICW_ON_MODULE",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Module Cache",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Module L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced from an On-Module Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "178",
 		"EventName": "ICW_ON_DRAWER",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Drawer Cache",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from On-Drawer L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced an On-Drawer Level-2 cache."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "179",
 		"EventName": "ICW_OFF_DRAWER",
-		"BriefDescription": "Directory Write Level 1 Instruction Cache from Off-Drawer Cache",
+		"BriefDescription": "Directory Write Level 1 Instruction Cache from Off-Drawer L2-Cache",
 		"PublicDescription": "A directory write to the Level-1 Instruction cache directory where the returned cache line was sourced an Off-Drawer Level-2 cache."
 	},
 	{
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
index bbfa3883e533..b72c0e2cb946 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
@@ -811,13 +811,13 @@
         "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * cpu_core@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots",
+        "MetricExpr": "78 * cpu_core@ASSISTS.ANY@ / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -880,6 +880,24 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.1 power-performance optimized state (Faster wakeup time; Smaller power savings).",
+        "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C01@ / tma_info_thread_clks",
+        "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group",
+        "MetricName": "tma_c01_wait",
+        "MetricThreshold": "tma_c01_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "ScaleUnit": "100%",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.2 power-performance optimized state (Slower wakeup time; Larger power savings).",
+        "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C02@ / tma_info_thread_clks",
+        "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group",
+        "MetricName": "tma_c02_wait",
+        "MetricThreshold": "tma_c02_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "ScaleUnit": "100%",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
         "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
@@ -901,7 +919,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
-        "MetricExpr": "(25 * tma_info_system_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_average_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
+        "MetricExpr": "(25 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_core_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -922,7 +940,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
-        "MetricExpr": "24 * tma_info_system_average_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
+        "MetricExpr": "24 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -935,7 +953,7 @@
         "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
@@ -965,7 +983,7 @@
         "MetricExpr": "(cpu_core@IDQ.DSB_CYCLES_ANY@ - cpu_core@IDQ.DSB_CYCLES_OK@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
@@ -986,7 +1004,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -996,13 +1014,13 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
-        "MetricExpr": "28 * tma_info_system_average_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks",
+        "MetricExpr": "28 * tma_info_system_core_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -1016,7 +1034,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -1025,7 +1043,7 @@
         "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%",
@@ -1127,10 +1145,10 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
         "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_fused_instructions",
         "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -1141,14 +1159,14 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .). Sample with: UOPS_RETIRED.HEAVY",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
         "MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -1157,7 +1175,7 @@
     },
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers",
@@ -1181,7 +1199,7 @@
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "cpu_core@BR_MISP_RETIRED.INDIRECT_CALL\\,umask\\=0x80@ / BR_MISP_RETIRED.INDIRECT",
+        "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.INDIRECT",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3",
@@ -1204,6 +1222,13 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "cpu_core@INT_MISC.CLEARS_COUNT@ / (cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ + cpu_core@MACHINE_CLEARS.COUNT@)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
         "MetricGroup": "Cor;SMT",
@@ -1230,61 +1255,94 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + cpu_core@BR_INST_RETIRED.NEAR_CALL@) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
         "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.COND@ + 3 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + (cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + cpu_core@BR_INST_RETIRED.NEAR_CALL@) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code",
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full",
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store",
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency",
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
@@ -1292,6 +1350,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -1328,7 +1395,7 @@
     },
     {
         "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
-        "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@",
+        "MetricExpr": "(cpu_core@CPU_CLK_UNHALTED.DISTRIBUTED@ if #SMT_on else tma_info_thread_clks)",
         "MetricGroup": "SMT",
         "MetricName": "tma_info_core_core_clks",
         "Unit": "cpu_core"
@@ -1341,8 +1408,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks",
+        "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc",
         "Unit": "cpu_core"
@@ -1356,8 +1430,8 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp",
         "Unit": "cpu_core"
@@ -1429,6 +1503,14 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection",
+        "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES\\,cmask\\=1\\,edge@",
+        "MetricGroup": "Fed",
+        "MetricName": "tma_info_frontend_unknown_branch_cost",
+        "PublicDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection. See Unknown_Branches node.",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Branch instructions per taken branch.",
         "MetricExpr": "cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ / BR_INST_RETIRED.NEAR_TAKEN",
         "MetricGroup": "Branches;Fed;PGO",
@@ -1449,7 +1531,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW.",
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW.",
         "Unit": "cpu_core"
     },
     {
@@ -1458,7 +1540,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.",
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.",
         "Unit": "cpu_core"
     },
     {
@@ -1467,7 +1549,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.",
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.",
         "Unit": "cpu_core"
     },
     {
@@ -1476,7 +1558,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.",
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.",
         "Unit": "cpu_core"
     },
     {
@@ -1485,7 +1567,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.",
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting.",
         "Unit": "cpu_core"
     },
     {
@@ -1506,7 +1588,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)",
+        "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10",
@@ -1521,6 +1603,13 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / CPU_CLK_UNHALTED.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -1547,164 +1636,178 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw",
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw",
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw",
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw",
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki",
+        "BriefDescription": "",
+        "MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / MEM_LOAD_COMPLETED.L1_MISS_ANY",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency",
+        "BriefDescription": "",
+        "MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
-        "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD@ / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD@ / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp",
+        "MetricName": "tma_info_memory_latency_data_l2_mlp",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency",
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp",
+        "MetricName": "tma_info_memory_latency_load_l2_mlp",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average Latency for L3 cache miss demand Loads",
         "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l3_miss_latency",
+        "MetricName": "tma_info_memory_latency_load_l3_miss_latency",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t",
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / MEM_LOAD_COMPLETED.L1_MISS_ANY",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t",
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "1e3 * cpu_core@SQ_MISC.BUS_LOCK@ / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_bus_lock_pki",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t",
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * cpu_core@MEM_LOAD_MISC_RETIRED.UC@ / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t",
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
         "Unit": "cpu_core"
     },
     {
@@ -1737,16 +1840,16 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Instructions per a microcode Assist invocation",
-        "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY\\,umask\\=0x1B@",
-        "MetricGroup": "Pipeline;Ret;Retire",
+        "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / ASSISTS.ANY",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
         "MetricName": "tma_info_pipeline_ipassist",
         "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
         "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)",
@@ -1762,39 +1865,54 @@
     {
         "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions",
         "MetricExpr": "cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
-        "MetricGroup": "Pipeline;Ret",
+        "MetricGroup": "MicroSeq;Pipeline;Ret",
         "MetricName": "tma_info_pipeline_strings_cycles",
         "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Fraction of cycles the processor is waiting yet unhalted; covering legacy PAUSE instruction, as well as C0.1 / C0.2 power-performance optimized states",
+        "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.C0_WAIT@ / tma_info_thread_clks",
+        "MetricGroup": "C0Wait",
+        "MetricName": "tma_info_system_c0_wait",
+        "MetricThreshold": "tma_info_system_c0_wait > 0.05",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency",
+        "MetricName": "tma_info_system_core_frequency",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full",
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time",
+        "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.",
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width",
         "Unit": "cpu_core"
     },
     {
@@ -1838,14 +1956,6 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency",
-        "Unit": "cpu_core"
-    },
-    {
         "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
         "MetricExpr": "(1 - cpu_core@CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE@ / cpu_core@CPU_CLK_UNHALTED.REF_DISTRIBUTED@ if #SMT_on else 0)",
         "MetricGroup": "SMT",
@@ -1927,7 +2037,7 @@
     },
     {
         "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)",
-        "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles",
+        "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_int_operations",
         "MetricThreshold": "tma_int_operations > 0.1 & tma_light_operations > 0.6",
@@ -1946,19 +2056,19 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired",
+        "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired",
         "MetricExpr": "(cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P",
         "MetricName": "tma_int_vector_256b",
         "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "cpu_core@ICACHE_TAG.STALLS@ / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1968,7 +2078,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1978,7 +2088,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1988,7 +2098,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
@@ -1996,12 +2106,12 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "9 * tma_info_system_average_frequency * cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "9 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2022,7 +2132,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2069,7 +2179,7 @@
         "MetricExpr": "(cpu_core@LSD.CYCLES_ACTIVE@ - cpu_core@LSD.CYCLES_OK@) / tma_info_core_core_clks / 2",
         "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_lsd",
-        "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)",
+        "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit.  LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
@@ -2086,22 +2196,22 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2120,9 +2230,9 @@
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_memory_fence",
-        "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2141,7 +2251,7 @@
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2160,35 +2270,35 @@
         "MetricExpr": "(cpu_core@IDQ.MITE_CYCLES_ANY@ - cpu_core@IDQ.MITE_CYCLES_OK@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "160 * cpu_core@ASSISTS.SSE_AVX_MIX@ / tma_info_thread_clks",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
-        "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks",
+        "MetricExpr": "3 * cpu_core@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@) / tma_info_thread_clks",
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
         "MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_non_fused_branches",
         "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
@@ -2198,16 +2308,16 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -2216,6 +2326,24 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / (cpu_core@INT_MISC.CLEARS_COUNT@ - cpu_core@MACHINE_CLEARS.COUNT@)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - cpu_core@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_core@MACHINE_CLEARS.COUNT@), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults",
         "MetricExpr": "99 * cpu_core@ASSISTS.PAGE_FAULT@ / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group",
@@ -2246,18 +2374,18 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "cpu_core@UOPS_DISPATCHED.PORT_6@ / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -2267,7 +2395,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks",
+        "MetricExpr": "(cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu_core@RS.EMPTY\\,umask\\=1@) / tma_info_thread_clks * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -2302,7 +2430,7 @@
         "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
@@ -2321,20 +2449,21 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
-        "MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks + tma_c02_wait",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.",
-        "MetricExpr": "cpu_core@INT_VEC_RETIRED.SHUFFLES@ / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group",
-        "MetricName": "tma_shuffles",
-        "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)",
+        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer)",
+        "MetricExpr": "tma_light_operations * cpu_core@INT_VEC_RETIRED.SHUFFLES@ / (tma_retiring * tma_info_thread_slots)",
+        "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
+        "MetricName": "tma_shuffles_256b",
+        "MetricThreshold": "tma_shuffles_256b > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer). Shuffles may incur slow cross \"vector lane\" data transfers.",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2342,9 +2471,9 @@
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
@@ -2375,7 +2504,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
@@ -2450,10 +2579,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
         "ScaleUnit": "100%",
         "Unit": "cpu_core"
     },
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json
index c8ba96c4a7f8..cd291943dc08 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json
@@ -26,7 +26,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0",
+        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_0",
         "SampleAfterValue": "2000003",
@@ -34,7 +34,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1",
+        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_1",
         "SampleAfterValue": "2000003",
@@ -42,7 +42,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5",
+        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_5",
         "SampleAfterValue": "2000003",
@@ -50,6 +50,30 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]",
+        "EventCode": "0xb3",
+        "EventName": "FP_ARITH_DISPATCHED.V0",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]",
+        "EventCode": "0xb3",
+        "EventName": "FP_ARITH_DISPATCHED.V1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]",
+        "EventCode": "0xb3",
+        "EventName": "FP_ARITH_DISPATCHED.V2",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json
index 516eb0f93f02..7a03835f262c 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/metricgroups.json
@@ -2,10 +2,11 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "C0Wait": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -26,7 +27,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -69,6 +72,7 @@
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
     "tma_base_group": "Metrics contributing to tma_base category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -82,9 +86,9 @@
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_int_operations_group": "Metrics contributing to tma_int_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -111,6 +115,7 @@
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
     "tma_nuke_group": "Metrics contributing to tma_nuke category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/other.json b/tools/perf/pmu-events/arch/x86/alderlake/other.json
index 1db73e020215..5250a17d9cae 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/other.json
@@ -40,6 +40,16 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]",
+        "Deprecated": "1",
+        "EventCode": "0xe4",
+        "EventName": "LBR_INSERTS.ANY",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.",
         "EventCode": "0xB7",
         "EventName": "OCR.COREWB_M.ANY_RESPONSE",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
index f9876bef16da..df6032e816d4 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
@@ -799,6 +799,7 @@
         "BriefDescription": "INST_RETIRED.MACRO_FUSED",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.MACRO_FUSED",
+        "PEBS": "1",
         "SampleAfterValue": "2000003",
         "UMask": "0x10",
         "Unit": "cpu_core"
@@ -807,6 +808,7 @@
         "BriefDescription": "Retired NOP instructions.",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.NOP",
+        "PEBS": "1",
         "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions",
         "SampleAfterValue": "2000003",
         "UMask": "0x2",
@@ -825,6 +827,7 @@
         "BriefDescription": "Iterations of Repeat string retired instructions.",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.REP_ITERATION",
+        "PEBS": "1",
         "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.",
         "SampleAfterValue": "2000003",
         "UMask": "0x8",
@@ -1107,6 +1110,16 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]",
+        "EventCode": "0xe4",
+        "EventName": "MISC_RETIRED.LBR_INSERTS",
+        "PEBS": "1",
+        "PublicDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. This event is PDIR on GP0 and NPEBS on all other GPs [This event is alias to LBR_INSERTS.ANY]",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Increments whenever there is an update to the LBR array.",
         "EventCode": "0xcc",
         "EventName": "MISC_RETIRED.LBR_INSERTS",
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/other.json b/tools/perf/pmu-events/arch/x86/alderlaken/other.json
index 6336de61f628..ccc892149dbe 100644
--- a/tools/perf/pmu-events/arch/x86/alderlaken/other.json
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/other.json
@@ -1,5 +1,14 @@
 [
     {
+        "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]",
+        "Deprecated": "1",
+        "EventCode": "0xe4",
+        "EventName": "LBR_INSERTS.ANY",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.",
         "EventCode": "0xB7",
         "EventName": "OCR.COREWB_M.ANY_RESPONSE",
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
index 3153bab527a9..846bcdafca6d 100644
--- a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
@@ -345,6 +345,15 @@
         "UMask": "0x1"
     },
     {
+        "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]",
+        "EventCode": "0xe4",
+        "EventName": "MISC_RETIRED.LBR_INSERTS",
+        "PEBS": "1",
+        "PublicDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. This event is PDIR on GP0 and NPEBS on all other GPs [This event is alias to LBR_INSERTS.ANY]",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires.",
         "EventCode": "0x75",
         "EventName": "SERIALIZATION.NON_C01_MS_SCB",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen4/cache.json b/tools/perf/pmu-events/arch/x86/amdzen4/cache.json
index ecbe9660b2b3..e6d710cf3ce2 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen4/cache.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen4/cache.json
@@ -676,6 +676,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency when data is sourced from DRAM in the same NUMA node.",
     "UMask": "0x01",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -683,6 +687,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency when data is sourced from DRAM in a different NUMA node.",
     "UMask": "0x02",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -690,6 +698,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency when data is sourced from another CCX's cache when the address was in the same NUMA node.",
     "UMask": "0x04",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -697,6 +709,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency when data is sourced from another CCX's cache when the address was in a different NUMA node.",
     "UMask": "0x08",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -704,6 +720,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency when data is sourced from extension memory (CXL) in the same NUMA node.",
     "UMask": "0x10",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -711,6 +731,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency when data is sourced from extension memory (CXL) in a different NUMA node.",
     "UMask": "0x20",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -718,6 +742,10 @@
     "EventCode": "0xac",
     "BriefDescription": "Average sampled latency from all data sources.",
     "UMask": "0x3f",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -725,6 +753,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from DRAM in the same NUMA node.",
     "UMask": "0x01",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -732,6 +764,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from DRAM in a different NUMA node.",
     "UMask": "0x02",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -739,6 +775,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from another CCX's cache when the address was in the same NUMA node.",
     "UMask": "0x04",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -746,6 +786,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from another CCX's cache when the address was in a different NUMA node.",
     "UMask": "0x08",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -753,6 +797,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from extension memory (CXL) in the same NUMA node.",
     "UMask": "0x10",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -760,6 +808,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from extension memory (CXL) in a different NUMA node.",
     "UMask": "0x20",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   },
   {
@@ -767,6 +819,10 @@
     "EventCode": "0xad",
     "BriefDescription": "L3 cache fill requests sourced from all data sources.",
     "UMask": "0x3f",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
     "Unit": "L3PMC"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
index 55a10b0bf36f..c20833fb1f58 100644
--- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
@@ -84,12 +84,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -211,7 +211,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -266,7 +266,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -343,27 +343,20 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
         "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
-        "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
-        "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
-        "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers"
-    },
-    {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -389,7 +382,7 @@
     },
     {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -401,8 +394,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -439,7 +432,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -447,7 +440,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -455,7 +448,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -463,7 +456,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -471,7 +464,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -489,7 +482,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -518,120 +511,114 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
-    },
-    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
-    },
-    {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
-    },
-    {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -641,8 +628,8 @@
         "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
@@ -653,30 +640,36 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -699,19 +692,6 @@
         "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
     },
     {
-        "BriefDescription": "Average number of parallel requests to external memory",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_parallel_requests",
-        "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
-    },
-    {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
         "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
         "MetricGroup": "SMT",
@@ -777,7 +757,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -786,7 +766,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -795,7 +775,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
@@ -805,20 +785,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -837,7 +817,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -872,21 +852,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -923,7 +903,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
         "ScaleUnit": "100%"
     },
@@ -991,12 +971,12 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1050,7 +1030,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -1130,10 +1110,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/memory.json b/tools/perf/pmu-events/arch/x86/broadwell/memory.json
index ac7cdb831960..b01ed47072bc 100644
--- a/tools/perf/pmu-events/arch/x86/broadwell/memory.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/memory.json
@@ -2005,7 +2005,7 @@
         "BriefDescription": "Number of times RTM abort was triggered",
         "EventCode": "0xc9",
         "EventName": "RTM_RETIRED.ABORTED",
-        "PEBS": "1",
+        "PEBS": "2",
         "PublicDescription": "Number of times RTM abort was triggered .",
         "SampleAfterValue": "2000003",
         "UMask": "0x4"
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
index e1f55fcfa0d0..826357787201 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
@@ -84,12 +84,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -211,7 +211,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -257,7 +257,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -334,28 +334,21 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
         "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
-        "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
-        "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
-        "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers"
-    },
-    {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -381,7 +374,7 @@
     },
     {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -393,8 +386,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -431,7 +424,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -439,7 +432,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -447,7 +440,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -455,7 +448,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -463,7 +456,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -481,7 +474,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -510,120 +503,114 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
-    },
-    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
-    },
-    {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
-    },
-    {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -633,8 +620,8 @@
         "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
@@ -645,30 +632,36 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -756,7 +749,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -765,7 +758,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -774,7 +767,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -784,20 +777,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -816,7 +809,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -851,21 +844,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -902,7 +895,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
@@ -968,12 +961,12 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1026,7 +1019,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%"
     },
@@ -1108,10 +1101,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
index 83d20130c217..320aaab53a0b 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
@@ -394,6 +394,7 @@
         "BriefDescription": "Number of cores in C-State; C0 and C1",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
+        "Filter": "occ_sel=1",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -402,6 +403,7 @@
         "BriefDescription": "Number of cores in C-State; C3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
+        "Filter": "occ_sel=2",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -410,6 +412,7 @@
         "BriefDescription": "Number of cores in C-State; C6 and C7",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
+        "Filter": "occ_sel=3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
index b319e4edc238..f2d378c9d68f 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
@@ -286,12 +286,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -413,7 +413,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -468,7 +468,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -545,27 +545,20 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
         "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
-        "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
-        "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
-        "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_mispredicts_resteers"
-    },
-    {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -591,7 +584,7 @@
     },
     {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -603,8 +596,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -641,7 +634,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -649,7 +642,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -657,7 +650,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -665,7 +658,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -673,7 +666,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -691,7 +684,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -720,120 +713,163 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "tma_info_memory_latency_data_l2_mlp",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_data_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricExpr": "tma_info_memory_load_l2_miss_latency",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
+        "MetricExpr": "tma_info_memory_load_l2_mlp",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricExpr": "tma_info_memory_tlb_page_walks_utilization",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_page_walks_utilization",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -843,8 +879,8 @@
         "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
@@ -855,30 +891,36 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -933,6 +975,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -980,7 +1028,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -989,7 +1037,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -998,7 +1046,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
@@ -1008,20 +1056,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_MISS / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1040,7 +1088,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1055,11 +1103,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1085,21 +1132,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1136,7 +1183,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
         "ScaleUnit": "100%"
     },
@@ -1204,12 +1251,12 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1263,7 +1310,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -1278,11 +1325,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1363,10 +1409,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json b/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
index 83d20130c217..320aaab53a0b 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
@@ -394,6 +394,7 @@
         "BriefDescription": "Number of cores in C-State; C0 and C1",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
+        "Filter": "occ_sel=1",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -402,6 +403,7 @@
         "BriefDescription": "Number of cores in C-State; C3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
+        "Filter": "occ_sel=2",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -410,6 +412,7 @@
         "BriefDescription": "Number of cores in C-State; C6 and C7",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
+        "Filter": "occ_sel=3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
index 8bc6c0707856..7f88b156f73b 100644
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
@@ -210,6 +210,12 @@
         "ScaleUnit": "1MB/s"
     },
     {
+        "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.",
+        "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time",
+        "MetricName": "llc_miss_remote_memory_bandwidth_write",
+        "ScaleUnit": "1MB/s"
+    },
+    {
         "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions",
         "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
         "MetricName": "loads_per_instr",
@@ -316,12 +322,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
+        "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -389,7 +395,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + 44 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -410,7 +416,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -422,7 +428,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -447,10 +453,10 @@
     },
     {
         "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
-        "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
+        "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -470,7 +476,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -479,13 +485,13 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(110 * tma_info_system_average_frequency * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks",
+        "MetricExpr": "(110 * tma_info_system_core_frequency * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_core_frequency * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -499,7 +505,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -507,7 +513,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -542,6 +548,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -600,10 +615,10 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
         "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_fused_instructions",
         "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}",
         "ScaleUnit": "100%"
     },
     {
@@ -613,13 +628,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
         "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -627,26 +642,53 @@
     },
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
     },
     {
         "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
-        "MetricExpr": "tma_info_core_ipmispredict",
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;BadSpec;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmispredict",
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
+        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
+        "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
+        "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_core_bound_likely",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.",
+        "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))",
+        "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_dsb_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.",
+        "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))",
+        "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_ic_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -672,67 +714,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -753,7 +830,7 @@
     {
         "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
         "MetricName": "tma_info_branches_jump"
     },
@@ -770,9 +847,15 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -784,19 +867,12 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
     {
-        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
-        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
-        "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group",
-        "MetricName": "tma_info_core_ipmispredict",
-        "MetricgroupNoGroup": "TopdownL1"
-    },
-    {
         "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
         "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
         "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
@@ -867,7 +943,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -875,7 +951,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -883,7 +959,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -891,7 +967,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -899,7 +975,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -907,7 +983,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -926,7 +1002,7 @@
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -939,6 +1015,12 @@
         "MetricThreshold": "tma_info_inst_mix_ipload < 3"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / ROB_MISC_EVENTS.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -961,16 +1043,23 @@
         "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp"
     },
     {
+        "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki",
+        "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_code_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
@@ -986,124 +1075,214 @@
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "tma_info_memory_latency_data_l2_mlp",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_data_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
+        "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)",
+        "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_silent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_access_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "MetricGroup": "Memory_BW;Offcore",
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "tma_info_memory_load_l2_miss_latency",
+        "MetricGroup": "Memory_Lat;Offcore",
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
-        "BriefDescription": "Average Parallel L2 cache miss data reads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "tma_info_memory_load_l2_mlp",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
-        "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
-        "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "tma_info_memory_uc_load_pki",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+    },
+    {
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricExpr": "tma_info_memory_tlb_page_walks_utilization",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_page_walks_utilization",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_store_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -1132,55 +1311,78 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_uc_load_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]",
         "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_read_bw"
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_read_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU"
     },
     {
         "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]",
         "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_write_bw"
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_write_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -1205,7 +1407,7 @@
     {
         "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]",
         "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_dram_read_latency",
         "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1219,7 +1421,7 @@
     {
         "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]",
         "MetricExpr": "(1e9 * (UNC_M_PMM_RPQ_OCCUPANCY.ALL / UNC_M_PMM_RPQ_INSERTS) / imc_0@event\\=0x0@ if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_pmm_read_latency",
         "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1233,13 +1435,13 @@
     {
         "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]",
         "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryBW;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryBW;Server;SoC",
         "MetricName": "tma_info_system_pmm_read_bw"
     },
     {
         "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]",
         "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryBW;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryBW;Server;SoC",
         "MetricName": "tma_info_system_pmm_write_bw"
     },
     {
@@ -1284,6 +1486,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -1330,8 +1538,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1340,7 +1548,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1350,7 +1558,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1359,24 +1567,24 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "17 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1390,7 +1598,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1421,10 +1629,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "59.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1449,21 +1657,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1491,7 +1699,7 @@
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1508,17 +1716,17 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1527,13 +1735,13 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
         "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_non_fused_branches",
         "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
@@ -1542,15 +1750,15 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1558,9 +1766,25 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)",
+        "MetricExpr": "(((1 - (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)",
         "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_pmm_bound",
         "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -1622,12 +1846,12 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1641,7 +1865,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1650,7 +1874,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks",
+        "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1680,13 +1904,13 @@
         "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
-        "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group",
         "MetricName": "tma_remote_cache",
         "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -1695,10 +1919,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "127 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1715,18 +1939,18 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricExpr": "40 * ROB_MISC_EVENTS.PAUSE_INST / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
         "ScaleUnit": "100%"
     },
@@ -1755,7 +1979,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1813,10 +2037,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json
index bc6a9a4d27a9..904d299c95a3 100644
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -26,7 +26,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -64,8 +66,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -78,9 +82,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -100,10 +104,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json
index c6254af7a468..ceef46046488 100644
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/uncore-power.json
@@ -144,6 +144,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x40",
         "Unit": "PCU"
     },
     {
@@ -152,6 +153,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x80",
         "Unit": "PCU"
     },
     {
@@ -160,6 +162,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0xc0",
         "Unit": "PCU"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json
index bf5a511b99d1..86a8f3b7fe1d 100644
--- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json
@@ -3565,6 +3565,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_EXP_LOCAL",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20c8968201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Inserts : DRd_Prefs issued by iA Cores targeting DDR Mem that Missed the LLC",
         "EventCode": "0x35",
         "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_DDR",
@@ -3716,6 +3725,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20ccd68201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Inserts; LLCPrefRFO misses from local IA",
         "EventCode": "0x35",
         "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO",
@@ -3742,6 +3760,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20c8868201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
         "EventCode": "0x35",
         "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_DDR",
@@ -3892,6 +3919,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_EXP_LOCAL",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20ccc68201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Inserts; RFO prefetch misses from local IA",
         "EventCode": "0x35",
         "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_LOCAL",
@@ -4429,6 +4465,46 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "TOR Inserts for INVXTOM opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_INVXTOM_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e87e8240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Inserts for RDCODE opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDCODE_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e80e8240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Inserts for RDCUR opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDCUR_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e8068240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Inserts for RDDATA opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDDATA_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e8168240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Inserts for RDINVOWN_OPT opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.RRQ_MISS_RDINVOWN_OPT_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e8268240",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Inserts; All Snoops from Remote",
         "EventCode": "0x35",
         "EventName": "UNC_CHA_TOR_INSERTS.SNPS_FROM_REM",
@@ -5012,6 +5088,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_CXL_EXP_LOCAL",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20c8968201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Occupancy : DRd_Prefs issued by iA Cores targeting DDR Mem that Missed the LLC",
         "EventCode": "0x36",
         "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_DDR",
@@ -5163,6 +5248,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20ccd68201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Occupancy; LLCPrefRFO misses from local IA",
         "EventCode": "0x36",
         "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO",
@@ -5189,6 +5283,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20c8868201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
         "EventCode": "0x36",
         "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_DDR",
@@ -5339,6 +5442,15 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_CXL_EXP_LOCAL",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20ccc68201",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Occupancy; RFO prefetch misses from local IA",
         "EventCode": "0x36",
         "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_LOCAL",
@@ -5842,6 +5954,46 @@
         "Unit": "CHA"
     },
     {
+        "BriefDescription": "TOR Occupancy for INVXTOM opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_INVXTOM_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e87e8240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RDCODE opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDCODE_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e80e8240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RDCUR opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDCUR_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e8068240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RDDATA opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDDATA_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e8168240",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RDINVOWN_OPT opcodes received from a remote socket which miss the L3 and target memory in a CXL type 3 memory expander local to this socket.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.RRQ_MISS_RDINVOWN_OPT_CXL_EXP_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x20e8268240",
+        "Unit": "CHA"
+    },
+    {
         "BriefDescription": "TOR Occupancy; All Snoops from Remote",
         "EventCode": "0x36",
         "EventName": "UNC_CHA_TOR_OCCUPANCY.SNPS_FROM_REM",
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/cache.json b/tools/perf/pmu-events/arch/x86/grandridge/cache.json
index 7f0dc65a55d2..f937ba0e50e1 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/cache.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/cache.json
@@ -16,6 +16,148 @@
         "UMask": "0x4f"
     },
     {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an instruction cache or TLB miss.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.ALL",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x7f"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.L2_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the L2 cache.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which hit in the LLC.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_HIT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which missed all the caches.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x78"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.ALL",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x7f"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.L2_HIT",
+        "PublicDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 cache.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x78"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM",
+        "EventCode": "0xd3",
+        "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that miss in the L1 data cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L2 cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that miss in the L2 cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1c"
+    },
+    {
+        "BriefDescription": "Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.WCB_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ALL",
+        "SampleAfterValue": "20003",
+        "UMask": "0x7"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.RSV",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of load ops retired.",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -144,6 +286,42 @@
         "UMask": "0x5"
     },
     {
+        "BriefDescription": "Counts the number of load uops retired that performed one or more locks",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x21"
+    },
+    {
+        "BriefDescription": "Counts the number of memory uops retired that were splits.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x43"
+    },
+    {
+        "BriefDescription": "Counts the number of retired split load uops.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x41"
+    },
+    {
+        "BriefDescription": "Counts the number of retired split store uops.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x42"
+    },
+    {
         "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -151,5 +329,12 @@
         "PEBS": "2",
         "SampleAfterValue": "1000003",
         "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to an icache miss",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ICACHE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/floating-point.json b/tools/perf/pmu-events/arch/x86/grandridge/floating-point.json
new file mode 100644
index 000000000000..00c9a8ae0f53
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/grandridge/floating-point.json
@@ -0,0 +1,68 @@
+[
+    {
+        "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.FPDIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP64]",
+        "Deprecated": "1",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.DP",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.FP32",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.FP64",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP32]",
+        "Deprecated": "1",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.SP",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.FP_ASSIST",
+        "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point divide uops retired (x87 and sse, including x87 sqrt).",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.FPDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x8"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/frontend.json b/tools/perf/pmu-events/arch/x86/grandridge/frontend.json
index be8f1c7e195c..356d36aecc81 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/frontend.json
@@ -1,5 +1,21 @@
 [
     {
+        "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "EventCode": "0xe6",
+        "EventName": "BACLEARS.ANY",
+        "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend.  Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss",
+        "EventCode": "0xc6",
+        "EventName": "FRONTEND_RETIRED.ITLB_MISS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
         "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.",
         "EventCode": "0x80",
         "EventName": "ICACHE.ACCESSES",
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/memory.json b/tools/perf/pmu-events/arch/x86/grandridge/memory.json
index 79d8af45100c..e0ce2decc805 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/memory.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/memory.json
@@ -1,5 +1,71 @@
 [
     {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.ANY_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_BOUND_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xf4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x81"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.OTHER_AT_RET",
+        "PublicDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases such as pipeline conflicts, fences, etc.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc0"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.PGWALK_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xa0"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.ST_ADDR_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x84"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts misaligned loads that are 4K page splits.",
+        "EventCode": "0x13",
+        "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts misaligned stores that are 4K page splits.",
+        "EventCode": "0x13",
+        "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
+    },
+    {
         "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
         "EventCode": "0xB7",
         "EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/other.json b/tools/perf/pmu-events/arch/x86/grandridge/other.json
index 2414f6ff53b0..70a9da7e97df 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/other.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/other.json
@@ -1,5 +1,14 @@
 [
     {
+        "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]",
+        "Deprecated": "1",
+        "EventCode": "0xe4",
+        "EventName": "LBR_INSERTS.ANY",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts demand data reads that have any type of response.",
         "EventCode": "0xB7",
         "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
@@ -16,5 +25,12 @@
         "MSRValue": "0x10002",
         "SampleAfterValue": "100003",
         "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.",
+        "EventCode": "0x75",
+        "EventName": "SERIALIZATION.C01_MS_SCB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json
index 41212957ef21..daa0639bb1ca 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json
@@ -1,5 +1,13 @@
 [
     {
+        "BriefDescription": "Counts the number of cycles when any of the dividers are active.",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.DIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
         "BriefDescription": "Counts the total number of branch instructions retired for all branch types.",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
@@ -8,6 +16,71 @@
         "SampleAfterValue": "200003"
     },
     {
+        "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xbf"
+    },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.IND_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "Counts the number of near CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf9"
+    },
+    {
+        "BriefDescription": "Counts the number of near RET branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
         "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
@@ -16,6 +89,54 @@
         "SampleAfterValue": "200003"
     },
     {
+        "BriefDescription": "Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect CALL branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near taken branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near RET branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
         "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -68,6 +189,66 @@
         "SampleAfterValue": "2000003"
     },
     {
+        "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.ADDRESS_ALIAS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.DATA_UNKNOWN",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because its address partially overlapped with an older store.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.STORE_FORWARD",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.DISAMBIGUATION",
+        "SampleAfterValue": "20003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to a page fault.  Counts both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation occurs.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.PAGE_FAULT",
+        "SampleAfterValue": "20003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SLOW",
+        "SampleAfterValue": "20003",
+        "UMask": "0x6f"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SMC",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]",
+        "EventCode": "0xe4",
+        "EventName": "MISC_RETIRED.LBR_INSERTS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.",
         "EventCode": "0x73",
         "EventName": "TOPDOWN_BAD_SPECULATION.ALL",
@@ -75,22 +256,194 @@
         "SampleAfterValue": "1000003"
     },
     {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Fast Nukes such as  Memory Ordering Machine clears and MRN nukes",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.NUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls",
         "EventCode": "0x74",
         "EventName": "TOPDOWN_BE_BOUND.ALL",
         "SampleAfterValue": "1000003"
     },
     {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to due to certain allocation restrictions",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop).  This could be caused by RSV full or load/store buffer block.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC and FPC RAT stalls - which can be due to the FIQ and IEC reservation station stall (integer, FP and SIMD scheduler not being able to accept another uop. )",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to mrbl stall.  A 'marble' refers to a physical register file entry, also known as the physical destination (PDST).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REGISTER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
         "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls",
         "EventCode": "0x71",
         "EventName": "TOPDOWN_FE_BOUND.ALL",
         "SampleAfterValue": "1000003"
     },
     {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTClear",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ms",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.CISC",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stall",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.DECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8d"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x72"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to TOPDOWN_FE_BOUND.ITLB_MISS]",
+        "Deprecated": "1",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ITLB",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to itlb miss [This event is alias to TOPDOWN_FE_BOUND.ITLB]",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ITLB_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend that do not categorize into any other common frontend stall",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.OTHER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to predecode wrong",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.PREDECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
         "BriefDescription": "Counts the number of consumed retirement slots.  Similar to UOPS_RETIRED.ALL",
         "EventCode": "0x72",
         "EventName": "TOPDOWN_RETIRING.ALL",
         "PEBS": "1",
         "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the number of uops issued by the front end every cycle.",
+        "EventCode": "0x0e",
+        "EventName": "UOPS_ISSUED.ANY",
+        "PublicDescription": "Counts the number of uops issued by the front end every cycle. When 4-uops are requested and only 2-uops are delivered, the event counts 2.  Uops_issued correlates to the number of ROB entries.  If uop takes 2 ROB slots it counts as 2 uops_issued.",
+        "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the total number of uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003"
+    },
+    {
+        "BriefDescription": "Counts the number of integer divide uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.IDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.MS",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of x87 uops retired, includes those in ms flows",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.X87",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json
new file mode 100644
index 000000000000..74dfd9272cef
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-cache.json
@@ -0,0 +1,1795 @@
+[
+    {
+        "BriefDescription": "Clockticks for CMS units attached to CHA",
+        "EventCode": "0x01",
+        "EventName": "UNC_CHACMS_CLOCKTICKS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "PublicDescription": "UNC_CHACMS_CLOCKTICKS",
+        "Unit": "CHACMS"
+    },
+    {
+        "BriefDescription": "Number of CHA clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_CHA_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "Clockticks of the uncore caching and home agent (CHA)",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT).  Threshold for distress signal assertion reached in TOR or IRQ (immediate cause for triggering).",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_ANY",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x3",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT).  Threshold for distress signal assertion reached in IRQ (immediate cause for triggering).",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_IRQ",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT).  Threshold for distress signal assertion reached in TOR (immediate cause for triggering).",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_TOR",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts when a normal (Non-Isochronous) full line write is issued from the CHA to the any of the memory controller channels.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Full Line : Counts the total number of full line writes issued from the HA into the memory controller.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL_PRIORITY",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued : Partial Non-ISOCH : Counts the total number of full line writes issued from the HA into the memory controller.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Partial : Counts the total number of full line writes issued from the HA into the memory controller.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL_PRIORITY",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: CRd Requests",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1bd0ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1bc1ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Data Reads",
+        "UMask": "0x1fc1ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Demand Data Reads, Core and LLC prefetches",
+        "UMask": "0x841ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops which miss the Cache",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Data Read Misses",
+        "UMask": "0x1fc101",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCALLY_HOMED_ADDRESS",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Transactions homed locally",
+        "UMask": "0xbdfff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Read Requests and Code Read Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x19d0ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x19c1ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Read Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1850ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1841ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x1848ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: LLC Prefetch Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_LLC_PF",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x189dff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x199dff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1910ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1981ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x1908ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x19c8ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All RFO and RFO Prefetches",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : All RFOs - Demand and Prefetches",
+        "UMask": "0x1bc8ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.RFO_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Locally HOMed RFOs - Demand and Prefetches",
+        "UMask": "0x9c8ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Writes to Locally Homed Memory (includes writebacks from L1/L2)",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.WRITE_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Writes",
+        "UMask": "0x842ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : All Lines Victimized",
+        "UMask": "0xf",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : IA traffic : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.IA",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : IO traffic : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.IO",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - All Lines",
+        "UMask": "0x200f",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_E",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in E State",
+        "UMask": "0x2002",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_F",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in F State",
+        "UMask": "0x2008",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_M",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in M State",
+        "UMask": "0x2001",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_S",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in S State",
+        "UMask": "0x2004",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_E",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Lines in E state",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_M",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Lines in M state",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_S",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Lines in S State",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts when a RFO (the Read for Ownership issued before a  write) request hit a cacheline in the S (Shared) state.",
+        "EventCode": "0x39",
+        "EventName": "UNC_CHA_MISC.RFO_HIT_S",
+        "PerPkg": "1",
+        "PublicDescription": "Cbo Misc : RFO HitS",
+        "UMask": "0x8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Local InvItoE : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.LOCAL_INVITOE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Local Rd : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.LOCAL_READ",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Off : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.OFF_PWRHEURISTIC",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : RFO HitS Snoop Broadcast : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.RFO_HITS_SNP_BCAST",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE",
+        "PerPkg": "1",
+        "PublicDescription": "HA Read and Write Requests : InvalItoE",
+        "UMask": "0x30",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts read requests made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a  write) .",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS",
+        "PerPkg": "1",
+        "PublicDescription": "HA Read and Write Requests : Reads",
+        "UMask": "0x3",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts read requests coming from a unit on this socket made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a  write).",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts write requests made into the CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES",
+        "PerPkg": "1",
+        "PublicDescription": "HA Read and Write Requests : Writes",
+        "UMask": "0xc",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts  write requests coming from a unit on this socket made into this CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR Inserts",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All",
+        "UMask": "0xc001ffff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests from IA Cores",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from iA Cores",
+        "UMask": "0xc001ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlush events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CLFlushes issued by iA Cores",
+        "UMask": "0xc8c7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlushOpt events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSHOPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CLFlushOpts issued by iA Cores",
+        "UMask": "0xc8d7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRDs issued by iA Cores",
+        "UMask": "0xc80fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts; Code read prefetch from local IA that misses in the snoop filter",
+        "UMask": "0xc88fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores",
+        "UMask": "0xc827ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores",
+        "UMask": "0xc8a7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests from IA Cores which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from iA Cores that Hit the LLC",
+        "UMask": "0xc001fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC",
+        "UMask": "0xc80ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read prefetch from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc88ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores that hit the LLC",
+        "UMask": "0xc827fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt prefetch from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc8a7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM requests from local IA cores that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Hit LLC",
+        "UMask": "0xcc47fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch code read from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that hit the LLC",
+        "UMask": "0xcccffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch data read from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that hit the LLC",
+        "UMask": "0xccd7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch read for ownership from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC",
+        "UMask": "0xccc7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc807fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc887fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores",
+        "UMask": "0xcc47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNear requests from local IA cores",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears issued by iA Cores",
+        "UMask": "0xcd47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch code read from local IA.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores",
+        "UMask": "0xcccfff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch data read from local IA.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores",
+        "UMask": "0xccd7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores",
+        "UMask": "0xccc7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests from IA Cores which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC",
+        "UMask": "0xc001fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC",
+        "UMask": "0xc80ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CRDs from local IA cores to locally homed memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc80efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc88ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CRD Prefetches from local IA cores to locally homed memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc88efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that missed the LLC",
+        "UMask": "0xc827fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRd_Opt, and which target local memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc826fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRD_PREF_OPT, and target local memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a6fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM requests from local IA cores that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Missed LLC",
+        "UMask": "0xcc47fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch code read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that missed the LLC",
+        "UMask": "0xcccffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch data read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that missed the LLC",
+        "UMask": "0xccd7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC",
+        "UMask": "0xccc7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc8668601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc86e8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc807fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc806fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc887fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc886fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UCRDF requests from local IA cores that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_UCRDF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : UCRdFs issued by iA Cores that Missed LLC",
+        "UMask": "0xc877de01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from a local IA core that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc86ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA core that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores that Missed the LLC",
+        "UMask": "0xc867fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc8678601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc86f8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WIL requests from local IA cores that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WiLs issued by iA Cores that Missed LLC",
+        "UMask": "0xc87fde01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores",
+        "UMask": "0xc807ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores",
+        "UMask": "0xc887ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "SpecItoM events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_SPECITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : SpecItoMs issued by iA Cores",
+        "UMask": "0xcc57ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbEFtoEs issued by iA Cores.  (Non Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc3fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbEFtoIs issued by iA Cores .  (Non Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc37ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbMtoEs issued by iA Cores .  (Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc2fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbMtoI requests from local IA cores",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WbMtoIs issued by iA Cores",
+        "UMask": "0xcc27ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbStoIs issued by iA Cores .  (Non Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBSTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc67ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from a local IA core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores",
+        "UMask": "0xc86fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores",
+        "UMask": "0xc867ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR inserts from local IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from IO Devices",
+        "UMask": "0xc001ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlush requests from IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CLFlushes issued by IO Devices",
+        "UMask": "0xc8c3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR inserts from local IO devices which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from IO Devices that hit the LLC",
+        "UMask": "0xc001fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMs from local IO devices which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "UMask": "0xcd43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "PCIRDCURs issued by IO devices which hit the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that hit the LLC",
+        "UMask": "0xc8f3fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RFOs from local IO devices which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that hit the LLC",
+        "UMask": "0xc803fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR ItoM inserts from local IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices",
+        "UMask": "0xcc43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "UMask": "0xcd43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR inserts from local IO devices which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from IO Devices that missed the LLC",
+        "UMask": "0xc001fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR ItoM inserts from local IO devices which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "PCIRDCURs issued by IO devices which miss the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that missed the LLC",
+        "UMask": "0xc8f3fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR RFO inserts from local IO devices which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that missed the LLC",
+        "UMask": "0xc803fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "PCIRDCURs issued by IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices",
+        "UMask": "0xc8f3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RFOs from local IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by IO Devices",
+        "UMask": "0xc803ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WBMtoI requests from IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WbMtoIs issued by IO Devices",
+        "UMask": "0xcc23ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Inserts for SF or LLC Evictions",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LLC_OR_SF_EVICTIONS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR allocation occurred as a result of SF/LLC evictions (came from the ISMQ)",
+        "UMask": "0xc001ff02",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LOC_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All from Local iA and IO",
+        "UMask": "0xc000ff05",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All from Local iA",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LOC_IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All from Local iA",
+        "UMask": "0xc000ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All from Local IO",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LOC_IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All from Local IO",
+        "UMask": "0xc000ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Occupancy for all TOR entries",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All",
+        "UMask": "0xc001ffff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from iA Cores",
+        "UMask": "0xc001ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlush events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CLFlushes issued by iA Cores",
+        "UMask": "0xc8c7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlushOpt events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSHOPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CLFlushOpts issued by iA Cores",
+        "UMask": "0xc8d7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRDs issued by iA Cores",
+        "UMask": "0xc80fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy; Code read prefetch from local IA that misses in the snoop filter",
+        "UMask": "0xc88fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores",
+        "UMask": "0xc8a7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from iA Cores that Hit the LLC",
+        "UMask": "0xc001fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC",
+        "UMask": "0xc80ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc88ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc8a7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Hit LLC",
+        "UMask": "0xcc47fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that hit the LLC",
+        "UMask": "0xcccffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that hit the LLC",
+        "UMask": "0xccd7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC",
+        "UMask": "0xccc7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc807fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc887fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores",
+        "UMask": "0xcc47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNear requests from local IA cores",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears issued by iA Cores",
+        "UMask": "0xcd47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores",
+        "UMask": "0xcccfff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores",
+        "UMask": "0xccd7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores",
+        "UMask": "0xccc7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from iA Cores that Missed the LLC",
+        "UMask": "0xc001fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC",
+        "UMask": "0xc80ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CRDs from local IA cores to locally homed memory",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc80efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc88ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CRD Prefetches from local IA cores to locally homed memory",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc88efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Missed LLC",
+        "UMask": "0xcc47fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that missed the LLC",
+        "UMask": "0xcccffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that missed the LLC",
+        "UMask": "0xccd7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC",
+        "UMask": "0xccc7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc8668601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc86e8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc807fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc806fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc887fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc886fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for UCRDF requests from local IA cores that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_UCRDF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : UCRdFs issued by iA Cores that Missed LLC",
+        "UMask": "0xc877de01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc86ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA core that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores that Missed the LLC",
+        "UMask": "0xc867fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc8678601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc86f8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WIL requests from local IA cores that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WiLs issued by iA Cores that Missed LLC",
+        "UMask": "0xc87fde01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores",
+        "UMask": "0xc807ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores",
+        "UMask": "0xc887ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for SpecItoM events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_SPECITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : SpecItoMs issued by iA Cores",
+        "UMask": "0xcc57ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WbMtoI requests from local IA cores",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WbMtoIs issued by iA Cores",
+        "UMask": "0xcc27ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores",
+        "UMask": "0xc86fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores",
+        "UMask": "0xc867ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from IO Devices",
+        "UMask": "0xc001ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlush requests from IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CLFlushes issued by IO Devices",
+        "UMask": "0xc8c3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from IO Devices that hit the LLC",
+        "UMask": "0xc001fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMs from local IO devices which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "UMask": "0xcd43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which hit the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that hit the LLC",
+        "UMask": "0xc8f3fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RFOs from local IO devices which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that hit the LLC",
+        "UMask": "0xc803fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices",
+        "UMask": "0xcc43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "UMask": "0xcd43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from IO Devices that missed the LLC",
+        "UMask": "0xc001fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which miss the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC",
+        "UMask": "0xc8f3fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR RFO inserts from local IO devices which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that missed the LLC",
+        "UMask": "0xc803fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices",
+        "UMask": "0xc8f3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RFOs from local IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices",
+        "UMask": "0xc803ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WBMtoI requests from IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WbMtoIs issued by IO Devices",
+        "UMask": "0xcc23ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All from Local iA and IO",
+        "UMask": "0xc000ff05",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All from Local iA",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All from Local iA",
+        "UMask": "0xc000ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All from Local IO",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All from Local IO",
+        "UMask": "0xc000ff04",
+        "Unit": "CHA"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-interconnect.json
new file mode 100644
index 000000000000..9091f8fde51f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-interconnect.json
@@ -0,0 +1,175 @@
+[
+    {
+        "BriefDescription": "Clockticks of the mesh to memory (B2CMI)",
+        "EventCode": "0x01",
+        "EventName": "UNC_B2CMI_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of times B2CMI egress did D2C (direct to core)",
+        "EventCode": "0x16",
+        "EventName": "UNC_B2CMI_DIRECT2CORE_TAKEN",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of times D2C wasn't honoured even though the incoming request had d2c set for non cisgress txn",
+        "EventCode": "0x18",
+        "EventName": "UNC_B2CMI_DIRECT2CORE_TXN_OVERRIDE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts any read",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.ALL",
+        "PerPkg": "1",
+        "UMask": "0x104",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts normal reads issue to CMI",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.NORMAL",
+        "PerPkg": "1",
+        "UMask": "0x101",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts reads to 1lm non persistent memory regions",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.TO_DDR_AS_MEM",
+        "PerPkg": "1",
+        "UMask": "0x108",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "All Writes - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.ALL",
+        "PerPkg": "1",
+        "UMask": "0x110",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Full Non-ISOCH - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.FULL",
+        "PerPkg": "1",
+        "UMask": "0x101",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Partial Non-ISOCH - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.PARTIAL",
+        "PerPkg": "1",
+        "UMask": "0x102",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "DDR - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.TO_DDR_AS_MEM",
+        "PerPkg": "1",
+        "UMask": "0x120",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Inserts : XPT - Ch 0",
+        "EventCode": "0x56",
+        "EventName": "UNC_B2CMI_PREFCAM_INSERTS.CH0_XPT",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Inserts : XPT -All Channels",
+        "EventCode": "0x56",
+        "EventName": "UNC_B2CMI_PREFCAM_INSERTS.XPT_ALLCH",
+        "PerPkg": "1",
+        "PublicDescription": "Prefetch CAM Inserts : XPT - All Channels",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Occupancy : Channel 0",
+        "EventCode": "0x54",
+        "EventName": "UNC_B2CMI_PREFCAM_OCCUPANCY.CH0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Tracker Inserts : Channel 0",
+        "EventCode": "0x32",
+        "EventName": "UNC_B2CMI_TRACKER_INSERTS.CH0",
+        "PerPkg": "1",
+        "UMask": "0x104",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Tracker Occupancy : Channel 0",
+        "EventCode": "0x33",
+        "EventName": "UNC_B2CMI_TRACKER_OCCUPANCY.CH0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Write Tracker Inserts : Channel 0",
+        "EventCode": "0x40",
+        "EventName": "UNC_B2CMI_WR_TRACKER_INSERTS.CH0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Total Write Cache Occupancy : Mem",
+        "EventCode": "0x0F",
+        "EventName": "UNC_I_CACHE_TOTAL_OCCUPANCY.MEM",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "IRP Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_I_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "Inbound read requests received by the IRP and inserted into the FAF queue",
+        "EventCode": "0x18",
+        "EventName": "UNC_I_FAF_INSERTS",
+        "PerPkg": "1",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "FAF occupancy",
+        "EventCode": "0x19",
+        "EventName": "UNC_I_FAF_OCCUPANCY",
+        "PerPkg": "1",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "Misc Events - Set 1 : Lost Forward : Snoop pulled away ownership before a write was committed",
+        "EventCode": "0x1F",
+        "EventName": "UNC_I_MISC1.LOST_FWD",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "Inbound write (fast path) requests to coherent memory, received by the IRP resulting in write ownership requests issued by IRP to the mesh.",
+        "EventCode": "0x11",
+        "EventName": "UNC_I_TRANSACTIONS.WR_PREF",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "IRP"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-io.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-io.json
new file mode 100644
index 000000000000..c301ef95ae8d
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-io.json
@@ -0,0 +1,1187 @@
+[
+    {
+        "BriefDescription": "IIO Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_IIO_CLOCKTICKS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff0ff",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010010",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020020",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040040",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080080",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x10",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x20",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x40",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x80",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x10",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x20",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x40",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x80",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Hits to a 1G Page",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.1G_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x10",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Hits to a 2M Page",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.2M_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x8",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Hits to a 4K Page",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.4K_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x4",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Context cache hits",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x80",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Context cache lookups",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x40",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB lookups first",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.FIRST_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x1",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Fills (same as IOTLB miss)",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.MISSES",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOMMU memory access (both low and high priority)",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0xc0",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 1G page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_1G_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x4",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 256T page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_256T_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x10",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 512G page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_512G_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x8",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.ABORT",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff080",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.CONFINED_P2P",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff040",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.LOC_P2P",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff020",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MCAST",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MEM",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MSGB",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.UBOX",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "All 9 bits of Page Walk Tracker Occupancy",
+        "EventCode": "0x42",
+        "EventName": "UNC_IIO_PWT_OCCUPANCY",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080002",
+        "Unit": "IIO"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-memory.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-memory.json
new file mode 100644
index 000000000000..a2405ed640c9
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-memory.json
@@ -0,0 +1,385 @@
+[
+    {
+        "BriefDescription": "DRAM Activate Count : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.ALL",
+        "PerPkg": "1",
+        "UMask": "0xf7",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Activate Count : Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.RD",
+        "PerPkg": "1",
+        "UMask": "0xf1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Activate Count : Underfill Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.UFILL",
+        "PerPkg": "1",
+        "UMask": "0xf4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Activate Count : Write transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.WR",
+        "PerPkg": "1",
+        "UMask": "0xf2",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0, all CAS operations",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.ALL",
+        "PerPkg": "1",
+        "UMask": "0xff",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0, all reads",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.RD",
+        "PerPkg": "1",
+        "UMask": "0xcf",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 regular reads",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.RD_REG",
+        "PerPkg": "1",
+        "UMask": "0xc1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 underfill reads",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.RD_UNDERFILL",
+        "PerPkg": "1",
+        "UMask": "0xc4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0, all writes",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.WR",
+        "PerPkg": "1",
+        "UMask": "0xf0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 regular writes",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.WR_NONPRE",
+        "PerPkg": "1",
+        "UMask": "0xd0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 auto-precharge writes",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.WR_PRE",
+        "PerPkg": "1",
+        "UMask": "0xe0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1, all CAS operations",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.ALL",
+        "PerPkg": "1",
+        "UMask": "0xff",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1, all reads",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.RD",
+        "PerPkg": "1",
+        "UMask": "0xcf",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 regular reads",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.RD_REG",
+        "PerPkg": "1",
+        "UMask": "0xc1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 underfill reads",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.RD_UNDERFILL",
+        "PerPkg": "1",
+        "UMask": "0xc4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1, all writes",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.WR",
+        "PerPkg": "1",
+        "UMask": "0xf0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 regular writes",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.WR_NONPRE",
+        "PerPkg": "1",
+        "UMask": "0xd0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 auto-precharge writes",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.WR_PRE",
+        "PerPkg": "1",
+        "UMask": "0xe0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Number of DRAM DCLK clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_M_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "DRAM Clockticks",
+        "UMask": "0x1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Number of DRAM HCLK clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_M_HCLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "DRAM Clockticks",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.ALL",
+        "PerPkg": "1",
+        "UMask": "0xff",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.PGT",
+        "PerPkg": "1",
+        "UMask": "0xf8",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.RD",
+        "PerPkg": "1",
+        "UMask": "0xf1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.UFILL",
+        "PerPkg": "1",
+        "UMask": "0xf4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.WR",
+        "PerPkg": "1",
+        "UMask": "0xf2",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer inserts on subchannel 0",
+        "EventCode": "0x17",
+        "EventName": "UNC_M_RDB_INSERTS.SCH0",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer inserts on subchannel 1",
+        "EventCode": "0x17",
+        "EventName": "UNC_M_RDB_INSERTS.SCH1",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer occupancy on subchannel 0",
+        "EventCode": "0x1a",
+        "EventName": "UNC_M_RDB_OCCUPANCY_SCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer occupancy on subchannel 1",
+        "EventCode": "0x1b",
+        "EventName": "UNC_M_RDB_OCCUPANCY_SCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue.  This queue is used to schedule reads out to the memory controller and to track the requests.  Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC.  They deallocate after the CAS command has been issued to memory.  This includes both ISOCH and non-ISOCH requests.",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.PCH0",
+        "PerPkg": "1",
+        "UMask": "0x50",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue.  This queue is used to schedule reads out to the memory controller and to track the requests.  Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC.  They deallocate after the CAS command has been issued to memory.  This includes both ISOCH and non-ISOCH requests.",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.PCH1",
+        "PerPkg": "1",
+        "UMask": "0xa0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 0",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 1",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 0",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 1",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 0",
+        "EventCode": "0x80",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 1",
+        "EventCode": "0x81",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 0",
+        "EventCode": "0x82",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 1",
+        "EventCode": "0x83",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue Allocations",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.PCH0",
+        "PerPkg": "1",
+        "UMask": "0x50",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue Allocations",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.PCH1",
+        "PerPkg": "1",
+        "UMask": "0xa0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 0",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 1",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 0",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 1",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 0",
+        "EventCode": "0x84",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 1",
+        "EventCode": "0x85",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 0",
+        "EventCode": "0x86",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 1",
+        "EventCode": "0x87",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/uncore-power.json b/tools/perf/pmu-events/arch/x86/grandridge/uncore-power.json
new file mode 100644
index 000000000000..e3a66166e28c
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/grandridge/uncore-power.json
@@ -0,0 +1,10 @@
+[
+    {
+        "BriefDescription": "PCU Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_P_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "PCU Clockticks:  The PCU runs off a fixed 1 GHz clock.  This event counts the number of pclk cycles measured while the counter was enabled.  The pclk, like the Memory Controller's dclk, counts at a constant rate making it a good measure of actual wall time.",
+        "Unit": "PCU"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json b/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json
index bd5f2b634c98..371974c6d6c3 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/virtual-memory.json
@@ -1,24 +1,131 @@
 [
     {
-        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 1G page.",
+        "BriefDescription": "Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.STLB_HIT",
+        "SampleAfterValue": "200003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses.",
         "EventCode": "0x08",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "200003",
         "UMask": "0xe"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 2M or 4M page.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.  A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all pages sizes. Will result in a DTLB write from STLB.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.STLB_HIT",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x20"
+    },
+    {
         "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 1G page.",
         "EventCode": "0x49",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "2000003",
         "UMask": "0xe"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.MISS_CAUSED_WALK",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Account for all pages sizes. Will result in an ITLB write from STLB.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.STLB_HIT",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x20"
+    },
+    {
         "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.",
         "EventCode": "0x85",
         "EventName": "ITLB_MISSES.WALK_COMPLETED",
         "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size.  Includes page walks that page fault.",
         "SampleAfterValue": "200003",
         "UMask": "0xe"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 2M or 4M page.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.  A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.  Walks could be counted by edge detecting on this event, but would count restarted suspended walks.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.DTLB_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x90"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
index 79d89c263677..5631018ed388 100644
--- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
@@ -84,12 +84,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -202,7 +202,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -257,7 +257,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -289,20 +289,20 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
         "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -327,7 +327,7 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
         "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
@@ -397,96 +397,90 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
-    },
-    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
-    },
-    {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -502,21 +496,27 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
@@ -541,19 +541,6 @@
         "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
     },
     {
-        "BriefDescription": "Average number of parallel requests to external memory",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_parallel_requests",
-        "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
-    },
-    {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
         "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
         "MetricGroup": "SMT",
@@ -612,7 +599,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -621,7 +608,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -630,7 +617,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
@@ -640,20 +627,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -672,7 +659,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -707,21 +694,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -749,7 +736,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
         "ScaleUnit": "100%"
     },
@@ -768,7 +755,7 @@
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_0",
         "MetricThreshold": "tma_port_0 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -777,7 +764,7 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_1",
         "MetricThreshold": "tma_port_1 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -813,16 +800,16 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_5",
         "MetricThreshold": "tma_port_5 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -868,7 +855,7 @@
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_2",
         "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
+        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
         "ScaleUnit": "100%"
     },
     {
@@ -876,7 +863,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -952,14 +939,5 @@
         "MetricName": "tma_store_op_utilization",
         "MetricThreshold": "tma_store_op_utilization > 0.6",
         "ScaleUnit": "100%"
-    },
-    {
-        "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
-        "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
-        "MetricName": "tma_x87_use",
-        "MetricThreshold": "tma_x87_use > 0.1",
-        "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
-        "ScaleUnit": "100%"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/haswell/memory.json b/tools/perf/pmu-events/arch/x86/haswell/memory.json
index 2fc25e22a42a..6ba0ea6e3fa6 100644
--- a/tools/perf/pmu-events/arch/x86/haswell/memory.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/memory.json
@@ -371,7 +371,7 @@
         "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).",
         "EventCode": "0xc9",
         "EventName": "RTM_RETIRED.ABORTED",
-        "PEBS": "1",
+        "PEBS": "2",
         "SampleAfterValue": "2000003",
         "UMask": "0x4"
     },
diff --git a/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json b/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
index 5f451948c893..21e2cb5e3178 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
@@ -286,12 +286,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -404,7 +404,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -459,7 +459,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -491,20 +491,20 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
         "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -529,7 +529,7 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
         "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@))",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
@@ -599,96 +599,139 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "tma_info_memory_latency_data_l2_mlp",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_data_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricExpr": "tma_info_memory_load_l2_miss_latency",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
+        "MetricExpr": "tma_info_memory_load_l2_mlp",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+    },
+    {
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricExpr": "tma_info_memory_tlb_page_walks_utilization",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_page_walks_utilization",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -704,21 +747,27 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
@@ -775,6 +824,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -815,7 +870,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -824,7 +879,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -833,7 +888,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
@@ -843,20 +898,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -875,7 +930,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -890,11 +945,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -920,21 +974,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -962,7 +1016,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
         "ScaleUnit": "100%"
     },
@@ -981,7 +1035,7 @@
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_0",
         "MetricThreshold": "tma_port_0 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -990,7 +1044,7 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_1",
         "MetricThreshold": "tma_port_1 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1026,16 +1080,16 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_5",
         "MetricThreshold": "tma_port_5 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1081,7 +1135,7 @@
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_2",
         "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
+        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
         "ScaleUnit": "100%"
     },
     {
@@ -1089,7 +1143,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -1104,11 +1158,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1187,15 +1240,6 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
-        "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
-        "MetricName": "tma_x87_use",
-        "MetricThreshold": "tma_x87_use > 0.1",
-        "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
-        "ScaleUnit": "100%"
-    },
-    {
         "BriefDescription": "Uncore operating frequency in GHz",
         "MetricExpr": "UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time",
         "MetricName": "uncore_frequency",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json b/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
index daebf1050acb..c391325ee36b 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
@@ -426,6 +426,7 @@
         "BriefDescription": "Number of cores in C-State; C0 and C1",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
+        "Filter": "occ_sel=1",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -434,6 +435,7 @@
         "BriefDescription": "Number of cores in C-State; C3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
+        "Filter": "occ_sel=2",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -442,6 +444,7 @@
         "BriefDescription": "Number of cores in C-State; C6 and C7",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
+        "Filter": "occ_sel=3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
index b43a6c6d8b7f..f67cc73779f8 100644
--- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
@@ -98,12 +98,12 @@
         "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots",
+        "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -113,7 +113,7 @@
     {
         "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
         "DefaultMetricgroupName": "TopdownL1",
-        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots",
+        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots",
         "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
         "MetricName": "tma_backend_bound",
         "MetricThreshold": "tma_backend_bound > 0.2",
@@ -135,7 +135,7 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.",
         "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_branch_instructions",
         "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6",
         "ScaleUnit": "100%"
@@ -180,7 +180,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(29 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(29 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -200,7 +200,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -212,7 +212,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -240,7 +240,7 @@
         "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -259,7 +259,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -268,12 +268,12 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
-        "MetricExpr": "32.5 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
+        "MetricExpr": "32.5 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -286,7 +286,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -294,7 +294,7 @@
         "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -328,6 +328,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -390,13 +399,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
-        "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -405,7 +414,7 @@
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
@@ -446,6 +455,12 @@
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -472,67 +487,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -564,7 +614,7 @@
     },
     {
         "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
-        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
+        "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)",
         "MetricGroup": "SMT",
         "MetricName": "tma_info_core_core_clks"
     },
@@ -575,8 +625,14 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -588,8 +644,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -669,7 +725,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -677,7 +733,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -685,7 +741,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -693,7 +749,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -701,7 +757,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -709,7 +765,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -727,7 +783,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -740,6 +796,12 @@
         "MetricThreshold": "tma_info_inst_mix_ipload < 3"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -763,136 +825,148 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_inst_mix_instructions",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L3 cache miss demand Loads",
         "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l3_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l3_miss_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_bus_lock_pki"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -920,42 +994,56 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -1071,8 +1159,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1081,7 +1169,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1091,7 +1179,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1101,24 +1189,24 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "9 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "9 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1132,7 +1220,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1175,7 +1263,7 @@
         "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_lsd",
-        "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit.  LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
         "ScaleUnit": "100%"
     },
@@ -1190,21 +1278,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1228,11 +1316,11 @@
     },
     {
         "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
-        "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
+        "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1249,7 +1337,7 @@
         "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
@@ -1258,16 +1346,16 @@
         "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group",
         "MetricName": "tma_mite_4wide",
-        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1276,22 +1364,22 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1299,6 +1387,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks",
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
@@ -1326,17 +1430,17 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1345,7 +1449,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
+        "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1375,7 +1479,7 @@
         "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%"
     },
@@ -1393,18 +1497,18 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
         "ScaleUnit": "100%"
     },
@@ -1433,7 +1537,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1501,10 +1605,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/icelake/memory.json b/tools/perf/pmu-events/arch/x86/icelake/memory.json
index e8d2ec1c029b..f84763220549 100644
--- a/tools/perf/pmu-events/arch/x86/icelake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/icelake/memory.json
@@ -259,6 +259,7 @@
         "BriefDescription": "Number of times an RTM execution aborted.",
         "EventCode": "0xc9",
         "EventName": "RTM_RETIRED.ABORTED",
+        "PEBS": "1",
         "PublicDescription": "Counts the number of times RTM abort was triggered.",
         "SampleAfterValue": "100003",
         "UMask": "0x4"
diff --git a/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json b/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json
index a151ba9cccb0..5452a1448ded 100644
--- a/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/icelake/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -25,7 +25,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -63,8 +65,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -77,9 +81,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -99,10 +103,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/icelake/other.json b/tools/perf/pmu-events/arch/x86/icelake/other.json
index cfb590632918..4fdc87339555 100644
--- a/tools/perf/pmu-events/arch/x86/icelake/other.json
+++ b/tools/perf/pmu-events/arch/x86/icelake/other.json
@@ -19,7 +19,7 @@
         "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.",
         "EventCode": "0x28",
         "EventName": "CORE_POWER.LVL2_TURBO_LICENSE",
-        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture).  This includes high current AVX 512-bit instructions.",
+        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture).  This includes high current AVX 512-bit instructions.",
         "SampleAfterValue": "200003",
         "UMask": "0x20"
     },
diff --git a/tools/perf/pmu-events/arch/x86/icelake/pipeline.json b/tools/perf/pmu-events/arch/x86/icelake/pipeline.json
index 375b78044f14..c7313fd4fdf4 100644
--- a/tools/perf/pmu-events/arch/x86/icelake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/icelake/pipeline.json
@@ -529,7 +529,7 @@
         "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread",
         "EventCode": "0x5e",
         "EventName": "RS_EVENTS.EMPTY_CYCLES",
-        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)",
+        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)",
         "SampleAfterValue": "1000003",
         "UMask": "0x1"
     },
@@ -553,14 +553,6 @@
         "UMask": "0x2"
     },
     {
-        "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions",
-        "EventCode": "0xa4",
-        "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS",
-        "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by branch mispredictions. This event estimates number of operations that were issued but not retired from the speculative path as well as the out-of-order engine recovery past a branch misprediction.",
-        "SampleAfterValue": "10000003",
-        "UMask": "0x8"
-    },
-    {
         "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event",
         "EventName": "TOPDOWN.SLOTS",
         "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).",
diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json
index 71d78a7841ea..c015b8277dc7 100644
--- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json
@@ -302,12 +302,12 @@
         "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots",
+        "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -317,7 +317,7 @@
     {
         "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
         "DefaultMetricgroupName": "TopdownL1",
-        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots",
+        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots",
         "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
         "MetricName": "tma_backend_bound",
         "MetricThreshold": "tma_backend_bound > 0.2",
@@ -339,7 +339,7 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.",
         "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_branch_instructions",
         "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6",
         "ScaleUnit": "100%"
@@ -384,7 +384,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 43.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 43.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -404,7 +404,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "43.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "43.5 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -416,7 +416,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -444,7 +444,7 @@
         "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -463,7 +463,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -472,12 +472,12 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
-        "MetricExpr": "48 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
+        "MetricExpr": "48 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -490,7 +490,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -498,7 +498,7 @@
         "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -532,6 +532,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -594,13 +603,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
-        "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -609,7 +618,7 @@
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
@@ -644,12 +653,39 @@
     },
     {
         "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
-        "MetricExpr": "tma_info_core_ipmispredict",
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;BadSpec;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmispredict",
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
+        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
+        "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots - (CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES) * (topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / slots)) * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
+        "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_core_bound_likely",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.",
+        "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))",
+        "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_dsb_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.",
+        "MetricExpr": "100 * (100 * ((5 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE - INT_MISC.UOP_DROPPING) / slots * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 10 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(3 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))",
+        "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_ic_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -676,67 +712,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -768,7 +839,7 @@
     },
     {
         "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
-        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
+        "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)",
         "MetricGroup": "SMT",
         "MetricName": "tma_info_core_core_clks"
     },
@@ -779,8 +850,14 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -792,19 +869,12 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
     {
-        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
-        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
-        "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group",
-        "MetricName": "tma_info_core_ipmispredict",
-        "MetricgroupNoGroup": "TopdownL1"
-    },
-    {
         "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
         "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY",
         "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
@@ -874,7 +944,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -882,7 +952,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -890,7 +960,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -898,7 +968,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -906,7 +976,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -914,7 +984,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -932,7 +1002,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -945,6 +1015,12 @@
         "MetricThreshold": "tma_info_inst_mix_ipload < 3"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -967,16 +1043,30 @@
         "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp"
     },
     {
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "tma_info_memory_mix_bus_lock_pki",
+        "MetricGroup": "Mem;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_bus_lock_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki",
+        "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_code_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
@@ -992,124 +1082,227 @@
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "tma_info_memory_latency_data_l2_mlp",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_data_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
+        "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)",
+        "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_silent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (OFFCORE_REQUESTS.ALL_DATA_RD - OFFCORE_REQUESTS.DEMAND_DATA_RD + L2_RQSTS.ALL_DEMAND_MISS + L2_RQSTS.SWPF_MISS) / tma_info_inst_mix_instructions",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_access_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricExpr": "tma_info_memory_load_l2_miss_latency",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L3 cache miss demand Loads",
-        "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
+        "MetricExpr": "tma_info_memory_load_l3_miss_latency",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l3_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l3_miss_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Average Latency for L3 cache miss demand Loads",
+        "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l3_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_bus_lock_pki"
+    },
+    {
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "tma_info_memory_uc_load_pki",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
+    },
+    {
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+    },
+    {
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (2 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_page_walks_utilization",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_store_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -1137,54 +1330,77 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_uc_load_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]",
-        "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_read_bw"
+        "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time",
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_read_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU"
     },
     {
         "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]",
-        "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_write_bw"
+        "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_HIT_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e9 / duration_time",
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_write_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -1209,7 +1425,7 @@
     {
         "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]",
         "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / cha_0@event\\=0x0@",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_dram_read_latency",
         "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1223,7 +1439,7 @@
     {
         "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]",
         "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / cha_0@event\\=0x0@ if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_pmm_read_latency",
         "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1237,13 +1453,13 @@
     {
         "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]",
         "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryBW;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryBW;Server;SoC",
         "MetricName": "tma_info_system_pmm_read_bw"
     },
     {
         "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]",
         "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryBW;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryBW;Server;SoC",
         "MetricName": "tma_info_system_pmm_write_bw"
     },
     {
@@ -1288,6 +1504,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -1340,8 +1562,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1350,7 +1572,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1360,7 +1582,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1370,24 +1592,24 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "19 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "19 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1401,7 +1623,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1431,10 +1653,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricExpr": "43.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "43.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1459,21 +1681,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1497,11 +1719,11 @@
     },
     {
         "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
-        "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
+        "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1518,7 +1740,7 @@
         "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
@@ -1527,16 +1749,16 @@
         "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group",
         "MetricName": "tma_mite_4wide",
-        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1545,22 +1767,22 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1568,8 +1790,24 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a",
-        "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)",
+        "MetricExpr": "(((1 - (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)",
         "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_pmm_bound",
         "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -1604,17 +1842,17 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1623,7 +1861,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
+        "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1653,13 +1891,13 @@
         "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
-        "MetricExpr": "(97 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 97 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(97 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 97 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group",
         "MetricName": "tma_remote_cache",
         "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -1668,10 +1906,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricExpr": "108 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "108 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1689,18 +1927,18 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricExpr": "37 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
         "ScaleUnit": "100%"
     },
@@ -1729,7 +1967,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1797,10 +2035,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json
index bc6a9a4d27a9..904d299c95a3 100644
--- a/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/icelakex/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -26,7 +26,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -64,8 +66,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -78,9 +82,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -100,10 +104,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json
index ee4dac6fc797..920cab6ffe37 100644
--- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-power.json
@@ -151,6 +151,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "PerPkg": "1",
         "PublicDescription": "Number of cores in C-State : C0 and C1 : This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x40",
         "Unit": "PCU"
     },
     {
@@ -159,6 +160,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "PerPkg": "1",
         "PublicDescription": "Number of cores in C-State : C3 : This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x80",
         "Unit": "PCU"
     },
     {
@@ -167,6 +169,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "PerPkg": "1",
         "PublicDescription": "Number of cores in C-State : C6 and C7 : This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0xc0",
         "Unit": "PCU"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
index 33fe555252b2..5f3f0b5aebad 100644
--- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
@@ -84,12 +84,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -202,7 +202,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -257,7 +257,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -287,7 +287,7 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_scalar",
         "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -296,7 +296,25 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_vector",
         "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+        "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_EXECUTED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_128b",
+        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+        "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_EXECUTED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_256b",
+        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -316,20 +334,20 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
         "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_thread_clks - tma_itlb_misses",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -360,8 +378,8 @@
         "MetricName": "tma_info_core_flopc"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -398,7 +416,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -438,96 +456,90 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
-    },
-    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
-    },
-    {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -537,8 +549,8 @@
         "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
@@ -549,21 +561,27 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
@@ -572,7 +590,7 @@
         "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -595,19 +613,6 @@
         "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
     },
     {
-        "BriefDescription": "Average number of parallel requests to external memory",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_parallel_requests",
-        "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
-    },
-    {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
         "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
         "MetricGroup": "SMT",
@@ -673,7 +678,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -682,7 +687,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -691,7 +696,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
@@ -701,20 +706,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.LLC_MISS))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -733,7 +738,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -768,21 +773,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -810,7 +815,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
         "ScaleUnit": "100%"
     },
@@ -829,7 +834,7 @@
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_0",
         "MetricThreshold": "tma_port_0 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -838,7 +843,7 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_1",
         "MetricThreshold": "tma_port_1 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -874,7 +879,7 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_5",
         "MetricThreshold": "tma_port_5 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -911,7 +916,7 @@
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_2",
         "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
+        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
         "ScaleUnit": "100%"
     },
     {
@@ -919,7 +924,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
index f5e46a768fdd..e6f5b05a71b5 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
@@ -84,12 +84,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
+        "MetricExpr": "66 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -202,7 +202,7 @@
         "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -257,7 +257,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -287,7 +287,7 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_scalar",
         "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -296,7 +296,25 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_vector",
         "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+        "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_EXECUTED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_128b",
+        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+        "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_EXECUTED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_256b",
+        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -316,20 +334,20 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
         "MetricExpr": "ICACHE.IFETCH_STALL / tma_info_thread_clks - tma_itlb_misses",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -360,8 +378,8 @@
         "MetricName": "tma_info_core_flopc"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -398,7 +416,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -438,96 +456,90 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.LLC_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
-    },
-    {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
-    },
-    {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
-    },
-    {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "0",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricConstraint": "NO_GROUP_EVENTS",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
@@ -537,8 +549,8 @@
         "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
@@ -549,21 +561,27 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
     },
@@ -572,7 +590,7 @@
         "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -627,6 +645,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -674,7 +698,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -683,7 +707,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -692,7 +716,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
@@ -702,20 +726,20 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.LLC_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS. Related metrics: tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -734,7 +758,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -749,11 +773,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -779,21 +802,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -821,7 +844,7 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
         "ScaleUnit": "100%"
     },
@@ -840,7 +863,7 @@
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_0",
         "MetricThreshold": "tma_port_0 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch). Sample with: UOPS_DISPATCHED_PORT.PORT_0. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -849,7 +872,7 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_1",
         "MetricThreshold": "tma_port_1 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_1. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -885,7 +908,7 @@
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_5",
         "MetricThreshold": "tma_port_5 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU). Sample with: UOPS_DISPATCHED.PORT_5. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -922,7 +945,7 @@
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_2",
         "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
+        "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).  Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_port_6",
         "ScaleUnit": "100%"
     },
     {
@@ -930,7 +953,7 @@
         "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -945,11 +968,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "310 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json b/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json
index f6a0258e3241..8c808347f6da 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -24,7 +24,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -94,6 +96,7 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
index 5df1ebfb89ea..ad6c531a9e38 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
@@ -514,6 +514,7 @@
         "BriefDescription": "Number of cores in C-State; C0 and C1",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
+        "Filter": "occ_sel=1",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -522,6 +523,7 @@
         "BriefDescription": "Number of cores in C-State; C3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
+        "Filter": "occ_sel=2",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -530,6 +532,7 @@
         "BriefDescription": "Number of cores in C-State; C6 and C7",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
+        "Filter": "occ_sel=3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
index 35b1a3aa728d..fc8c3f785be1 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
@@ -163,7 +163,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp",
         "ScaleUnit": "100%"
@@ -193,7 +193,7 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_scalar",
         "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -202,7 +202,25 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_vector",
         "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+        "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_128b",
+        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+        "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_DISPATCHED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_256b",
+        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -222,7 +240,7 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
@@ -244,7 +262,7 @@
         "MetricName": "tma_info_core_flopc"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
         "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
@@ -271,21 +289,27 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth"
     },
@@ -294,7 +318,7 @@
         "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -349,6 +373,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -389,7 +419,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -399,7 +429,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
@@ -421,7 +451,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -436,21 +466,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json b/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json
index bebb85945d62..a2c27794c0d8 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -23,7 +23,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -88,6 +90,7 @@
     "tma_issueTLB": "Metrics related by the issue $issueTLB",
     "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
index b3ee5d741015..6f98fc1728e6 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
@@ -233,6 +233,7 @@
         "BriefDescription": "Number of cores in C0",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
+        "Filter": "occ_sel=1",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -241,6 +242,7 @@
         "BriefDescription": "Number of cores in C0",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
+        "Filter": "occ_sel=2",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
@@ -249,6 +251,7 @@
         "BriefDescription": "Number of cores in C0",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
+        "Filter": "occ_sel=3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
         "Unit": "PCU"
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 4d1deed4437a..5297d25f4e03 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -1,38 +1,38 @@
 Family-model,Version,Filename,EventType
-GenuineIntel-6-(97|9A|B7|BA|BF),v1.23,alderlake,core
-GenuineIntel-6-BE,v1.23,alderlaken,core
+GenuineIntel-6-(97|9A|B7|BA|BF),v1.24,alderlake,core
+GenuineIntel-6-BE,v1.24,alderlaken,core
 GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core
-GenuineIntel-6-(3D|47),v28,broadwell,core
+GenuineIntel-6-(3D|47),v29,broadwell,core
 GenuineIntel-6-56,v11,broadwellde,core
 GenuineIntel-6-4F,v22,broadwellx,core
 GenuineIntel-6-55-[56789ABCDEF],v1.20,cascadelakex,core
 GenuineIntel-6-9[6C],v1.04,elkhartlake,core
-GenuineIntel-6-CF,v1.02,emeraldrapids,core
+GenuineIntel-6-CF,v1.03,emeraldrapids,core
 GenuineIntel-6-5[CF],v13,goldmont,core
 GenuineIntel-6-7A,v1.01,goldmontplus,core
-GenuineIntel-6-B6,v1.00,grandridge,core
+GenuineIntel-6-B6,v1.01,grandridge,core
 GenuineIntel-6-A[DE],v1.01,graniterapids,core
-GenuineIntel-6-(3C|45|46),v33,haswell,core
+GenuineIntel-6-(3C|45|46),v35,haswell,core
 GenuineIntel-6-3F,v28,haswellx,core
-GenuineIntel-6-7[DE],v1.19,icelake,core
+GenuineIntel-6-7[DE],v1.21,icelake,core
 GenuineIntel-6-6[AC],v1.23,icelakex,core
 GenuineIntel-6-3A,v24,ivybridge,core
 GenuineIntel-6-3E,v24,ivytown,core
 GenuineIntel-6-2D,v24,jaketown,core
 GenuineIntel-6-(57|85),v16,knightslanding,core
 GenuineIntel-6-BD,v1.00,lunarlake,core
-GenuineIntel-6-A[AC],v1.06,meteorlake,core
+GenuineIntel-6-A[AC],v1.07,meteorlake,core
 GenuineIntel-6-1[AEF],v4,nehalemep,core
 GenuineIntel-6-2E,v4,nehalemex,core
-GenuineIntel-6-A7,v1.01,rocketlake,core
+GenuineIntel-6-A7,v1.02,rocketlake,core
 GenuineIntel-6-2A,v19,sandybridge,core
 GenuineIntel-6-8F,v1.17,sapphirerapids,core
-GenuineIntel-6-AF,v1.00,sierraforest,core
+GenuineIntel-6-AF,v1.01,sierraforest,core
 GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core
-GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v57,skylake,core
+GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v58,skylake,core
 GenuineIntel-6-55-[01234],v1.32,skylakex,core
 GenuineIntel-6-86,v1.21,snowridgex,core
-GenuineIntel-6-8[CD],v1.13,tigerlake,core
+GenuineIntel-6-8[CD],v1.15,tigerlake,core
 GenuineIntel-6-2C,v5,westmereep-dp,core
 GenuineIntel-6-25,v4,westmereep-sp,core
 GenuineIntel-6-2F,v4,westmereex,core
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
index 5fef87502d4b..47861a6dd8e9 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
@@ -319,7 +319,7 @@
         "EventCode": "0x35",
         "EventName": "MEM_BOUND_STALLS_IFETCH.ALL",
         "SampleAfterValue": "1000003",
-        "UMask": "0x6f",
+        "UMask": "0x7f",
         "Unit": "cpu_atom"
     },
     {
@@ -344,7 +344,7 @@
         "EventCode": "0x35",
         "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS",
         "SampleAfterValue": "1000003",
-        "UMask": "0x68",
+        "UMask": "0x78",
         "Unit": "cpu_atom"
     },
     {
@@ -352,7 +352,7 @@
         "EventCode": "0x34",
         "EventName": "MEM_BOUND_STALLS_LOAD.ALL",
         "SampleAfterValue": "1000003",
-        "UMask": "0x6f",
+        "UMask": "0x7f",
         "Unit": "cpu_atom"
     },
     {
@@ -377,7 +377,7 @@
         "EventCode": "0x34",
         "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS",
         "SampleAfterValue": "1000003",
-        "UMask": "0x68",
+        "UMask": "0x78",
         "Unit": "cpu_atom"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json b/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json
index f66506ee37ef..30e604d2120f 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json
@@ -1,5 +1,14 @@
 [
     {
+        "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.FPDIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "This event counts the cycles the floating point divider is busy.",
         "CounterMask": "1",
         "EventCode": "0xb0",
@@ -26,7 +35,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0",
+        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_0",
         "SampleAfterValue": "2000003",
@@ -34,7 +43,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1",
+        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_1",
         "SampleAfterValue": "2000003",
@@ -42,7 +51,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5",
+        "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_5",
         "SampleAfterValue": "2000003",
@@ -50,6 +59,30 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]",
+        "EventCode": "0xb3",
+        "EventName": "FP_ARITH_DISPATCHED.V0",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]",
+        "EventCode": "0xb3",
+        "EventName": "FP_ARITH_DISPATCHED.V1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]",
+        "EventCode": "0xb3",
+        "EventName": "FP_ARITH_DISPATCHED.V2",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE",
@@ -131,6 +164,53 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP64]",
+        "Deprecated": "1",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.DP",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.FP32",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.FP64",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP32]",
+        "Deprecated": "1",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.SP",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.",
         "EventCode": "0xc3",
         "EventName": "MACHINE_CLEARS.FP_ASSIST",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/other.json b/tools/perf/pmu-events/arch/x86/meteorlake/other.json
index d55e792c0c43..7effc1f271e7 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/other.json
@@ -8,6 +8,16 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]",
+        "Deprecated": "1",
+        "EventCode": "0xe4",
+        "EventName": "LBR_INSERTS.ANY",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts demand data reads that have any type of response.",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
index deaa7aba93f7..24bbfcebd2be 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
@@ -1,5 +1,14 @@
 [
     {
+        "BriefDescription": "Counts the number of cycles when any of the dividers are active.",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.DIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations.",
         "CounterMask": "1",
         "EventCode": "0xb0",
@@ -46,6 +55,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Conditional branch instructions retired.",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.COND",
@@ -66,6 +84,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Taken conditional branch instructions retired.",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.COND_TAKEN",
@@ -95,6 +122,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Indirect near branch instructions retired (excluding returns)",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.INDIRECT",
@@ -105,6 +141,25 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.IND_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts the number of near CALL branch instructions retired.",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.NEAR_CALL",
@@ -124,6 +179,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of near RET branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Return instructions retired.",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.NEAR_RETURN",
@@ -671,6 +735,7 @@
         "BriefDescription": "INST_RETIRED.MACRO_FUSED",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.MACRO_FUSED",
+        "PEBS": "1",
         "SampleAfterValue": "2000003",
         "UMask": "0x10",
         "Unit": "cpu_core"
@@ -679,6 +744,7 @@
         "BriefDescription": "Retired NOP instructions.",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.NOP",
+        "PEBS": "1",
         "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions",
         "SampleAfterValue": "2000003",
         "UMask": "0x2",
@@ -697,6 +763,7 @@
         "BriefDescription": "Iterations of Repeat string retired instructions.",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.REP_ITERATION",
+        "PEBS": "1",
         "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.",
         "SampleAfterValue": "2000003",
         "UMask": "0x8",
@@ -980,6 +1047,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]",
+        "EventCode": "0xe4",
+        "EventName": "MISC_RETIRED.LBR_INSERTS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts cycles where the pipeline is stalled due to serializing operations.",
         "EventCode": "0xa2",
         "EventName": "RESOURCE_STALLS.SCOREBOARD",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json
index 056c2a885a32..55798e64c58a 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json
@@ -71,6 +71,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Page walks completed due to a demand data load to a 4K page.",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
@@ -151,6 +160,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Page walks completed due to a demand data store to a 2M/4M page.",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M",
@@ -160,6 +178,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Page walks completed due to a demand data store to a 4K page.",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
@@ -258,6 +285,15 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (4K)",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/memory.json b/tools/perf/pmu-events/arch/x86/rocketlake/memory.json
index e8d2ec1c029b..f84763220549 100644
--- a/tools/perf/pmu-events/arch/x86/rocketlake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/rocketlake/memory.json
@@ -259,6 +259,7 @@
         "BriefDescription": "Number of times an RTM execution aborted.",
         "EventCode": "0xc9",
         "EventName": "RTM_RETIRED.ABORTED",
+        "PEBS": "1",
         "PublicDescription": "Counts the number of times RTM abort was triggered.",
         "SampleAfterValue": "100003",
         "UMask": "0x4"
diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json
index a151ba9cccb0..5452a1448ded 100644
--- a/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/rocketlake/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -25,7 +25,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -63,8 +65,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -77,9 +81,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -99,10 +103,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/other.json b/tools/perf/pmu-events/arch/x86/rocketlake/other.json
index cfb590632918..4fdc87339555 100644
--- a/tools/perf/pmu-events/arch/x86/rocketlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/rocketlake/other.json
@@ -19,7 +19,7 @@
         "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.",
         "EventCode": "0x28",
         "EventName": "CORE_POWER.LVL2_TURBO_LICENSE",
-        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture).  This includes high current AVX 512-bit instructions.",
+        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture).  This includes high current AVX 512-bit instructions.",
         "SampleAfterValue": "200003",
         "UMask": "0x20"
     },
diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json b/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json
index 375b78044f14..c7313fd4fdf4 100644
--- a/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/rocketlake/pipeline.json
@@ -529,7 +529,7 @@
         "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread",
         "EventCode": "0x5e",
         "EventName": "RS_EVENTS.EMPTY_CYCLES",
-        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)",
+        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)",
         "SampleAfterValue": "1000003",
         "UMask": "0x1"
     },
@@ -553,14 +553,6 @@
         "UMask": "0x2"
     },
     {
-        "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions",
-        "EventCode": "0xa4",
-        "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS",
-        "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by branch mispredictions. This event estimates number of operations that were issued but not retired from the speculative path as well as the out-of-order engine recovery past a branch misprediction.",
-        "SampleAfterValue": "10000003",
-        "UMask": "0x8"
-    },
-    {
         "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event",
         "EventName": "TOPDOWN.SLOTS",
         "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).",
diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json
index 27433fc15ede..1dad462e58b1 100644
--- a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json
@@ -98,12 +98,12 @@
         "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots",
+        "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -113,7 +113,7 @@
     {
         "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
         "DefaultMetricgroupName": "TopdownL1",
-        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots",
+        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots",
         "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
         "MetricName": "tma_backend_bound",
         "MetricThreshold": "tma_backend_bound > 0.2",
@@ -135,7 +135,7 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.",
         "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_branch_instructions",
         "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6",
         "ScaleUnit": "100%"
@@ -180,7 +180,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(29 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(29 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -200,7 +200,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "23.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "23.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -212,7 +212,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -240,7 +240,7 @@
         "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -259,7 +259,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -268,12 +268,12 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
-        "MetricExpr": "32.5 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
+        "MetricExpr": "32.5 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -286,7 +286,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -294,7 +294,7 @@
         "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -328,6 +328,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -390,13 +399,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
-        "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -405,7 +414,7 @@
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
@@ -446,6 +455,12 @@
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -472,67 +487,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -564,7 +614,7 @@
     },
     {
         "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
-        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
+        "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)",
         "MetricGroup": "SMT",
         "MetricName": "tma_info_core_core_clks"
     },
@@ -575,8 +625,14 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -588,8 +644,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -669,7 +725,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -677,7 +733,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -685,7 +741,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -693,7 +749,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -701,7 +757,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -709,7 +765,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -727,7 +783,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -740,6 +796,12 @@
         "MetricThreshold": "tma_info_inst_mix_ipload < 3"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -763,142 +825,154 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L3 cache miss demand Loads",
         "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l3_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l3_miss_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_bus_lock_pki"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -926,42 +1000,56 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -998,12 +1086,6 @@
         "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
     },
     {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0",
         "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks",
         "MetricGroup": "Power",
@@ -1097,8 +1179,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1107,7 +1189,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1117,7 +1199,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1127,24 +1209,24 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "9 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "9 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1158,7 +1240,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1201,7 +1283,7 @@
         "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_lsd",
-        "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit.  LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
         "ScaleUnit": "100%"
     },
@@ -1216,21 +1298,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1254,11 +1336,11 @@
     },
     {
         "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
-        "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
+        "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1275,7 +1357,7 @@
         "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
@@ -1284,16 +1366,16 @@
         "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group",
         "MetricName": "tma_mite_4wide",
-        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1302,22 +1384,22 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1325,6 +1407,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks",
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
@@ -1352,17 +1450,17 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1371,7 +1469,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
+        "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1401,7 +1499,7 @@
         "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%"
     },
@@ -1419,18 +1517,18 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
         "ScaleUnit": "100%"
     },
@@ -1459,7 +1557,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1527,10 +1625,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json
index bebb85945d62..a2c27794c0d8 100644
--- a/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "DSB": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -23,7 +23,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -88,6 +90,7 @@
     "tma_issueTLB": "Metrics related by the issue $issueTLB",
     "tma_l1_bound_group": "Metrics contributing to tma_l1_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
index 8898b6fd0dea..ce836ebda542 100644
--- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
@@ -163,7 +163,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_lcp",
         "ScaleUnit": "100%"
@@ -193,7 +193,7 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_scalar",
         "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting. Related metrics: tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -202,7 +202,25 @@
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_vector",
         "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+        "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE) / UOPS_DISPATCHED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_128b",
+        "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+        "MetricExpr": "(SIMD_FP_256.PACKED_DOUBLE + SIMD_FP_256.PACKED_SINGLE) / UOPS_DISPATCHED.THREAD",
+        "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
+        "MetricName": "tma_fp_vector_256b",
+        "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
+        "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_512b, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -222,7 +240,7 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
@@ -244,7 +262,7 @@
         "MetricName": "tma_info_core_flopc"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
         "MetricExpr": "UOPS_DISPATCHED.THREAD / (cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
@@ -271,21 +289,27 @@
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
         "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_mem_bandwidth"
     },
@@ -294,7 +318,7 @@
         "MetricExpr": "(FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * (FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE) + 8 * SIMD_FP_256.PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -317,19 +341,6 @@
         "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
     },
     {
-        "BriefDescription": "Average number of parallel requests to external memory",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_parallel_requests",
-        "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
-    },
-    {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
         "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
         "MetricGroup": "SMT",
@@ -388,7 +399,7 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "(12 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
@@ -398,7 +409,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_SMT",
         "MetricExpr": "MEM_LOAD_UOPS_RETIRED.LLC_HIT / (MEM_LOAD_UOPS_RETIRED.LLC_HIT + 7 * MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
@@ -420,7 +431,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -435,21 +446,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_info_system_dram_bw_use",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: ",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json
index e6f7934320bf..81e5ca1c3078 100644
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/metricgroups.json
@@ -2,10 +2,11 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "C0Wait": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -27,7 +28,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -68,6 +71,7 @@
     "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -81,9 +85,9 @@
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_int_operations_group": "Metrics contributing to tma_int_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -103,11 +107,13 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_bandwidth_group": "Metrics contributing to tma_mem_bandwidth category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json
index 56e54babcc26..6f0e6360e989 100644
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json
@@ -85,6 +85,24 @@
         "ScaleUnit": "1MB/s"
     },
     {
+        "BriefDescription": "Percentage of inbound full cacheline writes initiated by end device controllers that miss the L3 cache.",
+        "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM / UNC_CHA_TOR_INSERTS.IO_ITOM",
+        "MetricName": "io_percent_of_inbound_full_writes_that_miss_l3",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Percentage of inbound partial cacheline writes initiated by end device controllers that miss the L3 cache.",
+        "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_RFO) / (UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_RFO)",
+        "MetricName": "io_percent_of_inbound_partial_writes_that_miss_l3",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Percentage of inbound reads initiated by end device controllers that miss the L3 cache.",
+        "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR / UNC_CHA_TOR_INSERTS.IO_PCIRDCUR",
+        "MetricName": "io_percent_of_inbound_reads_that_miss_l3",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions",
         "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY",
         "MetricName": "itlb_2nd_level_large_page_mpi",
@@ -310,20 +328,20 @@
         "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix Extensions (AMX) execution engine was busy with tile (arithmetic) operations",
+        "BriefDescription": "This metric estimates fraction of cycles where the Advanced Matrix eXtensions (AMX) execution engine was busy with tile (arithmetic) operations",
         "MetricExpr": "EXE.AMX_BUSY / tma_info_core_core_clks",
-        "MetricGroup": "Compute;HPC;Server;TopdownL5;tma_L5_group;tma_ports_utilized_0_group",
+        "MetricGroup": "Compute;HPC;Server;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_amx_busy",
-        "MetricThreshold": "tma_amx_busy > 0.5 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_amx_busy > 0.5 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * cpu@ASSISTS.ANY\\,umask\\=0x1B@ / tma_info_thread_slots",
+        "MetricExpr": "78 * ASSISTS.ANY / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -381,6 +399,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.1 power-performance optimized state (Faster wakeup time; Smaller power savings).",
+        "MetricExpr": "CPU_CLK_UNHALTED.C01 / tma_info_thread_clks",
+        "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group",
+        "MetricName": "tma_c01_wait",
+        "MetricThreshold": "tma_c01_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due staying in C0.2 power-performance optimized state (Slower wakeup time; Larger power savings).",
+        "MetricExpr": "CPU_CLK_UNHALTED.C02 / tma_info_thread_clks",
+        "MetricGroup": "C0Wait;TopdownL4;tma_L4_group;tma_serializing_operation_group",
+        "MetricName": "tma_c02_wait",
+        "MetricThreshold": "tma_c02_wait > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
         "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
@@ -400,7 +434,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
-        "MetricExpr": "(76 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(76 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 75.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -420,7 +454,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
-        "MetricExpr": "75.5 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "75.5 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -432,7 +466,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -459,7 +493,7 @@
         "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -478,7 +512,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -487,12 +521,12 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
-        "MetricExpr": "80 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
+        "MetricExpr": "80 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -505,7 +539,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -514,7 +548,7 @@
         "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
         "MetricGroup": "Default;FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2;Default",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -540,17 +574,8 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine)",
-        "MetricExpr": "cpu@AMX_OPS_RETIRED.BF16\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Compute;Flops;HPC;Pipeline;Server;TopdownL4;tma_L4_group;tma_fp_arith_group",
-        "MetricName": "tma_fp_amx",
-        "MetricThreshold": "tma_fp_amx > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic floating-point (FP) matrix uops fraction the CPU has retired (aggregated across all supported FP datatypes in AMX engine). Refer to AMX_Busy and GFLOPs metrics for actual AMX utilization and FP performance, resp.",
-        "ScaleUnit": "100%"
-    },
-    {
         "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
-        "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector + tma_fp_amx",
+        "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
         "MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_fp_arith",
         "MetricThreshold": "tma_fp_arith > 0.2 & tma_light_operations > 0.6",
@@ -568,7 +593,7 @@
     },
     {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + FP_ARITH_INST_RETIRED2.SCALAR) / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_scalar",
         "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
@@ -577,7 +602,7 @@
     },
     {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)",
+        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ + FP_ARITH_INST_RETIRED2.VECTOR) / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
         "MetricName": "tma_fp_vector",
         "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
@@ -625,10 +650,10 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_fused_instructions",
         "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}",
         "ScaleUnit": "100%"
     },
     {
@@ -639,13 +664,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2;Default",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .). Sample with: UOPS_RETIRED.HEAVY",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
         "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -653,7 +678,7 @@
     },
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
@@ -694,6 +719,33 @@
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
+        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
+        "MetricExpr": "(100 * (1 - max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) / (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) if max(0, topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)) < (((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=0x1@) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0) + 0 * slots",
+        "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_core_bound_likely",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.",
+        "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + max(0, topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots - (topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots)) * ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2) / ((IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2 + (IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD) / 2)))",
+        "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_dsb_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.",
+        "MetricExpr": "100 * (100 * ((topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / slots) * (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD) / (ICACHE_DATA.STALLS / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + INT_MISC.UNKNOWN_BRANCH_CYCLES / CPU_CLK_UNHALTED.THREAD) + min(3 * cpu@UOPS_RETIRED.MS\\,cmask\\=0x1\\,edge\\=0x1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))",
+        "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_ic_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
         "MetricGroup": "Cor;SMT",
@@ -717,61 +769,98 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -803,7 +892,7 @@
     },
     {
         "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
-        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
+        "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)",
         "MetricGroup": "SMT",
         "MetricName": "tma_info_core_core_clks"
     },
@@ -814,8 +903,14 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR_HALF + 2 * (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF) + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * (FP_ARITH_INST_RETIRED2.128B_PACKED_HALF + cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@) + 16 * (FP_ARITH_INST_RETIRED2.256B_PACKED_HALF + FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) + 32 * FP_ARITH_INST_RETIRED2.512B_PACKED_HALF + 4 * AMX_OPS_RETIRED.BF16",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -827,8 +922,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -884,6 +979,13 @@
         "MetricName": "tma_info_frontend_l2mpki_code_all"
     },
     {
+        "BriefDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection",
+        "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / cpu@INT_MISC.UNKNOWN_BRANCH_CYCLES\\,cmask\\=1\\,edge@",
+        "MetricGroup": "Fed",
+        "MetricName": "tma_info_frontend_unknown_branch_cost",
+        "PublicDescription": "Average number of cycles the front-end was delayed due to an Unknown Branch detection. See Unknown_Branches node."
+    },
+    {
         "BriefDescription": "Branch instructions per taken branch.",
         "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
         "MetricGroup": "Branches;Fed;PGO",
@@ -898,27 +1000,11 @@
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ + FP_ARITH_INST_RETIRED2.VECTOR))",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + FP_ARITH_INST_RETIRED2.SCALAR + (cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ + FP_ARITH_INST_RETIRED2.VECTOR))",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
-    },
-    {
-        "BriefDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.BF16",
-        "MetricGroup": "Flops;FpVector;InsType;Server",
-        "MetricName": "tma_info_inst_mix_iparith_amx_f16",
-        "MetricThreshold": "tma_info_inst_mix_iparith_amx_f16 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions."
-    },
-    {
-        "BriefDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / AMX_OPS_RETIRED.INT8",
-        "MetricGroup": "InsType;IntVector;Server",
-        "MetricName": "tma_info_inst_mix_iparith_amx_int8",
-        "MetricThreshold": "tma_info_inst_mix_iparith_amx_int8 < 10",
-        "PublicDescription": "Instructions per Integer Arithmetic AMX operation (lower number means higher occurrence rate). Operations factored per matrices' sizes of the AMX instructions."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -926,7 +1012,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -934,7 +1020,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -942,7 +1028,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -950,7 +1036,15 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
+    },
+    {
+        "BriefDescription": "Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate)",
+        "MetricExpr": "INST_RETIRED.ANY / FP_ARITH_INST_RETIRED2.SCALAR",
+        "MetricGroup": "Flops;FpScalar;InsType;Server",
+        "MetricName": "tma_info_inst_mix_iparith_scalar_hp",
+        "MetricThreshold": "tma_info_inst_mix_iparith_scalar_hp < 10",
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Half-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -958,7 +1052,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -976,7 +1070,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / tma_info_core_flopc",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -989,6 +1083,12 @@
         "MetricThreshold": "tma_info_inst_mix_ipload < 3"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / CPU_CLK_UNHALTED.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -1011,16 +1111,30 @@
         "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp"
     },
     {
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "tma_info_memory_mix_bus_lock_pki",
+        "MetricGroup": "Mem;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_bus_lock_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki",
+        "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_code_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
@@ -1036,77 +1150,232 @@
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "tma_info_memory_latency_data_l2_mlp",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_data_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
+        "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)",
+        "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_silent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_access_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "MetricGroup": "Memory_BW;Offcore",
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
+    },
+    {
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "tma_info_memory_load_l2_miss_latency",
+        "MetricGroup": "Memory_Lat;Offcore",
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
+        "MetricGroup": "Memory_BW;Offcore",
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
+    },
+    {
+        "BriefDescription": "Average Latency for L3 cache miss demand Loads",
+        "MetricExpr": "tma_info_memory_load_l3_miss_latency",
+        "MetricGroup": "Memory_Lat;Offcore",
+        "MetricName": "tma_info_memory_latency_load_l3_miss_latency"
+    },
+    {
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=0x1@",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Average Latency for L3 cache miss demand Loads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l3_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
         "MetricExpr": "L1D_PEND_MISS.PENDING / MEM_LOAD_COMPLETED.L1_MISS_ANY",
         "MetricGroup": "Mem;MemoryBound;MemoryLat",
         "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
+        "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_bus_lock_pki"
+    },
+    {
+        "BriefDescription": "Off-core accesses per kilo instruction for modified write requests",
+        "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / tma_info_inst_mix_instructions",
+        "MetricGroup": "Offcore",
+        "MetricName": "tma_info_memory_mix_offcore_mwrite_any_pki"
+    },
+    {
+        "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)",
+        "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / tma_info_inst_mix_instructions",
+        "MetricGroup": "CacheHits;Offcore",
+        "MetricName": "tma_info_memory_mix_offcore_read_any_pki"
+    },
+    {
+        "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)",
+        "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / tma_info_inst_mix_instructions",
+        "MetricGroup": "Offcore",
+        "MetricName": "tma_info_memory_mix_offcore_read_l3m_pki"
+    },
+    {
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "tma_info_memory_uc_load_pki",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
+    },
+    {
         "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
         "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
         "MetricGroup": "Mem;MemoryBW;MemoryBound",
@@ -1114,52 +1383,84 @@
         "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
-        "BriefDescription": "Average Parallel L2 cache miss data reads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
-        "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "BriefDescription": "Off-core accesses per kilo instruction for modified write requests",
+        "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / INST_RETIRED.ANY",
+        "MetricGroup": "Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_offcore_mwrite_any_pki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
-        "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)",
+        "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / INST_RETIRED.ANY",
+        "MetricGroup": "CacheHits;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_offcore_read_any_pki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
-        "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)",
+        "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_offcore_read_l3m_pki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average Latency for L3 cache miss demand Loads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
-        "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l3_miss_latency"
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * (CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else CPU_CLK_UNHALTED.THREAD))",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_page_walks_utilization",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket",
+        "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_r2c_dram_bw",
+        "MetricgroupNoGroup": "TopdownL1",
+        "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW."
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)",
+        "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_r2c_l3m_bw",
+        "MetricgroupNoGroup": "TopdownL1",
+        "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW."
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)",
+        "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "HPC;Mem;MemoryBW;SoC;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_r2c_offcore_bw",
+        "MetricgroupNoGroup": "TopdownL1",
+        "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches."
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket",
+        "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / duration_time",
+        "MetricGroup": "HPC;Mem;MemoryBW;SoC",
+        "MetricName": "tma_info_memory_soc_r2c_dram_bw",
+        "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW."
+    },
+    {
+        "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)",
+        "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / duration_time",
+        "MetricGroup": "HPC;Mem;MemoryBW;SoC",
+        "MetricName": "tma_info_memory_soc_r2c_l3m_bw",
+        "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW."
+    },
+    {
+        "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)",
+        "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / duration_time",
+        "MetricGroup": "HPC;Mem;MemoryBW;SoC",
+        "MetricName": "tma_info_memory_soc_r2c_offcore_bw",
+        "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches."
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_store_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -1187,15 +1488,22 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_uc_load_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
         "BriefDescription": "Instructions per a microcode Assist invocation",
-        "MetricExpr": "INST_RETIRED.ANY / cpu@ASSISTS.ANY\\,umask\\=0x1B@",
-        "MetricGroup": "Pipeline;Ret;Retire",
+        "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
         "MetricName": "tma_info_pipeline_ipassist",
         "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
         "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
@@ -1209,41 +1517,62 @@
     {
         "BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions",
         "MetricExpr": "INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
-        "MetricGroup": "Pipeline;Ret",
+        "MetricGroup": "MicroSeq;Pipeline;Ret",
         "MetricName": "tma_info_pipeline_strings_cycles",
         "MetricThreshold": "tma_info_pipeline_strings_cycles > 0.1"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Fraction of cycles the processor is waiting yet unhalted; covering legacy PAUSE instruction, as well as C0.1 / C0.2 power-performance optimized states",
+        "MetricExpr": "CPU_CLK_UNHALTED.C0_WAIT / tma_info_thread_clks",
+        "MetricGroup": "C0Wait",
+        "MetricName": "tma_info_system_c0_wait",
+        "MetricThreshold": "tma_info_system_c0_wait > 0.05"
+    },
+    {
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "tma_info_core_flopc / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
-        "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]",
+        "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]",
         "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR * 64 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_write_bw"
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_read_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU"
+    },
+    {
+        "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]",
+        "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR) * 64 / 1e9 / duration_time",
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_write_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -1268,7 +1597,7 @@
     {
         "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]",
         "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_DDR / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_DDR) / uncore_cha_0@event\\=0x1@",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_dram_read_latency",
         "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1282,7 +1611,7 @@
     {
         "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]",
         "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_pmm_read_latency",
         "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1297,13 +1626,13 @@
     {
         "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]",
         "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryBW;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryBW;Server;SoC",
         "MetricName": "tma_info_system_pmm_read_bw"
     },
     {
         "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]",
         "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / duration_time if #has_pmem > 0 else 0)",
-        "MetricGroup": "Mem;MemoryBW;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryBW;Server;SoC",
         "MetricName": "tma_info_system_pmm_write_bw"
     },
     {
@@ -1319,18 +1648,18 @@
         "MetricName": "tma_info_system_socket_clks"
     },
     {
-        "BriefDescription": "Tera Integer (matrix) Operations Per Second",
-        "MetricExpr": "8 * AMX_OPS_RETIRED.INT8 / 1e12 / duration_time",
-        "MetricGroup": "Cor;HPC;IntVector;Server",
-        "MetricName": "tma_info_system_tiops"
-    },
-    {
         "BriefDescription": "Average Frequency Utilization relative nominal frequency",
         "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
         "MetricGroup": "Power",
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Cross-socket Ultra Path Interconnect (UPI) data transmit bandwidth for data only [MB / sec]",
         "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 64 / 9 / 1e6",
         "MetricGroup": "Server;SoC",
@@ -1388,17 +1717,8 @@
         "MetricThreshold": "tma_info_thread_uptb < 9"
     },
     {
-        "BriefDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine)",
-        "MetricExpr": "cpu@AMX_OPS_RETIRED.INT8\\,cmask\\=1@ / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Compute;HPC;IntVector;Pipeline;Server;TopdownL4;tma_L4_group;tma_int_operations_group",
-        "MetricName": "tma_int_amx",
-        "MetricThreshold": "tma_int_amx > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric approximates arithmetic Integer (Int) matrix uops fraction the CPU has retired (aggregated across all supported Int datatypes in AMX engine). Refer to AMX_Busy and TIOPs metrics for actual AMX utilization and Int performance, resp.",
-        "ScaleUnit": "100%"
-    },
-    {
         "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)",
-        "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles + tma_int_amx",
+        "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_int_operations",
         "MetricThreshold": "tma_int_operations > 0.1 & tma_light_operations > 0.6",
@@ -1415,18 +1735,18 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired",
+        "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired",
         "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group;tma_issue2P",
         "MetricName": "tma_int_vector_256b",
         "MetricThreshold": "tma_int_vector_256b > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)",
-        "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_port_0, tma_port_1, tma_port_5, tma_port_6, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
         "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1435,7 +1755,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1444,7 +1764,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1453,19 +1773,19 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "33 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "33 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1485,7 +1805,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2;Default",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1515,10 +1835,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricExpr": "71 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "71 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1551,21 +1871,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1583,9 +1903,9 @@
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_memory_fence",
-        "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_memory_fence > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -1602,7 +1922,7 @@
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1619,32 +1939,32 @@
         "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 6 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / tma_info_thread_clks",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
-        "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY) / tma_info_thread_clks",
+        "MetricExpr": "3 * cpu@UOPS_RETIRED.MS\\,cmask\\=1\\,edge@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY) / tma_info_thread_clks",
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
         "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_non_fused_branches",
         "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
@@ -1653,15 +1973,15 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1669,6 +1989,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults",
         "MetricExpr": "99 * ASSISTS.PAGE_FAULT / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_assists_group",
@@ -1679,7 +2015,7 @@
     },
     {
         "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a",
-        "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 0)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)",
+        "MetricExpr": "(((1 - (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)",
         "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_pmm_bound",
         "MetricThreshold": "tma_pmm_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -1705,17 +2041,17 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_int_vector_128b, tma_int_vector_256b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1724,7 +2060,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks",
+        "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu@RS.EMPTY\\,umask\\=1@) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1756,13 +2092,13 @@
         "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
-        "MetricExpr": "(135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(135.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 135.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group",
         "MetricName": "tma_remote_cache",
         "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -1771,10 +2107,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricExpr": "149 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "149 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1791,28 +2127,29 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
-        "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks + tma_c02_wait",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.",
-        "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_int_operations_group",
-        "MetricName": "tma_shuffles",
-        "MetricThreshold": "tma_shuffles > 0.1 & (tma_int_operations > 0.1 & tma_light_operations > 0.6)",
+        "BriefDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer)",
+        "MetricExpr": "tma_light_operations * INT_VEC_RETIRED.SHUFFLES / (tma_retiring * tma_info_thread_slots)",
+        "MetricGroup": "HPC;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
+        "MetricName": "tma_shuffles_256b",
+        "MetricThreshold": "tma_shuffles_256b > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring Shuffle operations of 256-bit vector size (FP or Integer). Shuffles may incur slow cross \"vector lane\" data transfers.",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST",
         "ScaleUnit": "100%"
     },
@@ -1840,7 +2177,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1907,10 +2244,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json
index 7f0dc65a55d2..f937ba0e50e1 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json
@@ -16,6 +16,148 @@
         "UMask": "0x4f"
     },
     {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an instruction cache or TLB miss.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.ALL",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x7f"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.L2_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the L2 cache.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which hit in the LLC.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_HIT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which missed all the caches.",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x78"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.ALL",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x7f"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.L2_HIT",
+        "PublicDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 cache.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x78"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM",
+        "EventCode": "0xd3",
+        "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that miss in the L1 data cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L2 cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that miss in the L2 cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1c"
+    },
+    {
+        "BriefDescription": "Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.WCB_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ALL",
+        "SampleAfterValue": "20003",
+        "UMask": "0x7"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.RSV",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of load ops retired.",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -144,6 +286,42 @@
         "UMask": "0x5"
     },
     {
+        "BriefDescription": "Counts the number of load uops retired that performed one or more locks",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x21"
+    },
+    {
+        "BriefDescription": "Counts the number of memory uops retired that were splits.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x43"
+    },
+    {
+        "BriefDescription": "Counts the number of retired split load uops.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x41"
+    },
+    {
+        "BriefDescription": "Counts the number of retired split store uops.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x42"
+    },
+    {
         "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -151,5 +329,12 @@
         "PEBS": "2",
         "SampleAfterValue": "1000003",
         "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to an icache miss",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ICACHE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/floating-point.json b/tools/perf/pmu-events/arch/x86/sierraforest/floating-point.json
new file mode 100644
index 000000000000..00c9a8ae0f53
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/floating-point.json
@@ -0,0 +1,68 @@
+[
+    {
+        "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.FPDIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP64]",
+        "Deprecated": "1",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.DP",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results [This event is alias to FP_FLOPS_RETIRED.SP]",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.FP32",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results [This event is alias to FP_FLOPS_RETIRED.DP]",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.FP64",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to FP_FLOPS_RETIRED.FP32]",
+        "Deprecated": "1",
+        "EventCode": "0xc8",
+        "EventName": "FP_FLOPS_RETIRED.SP",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.FP_ASSIST",
+        "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point divide uops retired (x87 and sse, including x87 sqrt).",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.FPDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x8"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json b/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json
index be8f1c7e195c..356d36aecc81 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/frontend.json
@@ -1,5 +1,21 @@
 [
     {
+        "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "EventCode": "0xe6",
+        "EventName": "BACLEARS.ANY",
+        "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend.  Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss",
+        "EventCode": "0xc6",
+        "EventName": "FRONTEND_RETIRED.ITLB_MISS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
         "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.",
         "EventCode": "0x80",
         "EventName": "ICACHE.ACCESSES",
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/memory.json
index 79d8af45100c..e0ce2decc805 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/memory.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/memory.json
@@ -1,5 +1,71 @@
 [
     {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.ANY_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_BOUND_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xf4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x81"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.OTHER_AT_RET",
+        "PublicDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases such as pipeline conflicts, fences, etc.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc0"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.PGWALK_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xa0"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.ST_ADDR_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x84"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts misaligned loads that are 4K page splits.",
+        "EventCode": "0x13",
+        "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts misaligned stores that are 4K page splits.",
+        "EventCode": "0x13",
+        "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
+    },
+    {
         "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
         "EventCode": "0xB7",
         "EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/other.json b/tools/perf/pmu-events/arch/x86/sierraforest/other.json
index 2414f6ff53b0..70a9da7e97df 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/other.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/other.json
@@ -1,5 +1,14 @@
 [
     {
+        "BriefDescription": "This event is deprecated. [This event is alias to MISC_RETIRED.LBR_INSERTS]",
+        "Deprecated": "1",
+        "EventCode": "0xe4",
+        "EventName": "LBR_INSERTS.ANY",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts demand data reads that have any type of response.",
         "EventCode": "0xB7",
         "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
@@ -16,5 +25,12 @@
         "MSRValue": "0x10002",
         "SampleAfterValue": "100003",
         "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.",
+        "EventCode": "0x75",
+        "EventName": "SERIALIZATION.C01_MS_SCB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json
index 41212957ef21..ba9843110f07 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json
@@ -1,5 +1,13 @@
 [
     {
+        "BriefDescription": "Counts the number of cycles when any of the dividers are active.",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.DIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
         "BriefDescription": "Counts the total number of branch instructions retired for all branch types.",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
@@ -8,6 +16,71 @@
         "SampleAfterValue": "200003"
     },
     {
+        "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xbf"
+    },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.IND_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "Counts the number of near CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf9"
+    },
+    {
+        "BriefDescription": "Counts the number of near RET branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
         "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
@@ -16,6 +89,54 @@
         "SampleAfterValue": "200003"
     },
     {
+        "BriefDescription": "Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect CALL branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near taken branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near RET branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
         "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -68,6 +189,66 @@
         "SampleAfterValue": "2000003"
     },
     {
+        "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.ADDRESS_ALIAS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.DATA_UNKNOWN",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because its address partially overlapped with an older store.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.STORE_FORWARD",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.DISAMBIGUATION",
+        "SampleAfterValue": "20003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to a page fault.  Counts both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation occurs.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.PAGE_FAULT",
+        "SampleAfterValue": "20003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SLOW",
+        "SampleAfterValue": "20003",
+        "UMask": "0x6f"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SMC",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of Last Branch Record (LBR) entries. Requires LBRs to be enabled and configured in IA32_LBR_CTL. [This event is alias to LBR_INSERTS.ANY]",
+        "EventCode": "0xe4",
+        "EventName": "MISC_RETIRED.LBR_INSERTS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.",
         "EventCode": "0x73",
         "EventName": "TOPDOWN_BAD_SPECULATION.ALL",
@@ -75,22 +256,201 @@
         "SampleAfterValue": "1000003"
     },
     {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Fast Nukes such as  Memory Ordering Machine clears and MRN nukes",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.NUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
         "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls",
         "EventCode": "0x74",
         "EventName": "TOPDOWN_BE_BOUND.ALL",
         "SampleAfterValue": "1000003"
     },
     {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to due to certain allocation restrictions",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stall (scheduler not being able to accept another uop).  This could be caused by RSV full or load/store buffer block.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC and FPC RAT stalls - which can be due to the FIQ and IEC reservation station stall (integer, FP and SIMD scheduler not being able to accept another uop. )",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to mrbl stall.  A 'marble' refers to a physical register file entry, also known as the physical destination (PDST).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REGISTER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
         "BriefDescription": "Counts the number of retirement slots not consumed due to front end stalls",
         "EventCode": "0x71",
         "EventName": "TOPDOWN_FE_BOUND.ALL",
         "SampleAfterValue": "1000003"
     },
     {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BAClear",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTClear",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ms",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.CISC",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stall",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.DECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8d"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x72"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to TOPDOWN_FE_BOUND.ITLB_MISS]",
+        "Deprecated": "1",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ITLB",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to itlb miss [This event is alias to TOPDOWN_FE_BOUND.ITLB]",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ITLB_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend that do not categorize into any other common frontend stall",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.OTHER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to predecode wrong",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.PREDECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
         "BriefDescription": "Counts the number of consumed retirement slots.  Similar to UOPS_RETIRED.ALL",
         "EventCode": "0x72",
         "EventName": "TOPDOWN_RETIRING.ALL",
         "PEBS": "1",
         "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the number of uops issued by the front end every cycle.",
+        "EventCode": "0x0e",
+        "EventName": "UOPS_ISSUED.ANY",
+        "PublicDescription": "Counts the number of uops issued by the front end every cycle. When 4-uops are requested and only 2-uops are delivered, the event counts 2.  Uops_issued correlates to the number of ROB entries.  If uop takes 2 ROB slots it counts as 2 uops_issued.",
+        "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the total number of uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003"
+    },
+    {
+        "BriefDescription": "Counts the number of integer divide uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.IDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.MS",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of x87 uops retired, includes those in ms flows",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.X87",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cache.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cache.json
new file mode 100644
index 000000000000..a3aafbbc3484
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cache.json
@@ -0,0 +1,2853 @@
+[
+    {
+        "BriefDescription": "Clockticks for CMS units attached to CHA",
+        "EventCode": "0x01",
+        "EventName": "UNC_CHACMS_CLOCKTICKS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "PublicDescription": "UNC_CHACMS_CLOCKTICKS",
+        "Unit": "CHACMS"
+    },
+    {
+        "BriefDescription": "Number of CHA clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_CHA_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "Clockticks of the uncore caching and home agent (CHA)",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts transactions that looked into the multi-socket cacheline Directory state, and therefore did not send a snoop because the Directory indicated it was not needed.",
+        "EventCode": "0x53",
+        "EventName": "UNC_CHA_DIR_LOOKUP.NO_SNP",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts  transactions that looked into the multi-socket cacheline Directory state, and sent one or more snoops, because the Directory indicated it was needed.",
+        "EventCode": "0x53",
+        "EventName": "UNC_CHA_DIR_LOOKUP.SNP",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts only multi-socket cacheline Directory state updates memory writes issued from the HA pipe. This does not include memory write requests which are for I (Invalid) or E (Exclusive) cachelines.",
+        "EventCode": "0x54",
+        "EventName": "UNC_CHA_DIR_UPDATE.HA",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts only multi-socket cacheline Directory state updates due to memory writes issued from the TOR pipe which are the result of remote transaction hitting the SF/LLC and returning data Core2Core. This does not include memory write requests which are for I (Invalid) or E (Exclusive) cachelines.",
+        "EventCode": "0x54",
+        "EventName": "UNC_CHA_DIR_UPDATE.TOR",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT).  Threshold for distress signal assertion reached in TOR or IRQ (immediate cause for triggering).",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_ANY",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x3",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT).  Threshold for distress signal assertion reached in IRQ (immediate cause for triggering).",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_IRQ",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Distress signal assertion for dynamic prefetch throttle (DPT).  Threshold for distress signal assertion reached in TOR (immediate cause for triggering).",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_TOR",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts when a normal (Non-Isochronous) full line write is issued from the CHA to the any of the memory controller channels.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Full Line : Counts the total number of full line writes issued from the HA into the memory controller.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL_PRIORITY",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued : Partial Non-ISOCH : Counts the total number of full line writes issued from the HA into the memory controller.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued : ISOCH Partial : Counts the total number of full line writes issued from the HA into the memory controller.",
+        "EventCode": "0x5b",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.PARTIAL_PRIORITY",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All Requests to Remotely Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.ALL_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : All transactions from Remote Agents",
+        "UMask": "0x17e0ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: CRd Requests",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1bd0ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1bc1ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Data Reads",
+        "UMask": "0x1fc1ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Demand Data Reads, Core and LLC prefetches",
+        "UMask": "0x841ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests, Read Prefetches, and Snoops which miss the Cache",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.DATA_READ_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Data Read Misses",
+        "UMask": "0x1fc101",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCALLY_HOMED_ADDRESS",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Transactions homed locally",
+        "UMask": "0xbdfff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Read Requests and Code Read Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x19d0ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests and Read Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x19c1ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Read Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1850ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1841ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_DMND_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x1848ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: LLC Prefetch Requests to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_LLC_PF",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x189dff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x199dff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1910ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Read Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1981ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_PF_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x1908ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.LOCAL_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x19c8ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All Requests to Remotely Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.REMOTELY_HOMED_ADDRESS",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Transactions homed remotely : Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing. : Transaction whose address resides in a remote MC",
+        "UMask": "0x15dfff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Code Read/Prefetch Requests from a Remote Socket",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_CODE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : CRd Requests",
+        "UMask": "0x1a10ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Data Read/Prefetch Requests from a Remote Socket",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_DATA_RD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed - this includes code, data, prefetches and hints coming from L2.  This has numerous filters available.  Note the non-standard filtering equation.  This event will count requests that lookup the cache multiple times with multiple increments.  One must ALWAYS set umask bit 0 and select a state or states to match.  Otherwise, the event will count nothing.   CHAFilter0[24:21,17] bits correspond to [FMESI] state. Read transactions",
+        "UMask": "0x1a01ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests/Prefetches from a Remote Socket",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : RFO Requests",
+        "UMask": "0x1a08ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Snoop Requests from a Remote Socket",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.REMOTE_SNP",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of times the LLC was accessed",
+        "UMask": "0x1c19ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: All RFO and RFO Prefetches",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.RFO",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : All RFOs - Demand and Prefetches",
+        "UMask": "0x1bc8ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: RFO Requests and RFO Prefetches to Locally Homed Memory",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.RFO_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Locally HOMed RFOs - Demand and Prefetches",
+        "UMask": "0x9c8ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Writes to Locally Homed Memory (includes writebacks from L1/L2)",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.WRITE_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Writes",
+        "UMask": "0x842ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Cache Lookups: Writes to Remotely Homed Memory (includes writebacks from L1/L2)",
+        "EventCode": "0x34",
+        "EventName": "UNC_CHA_LLC_LOOKUP.WRITE_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "Cache Lookups : Remote Writes",
+        "UMask": "0x17c2ff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : All Lines Victimized",
+        "UMask": "0xf",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : IA traffic : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.IA",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : IO traffic : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.IO",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - All Lines",
+        "UMask": "0x200f",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_E",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in E State",
+        "UMask": "0x2002",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_F",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in F State",
+        "UMask": "0x2008",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_M",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in M State",
+        "UMask": "0x2001",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.LOCAL_S",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Local - Lines in S State",
+        "UMask": "0x2004",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Remote - All Lines",
+        "UMask": "0x800f",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_E",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Remote - Lines in E State",
+        "UMask": "0x8002",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_M",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Remote - Lines in M State",
+        "UMask": "0x8001",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Lines Victimized : Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.REMOTE_S",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Remote - Lines in S State",
+        "UMask": "0x8004",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_E",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Lines in E state",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_M",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Lines in M state",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the number of lines that were victimized on a fill.  This can be filtered by the state that the line was in.",
+        "EventCode": "0x37",
+        "EventName": "UNC_CHA_LLC_VICTIMS.TOTAL_S",
+        "PerPkg": "1",
+        "PublicDescription": "Lines Victimized : Lines in S State",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts when a RFO (the Read for Ownership issued before a  write) request hit a cacheline in the S (Shared) state.",
+        "EventCode": "0x39",
+        "EventName": "UNC_CHA_MISC.RFO_HIT_S",
+        "PerPkg": "1",
+        "PublicDescription": "Cbo Misc : RFO HitS",
+        "UMask": "0x8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Local InvItoE : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.LOCAL_INVITOE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Local Rd : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.LOCAL_READ",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Off : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.OFF_PWRHEURISTIC",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : Remote Rd : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.REMOTE_READ",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "OSB Snoop Broadcast : RFO HitS Snoop Broadcast : Count of OSB snoop broadcasts. Counts by 1 per request causing OSB snoops to be broadcast. Does not count all the snoops generated by OSB.",
+        "EventCode": "0x55",
+        "EventName": "UNC_CHA_OSB.RFO_HITS_SNP_BCAST",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.ALLOC_EXCLUSIVE",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.ALLOC_EXCLUSIVE",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.ALLOC_SHARED",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.ALLOC_SHARED",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.DEALLOC_EVCTCLN",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.DEALLOC_EVCTCLN",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.DIRBACKED_ONLY",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.DIRBACKED_ONLY",
+        "PerPkg": "1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.HIT_EXCLUSIVE",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.HIT_EXCLUSIVE",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.HIT_SHARED",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.HIT_SHARED",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.INCLUSIVE_ONLY",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.INCLUSIVE_ONLY",
+        "PerPkg": "1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.MISS",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.MISS",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.UPDATE_EXCLUSIVE",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.UPDATE_EXCLUSIVE",
+        "PerPkg": "1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.UPDATE_SHARED",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.UPDATE_SHARED",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.VICTIM_EXCLUSIVE",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.VICTIM_EXCLUSIVE",
+        "PerPkg": "1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UNC_CHA_REMOTE_SF.VICTIM_SHARED",
+        "EventCode": "0x69",
+        "EventName": "UNC_CHA_REMOTE_SF.VICTIM_SHARED",
+        "PerPkg": "1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE",
+        "PerPkg": "1",
+        "PublicDescription": "HA Read and Write Requests : InvalItoE",
+        "UMask": "0x30",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the total number of requests coming from a remote socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE_REMOTE",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts read requests made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a  write) .",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS",
+        "PerPkg": "1",
+        "PublicDescription": "HA Read and Write Requests : Reads",
+        "UMask": "0x3",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts read requests coming from a unit on this socket made into this CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a  write).",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts read requests coming from a remote socket made into the CHA. Reads include all read opcodes (including RFO: the Read for Ownership issued before a  write).",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS_REMOTE",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts write requests made into the CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES",
+        "PerPkg": "1",
+        "PublicDescription": "HA Read and Write Requests : Writes",
+        "UMask": "0xc",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts  write requests coming from a unit on this socket made into this CHA, including streaming, evictions, HitM (Reads from another core to a Modified cacheline), etc.",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Counts the total number of read requests made into the Home Agent. Reads include all read opcodes (including RFO).  Writes include all writes (streaming, evictions, HitM, etc).",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES_REMOTE",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR Inserts",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All",
+        "UMask": "0xc001ffff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlush transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_CLFLUSH",
+        "PerPkg": "1",
+        "UMask": "0x78c8c7fd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "FsRdCur transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_FSRDCUR",
+        "PerPkg": "1",
+        "UMask": "0x78c8effd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "FsRdCurPtl transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_FSRDCURPTL",
+        "PerPkg": "1",
+        "UMask": "0x78c9effd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_ITOM",
+        "PerPkg": "1",
+        "UMask": "0x78cc47fd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMWr transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_ITOMWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc4ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "MemPushWr transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_MEMPUSHWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc6ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCiL transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_WCIL",
+        "PerPkg": "1",
+        "UMask": "0x78c86ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WcilF transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_WCILF",
+        "PerPkg": "1",
+        "UMask": "0x78c867fd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WiL transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_HIT_WIL",
+        "PerPkg": "1",
+        "UMask": "0x78c87ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlush transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_CLFLUSH",
+        "PerPkg": "1",
+        "UMask": "0x78c8c7fe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "FsRdCur transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_FSRDCUR",
+        "PerPkg": "1",
+        "UMask": "0x78c8effe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "FsRdCurPtl transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_FSRDCURPTL",
+        "PerPkg": "1",
+        "UMask": "0x78c9effe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_ITOM",
+        "PerPkg": "1",
+        "UMask": "0x78cc47fe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMWr transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_ITOMWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc4ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "MemPushWr transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_MEMPUSHWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc6ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCiL transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_WCIL",
+        "PerPkg": "1",
+        "UMask": "0x78c86ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WcilF transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_WCILF",
+        "PerPkg": "1",
+        "UMask": "0x78c867fe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WiL transactions from a CXL device which miss the L3.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.CXL_MISS_WIL",
+        "PerPkg": "1",
+        "UMask": "0x78c87ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests from IA Cores",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from iA Cores",
+        "UMask": "0xc001ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlush events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CLFlushes issued by iA Cores",
+        "UMask": "0xc8c7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlushOpt events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CLFLUSHOPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CLFlushOpts issued by iA Cores",
+        "UMask": "0xc8d7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRDs issued by iA Cores",
+        "UMask": "0xc80fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts; Code read prefetch from local IA that misses in the snoop filter",
+        "UMask": "0xc88fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores",
+        "UMask": "0xc827ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores",
+        "UMask": "0xc8a7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests from IA Cores which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from iA Cores that Hit the LLC",
+        "UMask": "0xc001fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Hit the LLC",
+        "UMask": "0xc80ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read prefetch from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc88ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All requests issued from IA cores to CXL accelerator memory regions that hit the LLC.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c0018101",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opts issued by iA Cores that hit the LLC",
+        "UMask": "0xc827fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt prefetch from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc8a7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM requests from local IA cores that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Hit LLC",
+        "UMask": "0xcc47fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch code read from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that hit the LLC",
+        "UMask": "0xcccffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch data read from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that hit the LLC",
+        "UMask": "0xccd7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch read for ownership from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that hit the LLC",
+        "UMask": "0xccc7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc807fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_HIT_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc887fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores",
+        "UMask": "0xcc47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNear requests from local IA cores",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears issued by iA Cores",
+        "UMask": "0xcd47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch code read from local IA.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores",
+        "UMask": "0xcccfff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch data read from local IA.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores",
+        "UMask": "0xccd7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores",
+        "UMask": "0xccc7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests from IA Cores which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from iA Cores that Missed the LLC",
+        "UMask": "0xc001fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRds issued by iA Cores that Missed the LLC",
+        "UMask": "0xc80ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CRDs from local IA cores to locally homed memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc80efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc88ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CRD Prefetches from local IA cores to locally homed memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc88efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CRD Prefetches from local IA cores to remotely homed memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_PREF_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc88f7e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CRDs from local IA cores to remotely homed memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CRD_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CRd issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc80f7e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All requests issued from IA cores to CXL accelerator memory regions that miss the LLC.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c0018201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "DRds and equivalent opcodes issued from an IA core which miss the L3 and target memory in a CXL type 2 memory expander card.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_CXL_ACC",
+        "PerPkg": "1",
+        "PublicDescription": "DRds issued from an IA core which miss the L3 and target memory in a CXL type 2 memory expander card.",
+        "UMask": "0x10c8178201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that missed the LLC",
+        "UMask": "0xc827fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRd_Opt, and which target local memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc826fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRD_PREF_OPT, and target local memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a6fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRD_PREF_OPT, and target remote memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_PREF_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a77e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Inserts into the TOR from local IA cores which miss the LLC and snoop filter with the opcode DRd_Opt, and target remote memory",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_OPT_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : DRd_Opt issued by iA Cores that missed the LLC",
+        "UMask": "0xc8277e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "L2 data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8978201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM requests from local IA cores that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by iA Cores that Missed LLC",
+        "UMask": "0xcc47fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch code read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefCode issued by iA Cores that missed the LLC",
+        "UMask": "0xcccffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch data read from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefData issued by iA Cores that missed the LLC",
+        "UMask": "0xccd7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "LLC data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFDATA_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10ccd78201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : LLCPrefRFO issued by iA Cores that missed the LLC",
+        "UMask": "0xccc7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "L2 RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LLCPREFRFO_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8878201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc8668601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to locally homed PMM addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCILF_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed locally",
+        "UMask": "0xc8668a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc86e8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to locally homed PMM addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_LOCAL_WCIL_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed locally",
+        "UMask": "0xc86e8a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to remotely homed DDR addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely",
+        "UMask": "0xc8670601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to remotely homed PMM addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCILF_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely",
+        "UMask": "0xc8670a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to remotely homed DDR addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely",
+        "UMask": "0xc86f0601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to remotely homed PMM addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_REMOTE_WCIL_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely",
+        "UMask": "0xc86f0a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc807fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RFOs issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8078201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc806fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc887fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "LLC RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10ccc78201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc886fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_PREF_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc8877e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_RFO_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc8077e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UCRDF requests from local IA cores that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_UCRDF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : UCRdFs issued by iA Cores that Missed LLC",
+        "UMask": "0xc877de01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from a local IA core that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc86ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA core that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores that Missed the LLC",
+        "UMask": "0xc867fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc8678601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA cores to PMM homed addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCILF_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLFs issued by iA Cores targeting PMM that missed the LLC",
+        "UMask": "0xc8678a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc86f8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from a local IA core to PMM homed addresses that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WCIL_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores targeting PMM that missed the LLC",
+        "UMask": "0xc86f8a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WIL requests from local IA cores that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_MISS_WIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WiLs issued by iA Cores that Missed LLC",
+        "UMask": "0xc87fde01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by iA Cores",
+        "UMask": "0xc807ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFO_Prefs issued by iA Cores",
+        "UMask": "0xc887ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "SpecItoM events that are initiated from the Core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_SPECITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : SpecItoMs issued by iA Cores",
+        "UMask": "0xcc57ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbEFtoEs issued by iA Cores.  (Non Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc3fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbEFtoIs issued by iA Cores .  (Non Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBEFTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc37ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbMtoEs issued by iA Cores .  (Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc2fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbMtoI requests from local IA cores",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WbMtoIs issued by iA Cores",
+        "UMask": "0xcc27ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WbStoIs issued by iA Cores .  (Non Modified Write Backs)",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WBSTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc67ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCIL requests from a local IA core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLs issued by iA Cores",
+        "UMask": "0xc86fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WCILF requests from local IA core",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IA_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WCiLF issued by iA Cores",
+        "UMask": "0xc867ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR inserts from local IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from IO Devices",
+        "UMask": "0xc001ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CLFlush requests from IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : CLFlushes issued by IO Devices",
+        "UMask": "0xc8c3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR inserts from local IO devices which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from IO Devices that hit the LLC",
+        "UMask": "0xc001fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMs from local IO devices which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "UMask": "0xcd43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "PCIRDCURs issued by IO devices which hit the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that hit the LLC",
+        "UMask": "0xc8f3fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RFOs from local IO devices which hit the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that hit the LLC",
+        "UMask": "0xc803fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR ItoM inserts from local IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices",
+        "UMask": "0xcc43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "UMask": "0xcd43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR inserts from local IO devices which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All requests from IO Devices that missed the LLC",
+        "UMask": "0xc001fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR ItoM inserts from local IO devices which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNear transactions from an IO device on the local socket that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd42fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoMCacheNear transactions from an IO device on a remote socket that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd437e04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM transactions from an IO device on the local socket that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc42fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "ItoM transactions from an IO device on a remote socket that miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc437e04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "PCIRDCURs issued by IO devices which miss the LLC",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices that missed the LLC",
+        "UMask": "0xc8f3fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All TOR RFO inserts from local IO devices which miss the cache",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by IO Devices that missed the LLC",
+        "UMask": "0xc803fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "PCIRDCURs issued by IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : PCIRdCurs issued by IO Devices",
+        "UMask": "0xc8f3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RFOs from local IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : RFOs issued by IO Devices",
+        "UMask": "0xc803ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "WBMtoI requests from IO devices",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.IO_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : WbMtoIs issued by IO Devices",
+        "UMask": "0xcc23ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Inserts for SF or LLC Evictions",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LLC_OR_SF_EVICTIONS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR allocation occurred as a result of SF/LLC evictions (came from the ISMQ)",
+        "UMask": "0xc001ff02",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All locally initiated requests",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LOC_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All from Local iA and IO",
+        "UMask": "0xc000ff05",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All from Local iA",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LOC_IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All from Local iA",
+        "UMask": "0xc000ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All from Local IO",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.LOC_IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All from Local IO",
+        "UMask": "0xc000ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All remote requests (e.g. snoops, writebacks) that came from remote sockets",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.REM_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All Remote Requests",
+        "UMask": "0xc001ffc8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "All snoops to this LLC that came from remote sockets",
+        "EventCode": "0x35",
+        "EventName": "UNC_CHA_TOR_INSERTS.REM_SNPS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Inserts : All Snoops from Remote",
+        "UMask": "0xc001ff08",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Occupancy for all TOR entries",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All",
+        "UMask": "0xc001ffff",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlush transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_CLFLUSH",
+        "PerPkg": "1",
+        "UMask": "0x78c8c7fd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for FsRdCur transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_FSRDCUR",
+        "PerPkg": "1",
+        "UMask": "0x78c8effd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for FsRdCurPtl transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_FSRDCURPTL",
+        "PerPkg": "1",
+        "UMask": "0x78c9effd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_ITOM",
+        "PerPkg": "1",
+        "UMask": "0x78cc47fd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMWr transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_ITOMWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc4ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for MemPushWr transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_MEMPUSHWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc6ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCiL transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_WCIL",
+        "PerPkg": "1",
+        "UMask": "0x78c86ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WcilF transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_WCILF",
+        "PerPkg": "1",
+        "UMask": "0x78c867fd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WiL transactions from a CXL device which hit in the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_HIT_WIL",
+        "PerPkg": "1",
+        "UMask": "0x78c87ffd20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlush transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_CLFLUSH",
+        "PerPkg": "1",
+        "UMask": "0x78c8c7fe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for FsRdCur transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_FSRDCUR",
+        "PerPkg": "1",
+        "UMask": "0x78c8effe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for FsRdCurPtl transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_FSRDCURPTL",
+        "PerPkg": "1",
+        "UMask": "0x78c9effe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_ITOM",
+        "PerPkg": "1",
+        "UMask": "0x78cc47fe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMWr transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_ITOMWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc4ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for MemPushWr transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_MEMPUSHWR",
+        "PerPkg": "1",
+        "UMask": "0x78cc6ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCiL transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_WCIL",
+        "PerPkg": "1",
+        "UMask": "0x78c86ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WcilF transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_WCILF",
+        "PerPkg": "1",
+        "UMask": "0x78c867fe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WiL transactions from a CXL device which miss the L3.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.CXL_MISS_WIL",
+        "PerPkg": "1",
+        "UMask": "0x78c87ffe20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from iA Cores",
+        "UMask": "0xc001ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlush events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CLFlushes issued by iA Cores",
+        "UMask": "0xc8c7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlushOpt events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CLFLUSHOPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CLFlushOpts issued by iA Cores",
+        "UMask": "0xc8d7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRDs issued by iA Cores",
+        "UMask": "0xc80fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy; Code read prefetch from local IA that misses in the snoop filter",
+        "UMask": "0xc88fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores",
+        "UMask": "0xc827ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores",
+        "UMask": "0xc8a7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from iA Cores that Hit the LLC",
+        "UMask": "0xc001fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Hit the LLC",
+        "UMask": "0xc80ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc88ffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All requests issued from IA cores to CXL accelerator memory regions that hit the LLC.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c0018101",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opts issued by iA Cores that hit the LLC",
+        "UMask": "0xc827fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that hit the LLC",
+        "UMask": "0xc8a7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Hit LLC",
+        "UMask": "0xcc47fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that hit the LLC",
+        "UMask": "0xcccffd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that hit the LLC",
+        "UMask": "0xccd7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that hit the LLC",
+        "UMask": "0xccc7fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc807fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_HIT_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Hit the LLC",
+        "UMask": "0xc887fd01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores",
+        "UMask": "0xcc47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNear requests from local IA cores",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears issued by iA Cores",
+        "UMask": "0xcd47ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores",
+        "UMask": "0xcccfff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores",
+        "UMask": "0xccd7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores",
+        "UMask": "0xccc7ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests from IA Cores which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from iA Cores that Missed the LLC",
+        "UMask": "0xc001fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRds issued by iA Cores that Missed the LLC",
+        "UMask": "0xc80ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CRDs from local IA cores to locally homed memory",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc80efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Code read prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc88ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CRD Prefetches from local IA cores to locally homed memory",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc88efe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CRD Prefetches from local IA cores to remotely homed memory",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_PREF_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd_Prefs issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc88f7e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CRDs from local IA cores to remotely homed memory",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CRD_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CRd issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc80f7e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All requests issued from IA cores to CXL accelerator memory regions that miss the LLC.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c0018201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for DRds and equivalent opcodes issued from an IA core which miss the L3 and target memory in a CXL type 2 memory expander card.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8178201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt issued by iA Cores that missed the LLC",
+        "UMask": "0xc827fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Data read opt prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_OPT_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : DRd_Opt_Prefs issued by iA Cores that missed the LLC",
+        "UMask": "0xc8a7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for L2 data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PREF_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8978201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM requests from local IA cores that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by iA Cores that Missed LLC",
+        "UMask": "0xcc47fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch code read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFCODE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefCode issued by iA Cores that missed the LLC",
+        "UMask": "0xcccffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch data read from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefData issued by iA Cores that missed the LLC",
+        "UMask": "0xccd7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for LLC data prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFDATA_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10ccd78201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Last level cache prefetch read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : LLCPrefRFO issued by iA Cores that missed the LLC",
+        "UMask": "0xccc7fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for L2 RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LLCPREFRFO_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8878201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc8668601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to locally homed PMM addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCILF_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed locally",
+        "UMask": "0xc8668a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to locally homed DDR addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed locally",
+        "UMask": "0xc86e8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to locally homed PMM addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_LOCAL_WCIL_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed locally",
+        "UMask": "0xc86e8a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to remotely homed DDR addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely",
+        "UMask": "0xc8670601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to remotely homed PMM addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCILF_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely",
+        "UMask": "0xc8670a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to remotely homed DDR addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC - HOMed remotely",
+        "UMask": "0xc86f0601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to remotely homed PMM addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_REMOTE_WCIL_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting PMM that missed the LLC - HOMed remotely",
+        "UMask": "0xc86f0a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc807fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RFOs issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10c8078201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc806fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc887fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for LLC RFO prefetches issued from an IA core which miss the L3 and target memory in a CXL type 2 accelerator.",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_CXL_ACC",
+        "PerPkg": "1",
+        "UMask": "0x10ccc78201",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed locally",
+        "UMask": "0xc886fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_PREF_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc8877e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_RFO_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores that Missed the LLC - HOMed remotely",
+        "UMask": "0xc8077e01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for UCRDF requests from local IA cores that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_UCRDF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : UCRdFs issued by iA Cores that Missed LLC",
+        "UMask": "0xc877de01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores that Missed the LLC",
+        "UMask": "0xc86ffe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA core that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores that Missed the LLC",
+        "UMask": "0xc867fe01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc8678601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA cores to PMM homed addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCILF_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLFs issued by iA Cores targeting PMM that missed the LLC",
+        "UMask": "0xc8678a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from local IA cores to DDR homed addresses which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL_DDR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting DDR that missed the LLC",
+        "UMask": "0xc86f8601",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core to PMM homed addresses that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WCIL_PMM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores targeting PMM that missed the LLC",
+        "UMask": "0xc86f8a01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WIL requests from local IA cores that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_WIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WiLs issued by iA Cores that Missed LLC",
+        "UMask": "0xc87fde01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by iA Cores",
+        "UMask": "0xc807ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for Read for ownership prefetch from local IA that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_RFO_PREF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFO_Prefs issued by iA Cores",
+        "UMask": "0xc887ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for SpecItoM events that are initiated from the Core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_SPECITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : SpecItoMs issued by iA Cores",
+        "UMask": "0xcc57ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WbMtoI requests from local IA cores",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WbMtoIs issued by iA Cores",
+        "UMask": "0xcc27ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCIL requests from a local IA core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCIL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLs issued by iA Cores",
+        "UMask": "0xc86fff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WCILF requests from local IA core",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IA_WCILF",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WCiLF issued by iA Cores",
+        "UMask": "0xc867ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from IO Devices",
+        "UMask": "0xc001ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for CLFlush requests from IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_CLFLUSH",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : CLFlushes issued by IO Devices",
+        "UMask": "0xc8c3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from IO Devices that hit the LLC",
+        "UMask": "0xc001fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMs from local IO devices which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that Hit the LLC",
+        "UMask": "0xcc43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that hit the LLC",
+        "UMask": "0xcd43fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which hit the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that hit the LLC",
+        "UMask": "0xc8f3fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RFOs from local IO devices which hit the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_HIT_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that hit the LLC",
+        "UMask": "0xc803fd04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices",
+        "UMask": "0xcc43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices",
+        "UMask": "0xcd43ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR inserts from local IO devices which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All requests from IO Devices that missed the LLC",
+        "UMask": "0xc001fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR ItoM inserts from local IO devices which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd43fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNear transactions from an IO device on the local socket that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd42fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoMCacheNear transactions from an IO device on a remote socket that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOMCACHENEAR_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMCacheNears, indicating a partial write request, from IO Devices that missed the LLC",
+        "UMask": "0xcd437e04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM transactions from an IO device on the local socket that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc42fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for ItoM transactions from an IO device on a remote socket that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_ITOM_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : ItoMs issued by IO Devices that missed the LLC",
+        "UMask": "0xcc437e04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices which miss the LLC",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC",
+        "UMask": "0xc8f3fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCUR transactions from an IO device on the local socket that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC",
+        "UMask": "0xc8f2fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCUR transactions from an IO device on a remote socket that miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_PCIRDCUR_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices that missed the LLC",
+        "UMask": "0xc8f37e04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All TOR RFO inserts from local IO devices which miss the cache",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_MISS_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices that missed the LLC",
+        "UMask": "0xc803fe04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for PCIRDCURs issued by IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_PCIRDCUR",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : PCIRdCurs issued by IO Devices",
+        "UMask": "0xc8f3ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for RFOs from local IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_RFO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : RFOs issued by IO Devices",
+        "UMask": "0xc803ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for WBMtoI requests from IO devices",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.IO_WBMTOI",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : WbMtoIs issued by IO Devices",
+        "UMask": "0xcc23ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All locally initiated requests",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All from Local iA and IO",
+        "UMask": "0xc000ff05",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All from Local iA",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IA",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All from Local iA",
+        "UMask": "0xc000ff01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All from Local IO",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.LOC_IO",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All from Local IO",
+        "UMask": "0xc000ff04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All remote requests (e.g. snoops, writebacks) that came from remote sockets",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.REM_ALL",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All Remote Requests",
+        "UMask": "0xc001ffc8",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "TOR Occupancy for All snoops to this LLC that came from remote sockets",
+        "EventCode": "0x36",
+        "EventName": "UNC_CHA_TOR_OCCUPANCY.REM_SNPS",
+        "PerPkg": "1",
+        "PublicDescription": "TOR Occupancy : All Snoops from Remote",
+        "UMask": "0xc001ff08",
+        "Unit": "CHA"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cxl.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cxl.json
new file mode 100644
index 000000000000..dc676c7aa37f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-cxl.json
@@ -0,0 +1,10 @@
+[
+    {
+        "BriefDescription": "B2CXL Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_B2CXL_CLOCKTICKS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "Unit": "B2CXL"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json
new file mode 100644
index 000000000000..6932b2fea3a5
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json
@@ -0,0 +1,1228 @@
+[
+    {
+        "BriefDescription": "Clockticks of the mesh to memory (B2CMI)",
+        "EventCode": "0x01",
+        "EventName": "UNC_B2CMI_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of time D2C was not honoured by egress due to directory state constraints",
+        "EventCode": "0x17",
+        "EventName": "UNC_B2CMI_DIRECT2CORE_NOT_TAKEN_DIRSTATE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of times B2CMI egress did D2C (direct to core)",
+        "EventCode": "0x16",
+        "EventName": "UNC_B2CMI_DIRECT2CORE_TAKEN",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of times D2C wasn't honoured even though the incoming request had d2c set for non cisgress txn",
+        "EventCode": "0x18",
+        "EventName": "UNC_B2CMI_DIRECT2CORE_TXN_OVERRIDE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of d2k wasn't done due to credit constraints",
+        "EventCode": "0x1B",
+        "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_CREDITS",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Direct to UPI Transactions - Ignored due to lack of credits : All : Counts the number of d2k wasn't done due to credit constraints",
+        "EventCode": "0x1B",
+        "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_CREDITS.EGRESS",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of time D2K was not honoured by egress due to directory state constraints",
+        "EventCode": "0x1A",
+        "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_DIRSTATE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Cycles when Direct2UPI was Disabled : Egress Ignored D2U : Counts the number of time D2K was not honoured by egress due to directory state constraints",
+        "EventCode": "0x1A",
+        "EventName": "UNC_B2CMI_DIRECT2UPI_NOT_TAKEN_DIRSTATE.EGRESS",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of times egress did D2K (Direct to KTI)",
+        "EventCode": "0x19",
+        "EventName": "UNC_B2CMI_DIRECT2UPI_TAKEN",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of times D2K wasn't honoured even though the incoming request had d2k set for non cisgress txn",
+        "EventCode": "0x1C",
+        "EventName": "UNC_B2CMI_DIRECT2UPI_TXN_OVERRIDE",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit Clean",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x38",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit : On NonDirty Line in A State",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN_A",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit : On NonDirty Line in I State",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN_I",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit : On NonDirty Line in S State",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.CLEAN_S",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit Dirty (modified)",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY",
+        "PerPkg": "1",
+        "UMask": "0x7",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit : On Dirty Line in A State",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY_A",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit : On Dirty Line in I State",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY_I",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Hit : On Dirty Line in S State",
+        "EventCode": "0x1D",
+        "EventName": "UNC_B2CMI_DIRECTORY_HIT.DIRTY_S",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with any directory to non persistent memory",
+        "EventCode": "0x20",
+        "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.ANY",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory A to non persistent memory",
+        "EventCode": "0x20",
+        "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.STATE_A",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory I to non persistent memory",
+        "EventCode": "0x20",
+        "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.STATE_I",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the number of 1lm or 2lm hit read data returns to egress with directory S to non persistent memory",
+        "EventCode": "0x20",
+        "EventName": "UNC_B2CMI_DIRECTORY_LOOKUP.STATE_S",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of 1lm or 2lm hit read  data returns to egress with directory S to non persistent memory",
+        "UMask": "0x4",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss Clean",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x38",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss : On NonDirty Line in A State",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN_A",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss : On NonDirty Line in I State",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN_I",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss : On NonDirty Line in S State",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.CLEAN_S",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss Dirty (modified)",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY",
+        "PerPkg": "1",
+        "UMask": "0x7",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss : On Dirty Line in A State",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY_A",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss : On Dirty Line in I State",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY_I",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory Miss : On Dirty Line in S State",
+        "EventCode": "0x1E",
+        "EventName": "UNC_B2CMI_DIRECTORY_MISS.DIRTY_S",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Any A2I Transition",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.A2I",
+        "PerPkg": "1",
+        "UMask": "0x320",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Any A2S Transition",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.A2S",
+        "PerPkg": "1",
+        "UMask": "0x340",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts cisgress directory updates",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.ANY",
+        "PerPkg": "1",
+        "UMask": "0x301",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts any 1lm or 2lm hit data return that would result in directory update to non persistent memory (DRAM)",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_ANY",
+        "PerPkg": "1",
+        "UMask": "0x101",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update in near memory to the A state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_X2A",
+        "PerPkg": "1",
+        "UMask": "0x114",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update in near memory to the I state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_X2I",
+        "PerPkg": "1",
+        "UMask": "0x128",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update in near memory to the S state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.HIT_X2S",
+        "PerPkg": "1",
+        "UMask": "0x142",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Any I2A Transition",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.I2A",
+        "PerPkg": "1",
+        "UMask": "0x304",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Any I2S Transition",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.I2S",
+        "PerPkg": "1",
+        "UMask": "0x302",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update in far memory to the A state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.MISS_X2A",
+        "PerPkg": "1",
+        "UMask": "0x214",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update in far memory to the I state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.MISS_X2I",
+        "PerPkg": "1",
+        "UMask": "0x228",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update in far memory to the S state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.MISS_X2S",
+        "PerPkg": "1",
+        "UMask": "0x242",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Any S2A Transition",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.S2A",
+        "PerPkg": "1",
+        "UMask": "0x310",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Any S2I Transition",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.S2I",
+        "PerPkg": "1",
+        "UMask": "0x308",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update to the A state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.X2A",
+        "PerPkg": "1",
+        "UMask": "0x314",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update to the I state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.X2I",
+        "PerPkg": "1",
+        "UMask": "0x328",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Directory update to the S state",
+        "EventCode": "0x21",
+        "EventName": "UNC_B2CMI_DIRECTORY_UPDATE.X2S",
+        "PerPkg": "1",
+        "UMask": "0x342",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts any read",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.ALL",
+        "PerPkg": "1",
+        "UMask": "0x104",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts normal reads issue to CMI",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.NORMAL",
+        "PerPkg": "1",
+        "UMask": "0x101",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Count reads to NM region",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.TO_DDR_AS_CACHE",
+        "PerPkg": "1",
+        "UMask": "0x110",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts reads to 1lm non persistent memory regions",
+        "EventCode": "0x24",
+        "EventName": "UNC_B2CMI_IMC_READS.TO_DDR_AS_MEM",
+        "PerPkg": "1",
+        "UMask": "0x108",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "All Writes - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.ALL",
+        "PerPkg": "1",
+        "UMask": "0x110",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Full Non-ISOCH - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.FULL",
+        "PerPkg": "1",
+        "UMask": "0x101",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Non-Inclusive - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.NI",
+        "PerPkg": "1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Non-Inclusive Miss - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.NI_MISS",
+        "PerPkg": "1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Partial Non-ISOCH - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.PARTIAL",
+        "PerPkg": "1",
+        "UMask": "0x102",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "DDR, acting as Cache - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.TO_DDR_AS_CACHE",
+        "PerPkg": "1",
+        "UMask": "0x140",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "DDR - All Channels",
+        "EventCode": "0x25",
+        "EventName": "UNC_B2CMI_IMC_WRITES.TO_DDR_AS_MEM",
+        "PerPkg": "1",
+        "UMask": "0x120",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Inserts : UPI - Ch 0",
+        "EventCode": "0x56",
+        "EventName": "UNC_B2CMI_PREFCAM_INSERTS.CH0_UPI",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Inserts : XPT - Ch 0",
+        "EventCode": "0x56",
+        "EventName": "UNC_B2CMI_PREFCAM_INSERTS.CH0_XPT",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Inserts : UPI - All Channels",
+        "EventCode": "0x56",
+        "EventName": "UNC_B2CMI_PREFCAM_INSERTS.UPI_ALLCH",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Inserts : XPT -All Channels",
+        "EventCode": "0x56",
+        "EventName": "UNC_B2CMI_PREFCAM_INSERTS.XPT_ALLCH",
+        "PerPkg": "1",
+        "PublicDescription": "Prefetch CAM Inserts : XPT - All Channels",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Prefetch CAM Occupancy : Channel 0",
+        "EventCode": "0x54",
+        "EventName": "UNC_B2CMI_PREFCAM_OCCUPANCY.CH0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm reads and WRNI which were a hit",
+        "EventCode": "0x1F",
+        "EventName": "UNC_B2CMI_TAG_HIT.ALL",
+        "PerPkg": "1",
+        "UMask": "0xf",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm reads which were a hit clean",
+        "EventCode": "0x1F",
+        "EventName": "UNC_B2CMI_TAG_HIT.RD_CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm reads which were a hit dirty",
+        "EventCode": "0x1F",
+        "EventName": "UNC_B2CMI_TAG_HIT.RD_DIRTY",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm WRNI which were a hit clean",
+        "EventCode": "0x1F",
+        "EventName": "UNC_B2CMI_TAG_HIT.WR_CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm WRNI which were a hit dirty",
+        "EventCode": "0x1F",
+        "EventName": "UNC_B2CMI_TAG_HIT.WR_DIRTY",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm second way read miss for a WrNI",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x5",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm second way read miss for a WrNI",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.DIRTY",
+        "PerPkg": "1",
+        "UMask": "0xa",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm second way read miss for a Rd",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.RD_2WAY",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm reads which were a miss and the cache line is unmodified",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.RD_CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm reads which were a miss and the cache line is modified",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.RD_DIRTY",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm second way read miss for a WrNI",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.WR_2WAY",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm WRNI which were a miss and the cache line is unmodified",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.WR_CLEAN",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Counts the 2lm WRNI which were a miss and the cache line is modified",
+        "EventCode": "0x4B",
+        "EventName": "UNC_B2CMI_TAG_MISS.WR_DIRTY",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Tracker Inserts : Channel 0",
+        "EventCode": "0x32",
+        "EventName": "UNC_B2CMI_TRACKER_INSERTS.CH0",
+        "PerPkg": "1",
+        "UMask": "0x104",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Tracker Occupancy : Channel 0",
+        "EventCode": "0x33",
+        "EventName": "UNC_B2CMI_TRACKER_OCCUPANCY.CH0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "Write Tracker Inserts : Channel 0",
+        "EventCode": "0x40",
+        "EventName": "UNC_B2CMI_WR_TRACKER_INSERTS.CH0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2CMI"
+    },
+    {
+        "BriefDescription": "UNC_B2HOT_CLOCKTICKS",
+        "EventCode": "0x01",
+        "EventName": "UNC_B2HOT_CLOCKTICKS",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "B2HOT"
+    },
+    {
+        "BriefDescription": "Number of uclks in domain",
+        "EventCode": "0x01",
+        "EventName": "UNC_B2UPI_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "B2UPI"
+    },
+    {
+        "BriefDescription": "Total Write Cache Occupancy : Mem",
+        "EventCode": "0x0F",
+        "EventName": "UNC_I_CACHE_TOTAL_OCCUPANCY.MEM",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "IRP Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_I_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "Inbound read requests received by the IRP and inserted into the FAF queue",
+        "EventCode": "0x18",
+        "EventName": "UNC_I_FAF_INSERTS",
+        "PerPkg": "1",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "FAF occupancy",
+        "EventCode": "0x19",
+        "EventName": "UNC_I_FAF_OCCUPANCY",
+        "PerPkg": "1",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "Misc Events - Set 1 : Lost Forward : Snoop pulled away ownership before a write was committed",
+        "EventCode": "0x1F",
+        "EventName": "UNC_I_MISC1.LOST_FWD",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "Inbound write (fast path) requests to coherent memory, received by the IRP resulting in write ownership requests issued by IRP to the mesh.",
+        "EventCode": "0x11",
+        "EventName": "UNC_I_TRANSACTIONS.WR_PREF",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "IRP"
+    },
+    {
+        "BriefDescription": "MDF Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_MDF_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "MDF"
+    },
+    {
+        "BriefDescription": "Number of UPI LL clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_UPI_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "Number of kfclks",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Cycles in L1 : Number of UPI qfclk cycles spent in L1 power mode.  L1 is a mode that totally shuts down a UPI link.  Use edge detect to count the number of instances when the UPI link entered L1.  Link power states are per link and per direction, so for example the Tx direction could be in one state while Rx was in another. Because L1 totally shuts down the link, it takes a good amount of time to exit this mode.",
+        "EventCode": "0x21",
+        "EventName": "UNC_UPI_L1_POWER_CYCLES",
+        "PerPkg": "1",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB",
+        "PerPkg": "1",
+        "UMask": "0xe",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Bypass, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCB_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10e",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS",
+        "PerPkg": "1",
+        "UMask": "0xf",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Non-Coherent Standard, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.NCS_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10f",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Request",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Request, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.REQ_OPC",
+        "PerPkg": "1",
+        "UMask": "0x108",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Response - Conflict",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPCNFLT",
+        "PerPkg": "1",
+        "UMask": "0x1aa",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Response - Invalid",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSPI",
+        "PerPkg": "1",
+        "UMask": "0x12a",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Response - Data",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA",
+        "PerPkg": "1",
+        "UMask": "0xc",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Response - Data, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_DATA_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10c",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Response - No Data",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA",
+        "PerPkg": "1",
+        "UMask": "0xa",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Response - No Data, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.RSP_NODATA_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10a",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Snoop",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP",
+        "PerPkg": "1",
+        "UMask": "0x9",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Snoop, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.SNP_OPC",
+        "PerPkg": "1",
+        "UMask": "0x109",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Writeback",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB",
+        "PerPkg": "1",
+        "UMask": "0xd",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Receive path of a UPI Port : Writeback, Match Opcode",
+        "EventCode": "0x05",
+        "EventName": "UNC_UPI_RxL_BASIC_HDR_MATCH.WB_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10d",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : All Data : Shows legal flit time (hides impact of L0p and L0c).",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.ALL_DATA",
+        "PerPkg": "1",
+        "UMask": "0xf",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Null FLITs received from any slot",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.ALL_NULL",
+        "PerPkg": "1",
+        "PublicDescription": "Valid Flits Received : Null FLITs received from any slot",
+        "UMask": "0x27",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Data : Shows legal flit time (hides impact of L0p and L0c). : Count Data Flits (which consume all slots), but how much to count is based on Slot0-2 mask, so count can be 0-3 depending on which slots are enabled for counting..",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.DATA",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Idle : Shows legal flit time (hides impact of L0p and L0c).",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.IDLE",
+        "PerPkg": "1",
+        "UMask": "0x47",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : LLCRD Not Empty : Shows legal flit time (hides impact of L0p and L0c). : Enables counting of LLCRD (with non-zero payload). This only applies to slot 2 since LLCRD is only allowed in slot 2",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.LLCRD",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : LLCTRL : Shows legal flit time (hides impact of L0p and L0c). : Equivalent to an idle packet.  Enables counting of slot 0 LLCTRL messages.",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.LLCTRL",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : All Non Data : Shows legal flit time (hides impact of L0p and L0c).",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.NON_DATA",
+        "PerPkg": "1",
+        "UMask": "0x97",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Slot NULL or LLCRD Empty : Shows legal flit time (hides impact of L0p and L0c). : LLCRD with all zeros is treated as NULL. Slot 1 is not treated as NULL if slot 0 is a dual slot. This can apply to slot 0,1, or 2.",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.NULL",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Protocol Header : Shows legal flit time (hides impact of L0p and L0c). : Enables count of protocol headers in slot 0,1,2 (depending on slot uMask bits)",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.PROTHDR",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Slot 0 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 0 - Other mask bits determine types of headers to count.",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.SLOT0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Slot 1 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 1 - Other mask bits determine types of headers to count.",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.SLOT1",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Received : Slot 2 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 2 - Other mask bits determine types of headers to count.",
+        "EventCode": "0x03",
+        "EventName": "UNC_UPI_RxL_FLITS.SLOT2",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "RxQ Flit Buffer Allocations : Slot 0 : Number of allocations into the UPI Rx Flit Buffer.  Generally, when data is transmitted across UPI, it will bypass the RxQ and pass directly to the ring interface.  If things back up getting transmitted onto the ring, however, it may need to allocate into this buffer, thus increasing the latency.  This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.",
+        "EventCode": "0x30",
+        "EventName": "UNC_UPI_RxL_INSERTS.SLOT0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "RxQ Flit Buffer Allocations : Slot 1 : Number of allocations into the UPI Rx Flit Buffer.  Generally, when data is transmitted across UPI, it will bypass the RxQ and pass directly to the ring interface.  If things back up getting transmitted onto the ring, however, it may need to allocate into this buffer, thus increasing the latency.  This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.",
+        "EventCode": "0x30",
+        "EventName": "UNC_UPI_RxL_INSERTS.SLOT1",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "RxQ Flit Buffer Allocations : Slot 2 : Number of allocations into the UPI Rx Flit Buffer.  Generally, when data is transmitted across UPI, it will bypass the RxQ and pass directly to the ring interface.  If things back up getting transmitted onto the ring, however, it may need to allocate into this buffer, thus increasing the latency.  This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.",
+        "EventCode": "0x30",
+        "EventName": "UNC_UPI_RxL_INSERTS.SLOT2",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "RxQ Occupancy - All Packets : Slot 0",
+        "EventCode": "0x32",
+        "EventName": "UNC_UPI_RxL_OCCUPANCY.SLOT0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "RxQ Occupancy - All Packets : Slot 1",
+        "EventCode": "0x32",
+        "EventName": "UNC_UPI_RxL_OCCUPANCY.SLOT1",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "RxQ Occupancy - All Packets : Slot 2",
+        "EventCode": "0x32",
+        "EventName": "UNC_UPI_RxL_OCCUPANCY.SLOT2",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB",
+        "PerPkg": "1",
+        "UMask": "0xe",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Bypass, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCB_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10e",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS",
+        "PerPkg": "1",
+        "UMask": "0xf",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Non-Coherent Standard, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.NCS_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10f",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Request",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Request, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.REQ_OPC",
+        "PerPkg": "1",
+        "UMask": "0x108",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Conflict",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPCNFLT",
+        "PerPkg": "1",
+        "UMask": "0x1aa",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Invalid",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSPI",
+        "PerPkg": "1",
+        "UMask": "0x12a",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Data",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA",
+        "PerPkg": "1",
+        "UMask": "0xc",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Response - Data, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_DATA_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10c",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Response - No Data",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA",
+        "PerPkg": "1",
+        "UMask": "0xa",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Response - No Data, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.RSP_NODATA_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10a",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Snoop",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP",
+        "PerPkg": "1",
+        "UMask": "0x9",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Snoop, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.SNP_OPC",
+        "PerPkg": "1",
+        "UMask": "0x109",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Writeback",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB",
+        "PerPkg": "1",
+        "UMask": "0xd",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Matches on Transmit path of a UPI Port : Writeback, Match Opcode",
+        "EventCode": "0x04",
+        "EventName": "UNC_UPI_TxL_BASIC_HDR_MATCH.WB_OPC",
+        "PerPkg": "1",
+        "UMask": "0x10d",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : All Data : Counts number of data flits across this UPI link.",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.ALL_DATA",
+        "PerPkg": "1",
+        "UMask": "0xf",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "All Null Flits",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.ALL_NULL",
+        "PerPkg": "1",
+        "PublicDescription": "Valid Flits Sent : Idle",
+        "UMask": "0x27",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Data : Shows legal flit time (hides impact of L0p and L0c). : Count Data Flits (which consume all slots), but how much to count is based on Slot0-2 mask, so count can be 0-3 depending on which slots are enabled for counting..",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.DATA",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Idle : Shows legal flit time (hides impact of L0p and L0c).",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.IDLE",
+        "PerPkg": "1",
+        "UMask": "0x47",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : LLCRD Not Empty : Shows legal flit time (hides impact of L0p and L0c). : Enables counting of LLCRD (with non-zero payload). This only applies to slot 2 since LLCRD is only allowed in slot 2",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.LLCRD",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : LLCTRL : Shows legal flit time (hides impact of L0p and L0c). : Equivalent to an idle packet.  Enables counting of slot 0 LLCTRL messages.",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.LLCTRL",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : All Non Data : Shows legal flit time (hides impact of L0p and L0c).",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.NON_DATA",
+        "PerPkg": "1",
+        "PublicDescription": "Valid Flits Sent : Null FLITs transmitted to any slot",
+        "UMask": "0x97",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Slot NULL or LLCRD Empty : Shows legal flit time (hides impact of L0p and L0c). : LLCRD with all zeros is treated as NULL. Slot 1 is not treated as NULL if slot 0 is a dual slot. This can apply to slot 0,1, or 2.",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.NULL",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Protocol Header : Shows legal flit time (hides impact of L0p and L0c). : Enables count of protocol headers in slot 0,1,2 (depending on slot uMask bits)",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.PROTHDR",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Slot 0 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 0 - Other mask bits determine types of headers to count.",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.SLOT0",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Slot 1 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 1 - Other mask bits determine types of headers to count.",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.SLOT1",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Valid Flits Sent : Slot 2 : Shows legal flit time (hides impact of L0p and L0c). : Count Slot 2 - Other mask bits determine types of headers to count.",
+        "EventCode": "0x02",
+        "EventName": "UNC_UPI_TxL_FLITS.SLOT2",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Tx Flit Buffer Allocations : Number of allocations into the UPI Tx Flit Buffer.  Generally, when data is transmitted across UPI, it will bypass the TxQ and pass directly to the link.  However, the TxQ will be used with L0p and when LLR occurs, increasing latency to transfer out to the link.  This event can be used in conjunction with the Flit Buffer Occupancy event in order to calculate the average flit buffer lifetime.",
+        "EventCode": "0x40",
+        "EventName": "UNC_UPI_TxL_INSERTS",
+        "PerPkg": "1",
+        "Unit": "UPI"
+    },
+    {
+        "BriefDescription": "Tx Flit Buffer Occupancy : Accumulates the number of flits in the TxQ.  Generally, when data is transmitted across UPI, it will bypass the TxQ and pass directly to the link.  However, the TxQ will be used with L0p and when LLR occurs, increasing latency to transfer out to the link. This can be used with the cycles not empty event to track average occupancy, or the allocations event to track average lifetime in the TxQ.",
+        "EventCode": "0x42",
+        "EventName": "UNC_UPI_TxL_OCCUPANCY",
+        "PerPkg": "1",
+        "Unit": "UPI"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json
new file mode 100644
index 000000000000..9495cb0f68ea
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json
@@ -0,0 +1,1634 @@
+[
+    {
+        "BriefDescription": "IIO Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_IIO_CLOCKTICKS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCIE Completion Buffer Inserts.  Counts once per 64 byte read issued from this PCIE device.",
+        "EventCode": "0xC2",
+        "EventName": "UNC_IIO_COMP_BUF_INSERTS.CMPD.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff0ff",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010010",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020020",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040040",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Count of allocations in the completion buffer",
+        "EventCode": "0xD5",
+        "EventName": "UNC_IIO_COMP_BUF_OCCUPANCY.CMPD.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080080",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core reporting completion of Card read from Core DRAM",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Another card (different IIO stack) reading from this card.",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.PEER_READ.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested by the CPU : Another card (different IIO stack) writing to this card.",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.PEER_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Counts once for every 4 bytes read from this card to memory.  This event does include reads to IO.",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x10",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x20",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x40",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card reading from DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x80",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Counts once for every 4 bytes written from this card to memory.  This event does include writes to IO.",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x10",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x20",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x40",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Four byte data request of the CPU : Card writing to DRAM",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x80",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Counts once for every 4 bytes written from this card to a peer device's IO space.",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Data requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.PEER_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Hits to a 1G Page",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.1G_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x10",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Hits to a 2M Page",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.2M_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x8",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Hits to a 4K Page",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.4K_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x4",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB lookups all",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.ALL_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x2",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Context cache hits",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x80",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Context cache lookups",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.CTXT_CACHE_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x40",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB lookups first",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.FIRST_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x1",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB Fills (same as IOTLB miss)",
+        "EventCode": "0x40",
+        "EventName": "UNC_IIO_IOMMU0.MISSES",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOMMU memory access (both low and high priority)",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0xc0",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOMMU high priority memory access",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES_HIGH",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x80",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOMMU low priority memory access",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.NUM_MEM_ACCESSES_LOW",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x40",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 1G page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_1G_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x4",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 256T page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_256T_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x10",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 2M page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_2M_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x2",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache Hit to a 512G page",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_512G_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x8",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache fill",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_CACHE_FILLS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Second Level Page Walk Cache lookup",
+        "EventCode": "0x41",
+        "EventName": "UNC_IIO_IOMMU1.SLPWC_CACHE_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x1",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Cycles PWT full",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.CYC_PWT_FULL",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x2",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Interrupt Entry cache hit",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.INT_CACHE_HITS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x80",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Interrupt Entry cache lookup",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.INT_CACHE_LOOKUPS",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x40",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Context Cache invalidation events",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_CTXT_CACHE",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x8",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Interrupt Entry Cache invalidation events",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_INT_CACHE",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x20",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "IOTLB invalidation events",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_IOTLB",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x4",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PASID Cache invalidation events",
+        "EventCode": "0x43",
+        "EventName": "UNC_IIO_IOMMU3.NUM_INVAL_PASID_CACHE",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "UMask": "0x10",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Occupancy of outbound request queue : To device : Counts number of outbound requests/completions IIO is currently processing",
+        "EventCode": "0xc5",
+        "EventName": "UNC_IIO_NUM_OUSTANDING_REQ_FROM_CPU.TO_IO",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Passing data to be written",
+        "EventCode": "0x88",
+        "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.DATA",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x700f020",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Issuing final read or write of line",
+        "EventCode": "0x88",
+        "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.FINAL_RD_WR",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x700f008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Processing response from IOMMU",
+        "EventCode": "0x88",
+        "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.IOMMU_HIT",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x700f002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Issuing to IOMMU",
+        "EventCode": "0x88",
+        "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.IOMMU_REQ",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x700f001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Request Ownership",
+        "EventCode": "0x88",
+        "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.REQ_OWN",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x700f004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Writing line",
+        "EventCode": "0x88",
+        "EventName": "UNC_IIO_NUM_OUTSTANDING_REQ_OF_CPU.WR",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x700f010",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.ABORT",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff080",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.CONFINED_P2P",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff040",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.LOC_P2P",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff020",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MCAST",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MEM",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.MSGB",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.REM_P2P",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff010",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "-",
+        "EventCode": "0x8e",
+        "EventName": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.UBOX",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "All 9 bits of Page Walk Tracker Occupancy",
+        "EventCode": "0x42",
+        "EventName": "UNC_IIO_PWT_OCCUPANCY",
+        "PerPkg": "1",
+        "PortMask": "0x000",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core reading from Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Core writing to Cards MMIO space",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Another card (different IIO stack) reading from this card.",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.PEER_READ.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested by the CPU : Another card (different IIO stack) writing to this card.",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.PEER_WRITE.ALL_PARTS",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x0FF",
+        "UMask": "0x70ff002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080004",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to DRAM",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080001",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card reading from another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_READ.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080008",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x001",
+        "UMask": "0x7001002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x002",
+        "UMask": "0x7002002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x004",
+        "UMask": "0x7004002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x008",
+        "UMask": "0x7008002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART4",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x010",
+        "UMask": "0x7010002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART5",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x020",
+        "UMask": "0x7020002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART6",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x040",
+        "UMask": "0x7040002",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Number Transactions requested of the CPU : Card writing to another Card (same or different stack)",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.PEER_WRITE.PART7",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x080",
+        "UMask": "0x7080002",
+        "Unit": "IIO"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json
new file mode 100644
index 000000000000..a2405ed640c9
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json
@@ -0,0 +1,385 @@
+[
+    {
+        "BriefDescription": "DRAM Activate Count : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.ALL",
+        "PerPkg": "1",
+        "UMask": "0xf7",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Activate Count : Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.RD",
+        "PerPkg": "1",
+        "UMask": "0xf1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Activate Count : Underfill Read transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.UFILL",
+        "PerPkg": "1",
+        "UMask": "0xf4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Activate Count : Write transaction on Page Empty or Page Miss : Counts the number of DRAM Activate commands sent on this channel.  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS.  One can calculate the number of Page Misses by subtracting the number of Page Miss precharges from the number of Activates.",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_ACT_COUNT.WR",
+        "PerPkg": "1",
+        "UMask": "0xf2",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0, all CAS operations",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.ALL",
+        "PerPkg": "1",
+        "UMask": "0xff",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0, all reads",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.RD",
+        "PerPkg": "1",
+        "UMask": "0xcf",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 regular reads",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.RD_REG",
+        "PerPkg": "1",
+        "UMask": "0xc1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 underfill reads",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.RD_UNDERFILL",
+        "PerPkg": "1",
+        "UMask": "0xc4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0, all writes",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.WR",
+        "PerPkg": "1",
+        "UMask": "0xf0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 regular writes",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.WR_NONPRE",
+        "PerPkg": "1",
+        "UMask": "0xd0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 0 auto-precharge writes",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_CAS_COUNT_SCH0.WR_PRE",
+        "PerPkg": "1",
+        "UMask": "0xe0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1, all CAS operations",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.ALL",
+        "PerPkg": "1",
+        "UMask": "0xff",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1, all reads",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.RD",
+        "PerPkg": "1",
+        "UMask": "0xcf",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 regular reads",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.RD_REG",
+        "PerPkg": "1",
+        "UMask": "0xc1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 underfill reads",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.RD_UNDERFILL",
+        "PerPkg": "1",
+        "UMask": "0xc4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1, all writes",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.WR",
+        "PerPkg": "1",
+        "UMask": "0xf0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 regular writes",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.WR_NONPRE",
+        "PerPkg": "1",
+        "UMask": "0xd0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "CAS count for SubChannel 1 auto-precharge writes",
+        "EventCode": "0x06",
+        "EventName": "UNC_M_CAS_COUNT_SCH1.WR_PRE",
+        "PerPkg": "1",
+        "UMask": "0xe0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Number of DRAM DCLK clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_M_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "DRAM Clockticks",
+        "UMask": "0x1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Number of DRAM HCLK clock cycles while the event is enabled",
+        "EventCode": "0x01",
+        "EventName": "UNC_M_HCLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "DRAM Clockticks",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.ALL",
+        "PerPkg": "1",
+        "UMask": "0xff",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Precharge due to (?) : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.PGT",
+        "PerPkg": "1",
+        "UMask": "0xf8",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.RD",
+        "PerPkg": "1",
+        "UMask": "0xf1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.UFILL",
+        "PerPkg": "1",
+        "UMask": "0xf4",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "DRAM Precharge commands. : Counts the number of DRAM Precharge commands sent on this channel.",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_PRE_COUNT.WR",
+        "PerPkg": "1",
+        "UMask": "0xf2",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer inserts on subchannel 0",
+        "EventCode": "0x17",
+        "EventName": "UNC_M_RDB_INSERTS.SCH0",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer inserts on subchannel 1",
+        "EventCode": "0x17",
+        "EventName": "UNC_M_RDB_INSERTS.SCH1",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer occupancy on subchannel 0",
+        "EventCode": "0x1a",
+        "EventName": "UNC_M_RDB_OCCUPANCY_SCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read buffer occupancy on subchannel 1",
+        "EventCode": "0x1b",
+        "EventName": "UNC_M_RDB_OCCUPANCY_SCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue.  This queue is used to schedule reads out to the memory controller and to track the requests.  Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC.  They deallocate after the CAS command has been issued to memory.  This includes both ISOCH and non-ISOCH requests.",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.PCH0",
+        "PerPkg": "1",
+        "UMask": "0x50",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue Allocations : Counts the number of allocations into the Read Pending Queue.  This queue is used to schedule reads out to the memory controller and to track the requests.  Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the HA to the iMC.  They deallocate after the CAS command has been issued to memory.  This includes both ISOCH and non-ISOCH requests.",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.PCH1",
+        "PerPkg": "1",
+        "UMask": "0xa0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 0",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 0, pseudochannel 1",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH0_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 0",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue inserts for subchannel 1, pseudochannel 1",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS.SCH1_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 0",
+        "EventCode": "0x80",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 0, pseudochannel 1",
+        "EventCode": "0x81",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH0_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 0",
+        "EventCode": "0x82",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Read pending queue occupancy for subchannel 1, pseudochannel 1",
+        "EventCode": "0x83",
+        "EventName": "UNC_M_RPQ_OCCUPANCY_SCH1_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue Allocations",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.PCH0",
+        "PerPkg": "1",
+        "UMask": "0x50",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue Allocations",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.PCH1",
+        "PerPkg": "1",
+        "UMask": "0xa0",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 0",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x10",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 0, pseudochannel 1",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH0_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x20",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 0",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH0",
+        "PerPkg": "1",
+        "UMask": "0x40",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue inserts for subchannel 1, pseudochannel 1",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_WPQ_INSERTS.SCH1_PCH1",
+        "PerPkg": "1",
+        "UMask": "0x80",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 0",
+        "EventCode": "0x84",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 0, pseudochannel 1",
+        "EventCode": "0x85",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH0_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 0",
+        "EventCode": "0x86",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH0",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    },
+    {
+        "BriefDescription": "Write pending queue occupancy for subchannel 1, pseudochannel 1",
+        "EventCode": "0x87",
+        "EventName": "UNC_M_WPQ_OCCUPANCY_SCH1_PCH1",
+        "PerPkg": "1",
+        "Unit": "IMC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-power.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-power.json
new file mode 100644
index 000000000000..e3a66166e28c
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-power.json
@@ -0,0 +1,10 @@
+[
+    {
+        "BriefDescription": "PCU Clockticks",
+        "EventCode": "0x01",
+        "EventName": "UNC_P_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "PCU Clockticks:  The PCU runs off a fixed 1 GHz clock.  This event counts the number of pclk cycles measured while the counter was enabled.  The pclk, like the Memory Controller's dclk, counts at a constant rate making it a good measure of actual wall time.",
+        "Unit": "PCU"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json
index bd5f2b634c98..371974c6d6c3 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/virtual-memory.json
@@ -1,24 +1,131 @@
 [
     {
-        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 1G page.",
+        "BriefDescription": "Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.STLB_HIT",
+        "SampleAfterValue": "200003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses.",
         "EventCode": "0x08",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "200003",
         "UMask": "0xe"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 2M or 4M page.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts the number of page walks outstanding for Loads (demand or SW prefetch) in PMH every cycle.  A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all pages sizes. Will result in a DTLB write from STLB.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.STLB_HIT",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x20"
+    },
+    {
         "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 1G page.",
         "EventCode": "0x49",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "2000003",
         "UMask": "0xe"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 2M or 4M page.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts the number of page walks outstanding in the page miss handler (PMH) for stores every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.MISS_CAUSED_WALK",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Account for all pages sizes. Will result in an ITLB write from STLB.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.STLB_HIT",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x20"
+    },
+    {
         "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.",
         "EventCode": "0x85",
         "EventName": "ITLB_MISSES.WALK_COMPLETED",
         "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size.  Includes page walks that page fault.",
         "SampleAfterValue": "200003",
         "UMask": "0xe"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 2M or 4M page.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 2M or 4M pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.  A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals.  Walks could be counted by edge detecting on this event, but would count restarted suspended walks.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.DTLB_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x90"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/skylake/memory.json b/tools/perf/pmu-events/arch/x86/skylake/memory.json
index 588ad6059a13..f047862f9735 100644
--- a/tools/perf/pmu-events/arch/x86/skylake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/memory.json
@@ -1008,7 +1008,7 @@
         "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one).",
         "EventCode": "0xC9",
         "EventName": "RTM_RETIRED.ABORTED",
-        "PEBS": "1",
+        "PEBS": "2",
         "PublicDescription": "Number of times RTM abort was triggered.",
         "SampleAfterValue": "2000003",
         "UMask": "0x4"
diff --git a/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json b/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json
index a151ba9cccb0..5452a1448ded 100644
--- a/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -25,7 +25,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -63,8 +65,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -77,9 +81,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -99,10 +103,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/skylake/pipeline.json b/tools/perf/pmu-events/arch/x86/skylake/pipeline.json
index cd3e737bf4a1..fe202d1e368a 100644
--- a/tools/perf/pmu-events/arch/x86/skylake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/pipeline.json
@@ -387,7 +387,7 @@
         "Errata": "SKL091, SKL044",
         "EventCode": "0xC0",
         "EventName": "INST_RETIRED.NOP",
-        "PEBS": "1",
+        "PEBS": "2",
         "SampleAfterValue": "2000003",
         "UMask": "0x2"
     },
diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
index faa615c57893..3af71b84bb9d 100644
--- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
@@ -83,12 +83,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
+        "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -156,7 +156,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(18.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(18.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM + 16.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -177,7 +177,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "16.5 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "16.5 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -189,7 +189,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -214,10 +214,10 @@
     },
     {
         "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
-        "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
+        "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -237,7 +237,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -246,13 +246,13 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "22 * tma_info_system_average_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
+        "MetricExpr": "22 * tma_info_system_core_frequency * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -266,7 +266,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -274,7 +274,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -309,6 +309,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -358,10 +367,10 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
         "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_fused_instructions",
         "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}",
         "ScaleUnit": "100%"
     },
     {
@@ -371,13 +380,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
         "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -385,14 +394,14 @@
     },
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
@@ -405,6 +414,12 @@
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -430,67 +445,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -511,7 +561,7 @@
     {
         "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
         "MetricName": "tma_info_branches_jump"
     },
@@ -528,9 +578,15 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -542,8 +598,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -618,7 +674,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -626,7 +682,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -634,7 +690,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -642,7 +698,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -650,7 +706,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -669,7 +725,7 @@
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -705,136 +761,142 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
-    },
-    {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -863,43 +925,57 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -929,13 +1005,6 @@
         "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches"
     },
     {
-        "BriefDescription": "Average number of parallel requests to external memory",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_parallel_requests",
-        "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
-    },
-    {
         "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)",
         "MetricExpr": "1e9 * (UNC_ARB_TRK_OCCUPANCY.DATA_READ / UNC_ARB_TRK_REQUESTS.DATA_READ) / (tma_info_system_socket_clks / duration_time)",
         "MetricGroup": "Mem;MemoryLat;SoC",
@@ -943,12 +1012,6 @@
         "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
     },
     {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
         "MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
         "MetricGroup": "SMT",
@@ -1013,8 +1076,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1023,7 +1086,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1033,7 +1096,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1042,24 +1105,24 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "6.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "6.5 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1073,7 +1136,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1123,21 +1186,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1165,7 +1228,7 @@
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1182,17 +1245,17 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1201,13 +1264,13 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
         "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_non_fused_branches",
         "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
@@ -1216,15 +1279,15 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1232,6 +1295,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks",
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
@@ -1286,12 +1365,12 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1305,7 +1384,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1314,7 +1393,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks",
+        "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1344,7 +1423,7 @@
         "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
@@ -1360,9 +1439,9 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
@@ -1391,7 +1470,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1449,10 +1528,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json
index f59405877ae8..73feadaf7674 100644
--- a/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/virtual-memory.json
@@ -205,7 +205,7 @@
         "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake.",
         "EventCode": "0x85",
         "EventName": "ITLB_MISSES.WALK_PENDING",
-        "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake michroarchitecture.",
+        "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk duration are excluded in Skylake microarchitecture.",
         "SampleAfterValue": "100003",
         "UMask": "0x10"
     },
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json b/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json
index bc6a9a4d27a9..904d299c95a3 100644
--- a/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -26,7 +26,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -64,8 +66,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -78,9 +82,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -100,10 +104,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
index ec3aa5ef00a3..025e836a1c80 100644
--- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
@@ -210,6 +210,12 @@
         "ScaleUnit": "1MB/s"
     },
     {
+        "BriefDescription": "Bandwidth (MB/sec) of write requests that miss the last level cache (LLC) and go to remote memory.",
+        "MetricExpr": "UNC_CHA_REQUESTS.WRITES_REMOTE * 64 / 1e6 / duration_time",
+        "MetricName": "llc_miss_remote_memory_bandwidth_write",
+        "ScaleUnit": "1MB/s"
+    },
+    {
         "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions",
         "MetricExpr": "MEM_INST_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
         "MetricName": "loads_per_instr",
@@ -298,12 +304,12 @@
         "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
+        "MetricExpr": "34 * (FP_ASSIST.ANY + OTHER_ASSISTS.ANY) / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -371,7 +377,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 44 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -392,7 +398,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "44 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -404,7 +410,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -429,10 +435,10 @@
     },
     {
         "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
-        "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
+        "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -452,7 +458,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -461,13 +467,13 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(110 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_average_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks",
+        "MetricExpr": "(110 * tma_info_system_core_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM + OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM) + 47.5 * tma_info_system_core_frequency * (OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -481,7 +487,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -489,7 +495,7 @@
         "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -524,6 +530,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * FP_ASSIST.ANY / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / UOPS_RETIRED.RETIRE_SLOTS",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -582,10 +597,10 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
         "MetricExpr": "tma_light_operations * UOPS_RETIRED.MACRO_FUSED / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_fused_instructions",
         "MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}",
         "ScaleUnit": "100%"
     },
     {
@@ -595,13 +610,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
         "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -609,26 +624,53 @@
     },
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
     },
     {
         "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
-        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@BR_MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+        "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * BR_MISP_EXEC.INDIRECT)",
         "MetricGroup": "Bad;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmisp_indirect",
         "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
     },
     {
         "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
-        "MetricExpr": "tma_info_core_ipmispredict",
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;BadSpec;BrMispredicts",
         "MetricName": "tma_info_bad_spec_ipmispredict",
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
+        "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
+        "MetricExpr": "(100 * (1 - tma_core_bound / (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) if tma_core_bound < (((EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / CPU_CLK_UNHALTED.THREAD * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CPU_CLK_UNHALTED.THREAD * CPU_CLK_UNHALTED.THREAD + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CPU_CLK_UNHALTED.THREAD if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CPU_CLK_UNHALTED.THREAD) else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
+        "MetricGroup": "Cor;SMT;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_core_bound_likely",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.",
+        "MetricExpr": "100 * (100 * (tma_fetch_latency * (DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD) + tma_fetch_bandwidth * tma_mite / (tma_mite + tma_dsb)))",
+        "MetricGroup": "DSBmiss;Fed;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_dsb_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck.",
+        "MetricExpr": "100 * (100 * (tma_fetch_latency * ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD) / ((ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=0x1\\,edge\\=0x1@) / CPU_CLK_UNHALTED.THREAD + ICACHE_TAG.STALLS / CPU_CLK_UNHALTED.THREAD + (INT_MISC.CLEAR_RESTEER_CYCLES / CPU_CLK_UNHALTED.THREAD + 9 * BACLEARS.ANY / CPU_CLK_UNHALTED.THREAD) + min(2 * IDQ.MS_SWITCHES / CPU_CLK_UNHALTED.THREAD, 1) + DECODE.LCP / CPU_CLK_UNHALTED.THREAD + DSB2MITE_SWITCHES.PENALTY_CYCLES / CPU_CLK_UNHALTED.THREAD)))",
+        "MetricGroup": "Fed;FetchLat;IcMiss;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_botlnk_ic_misses",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -654,67 +696,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.CONDITIONAL + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -735,7 +812,7 @@
     {
         "BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.CONDITIONAL - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "(BR_INST_RETIRED.NEAR_TAKEN - (BR_INST_RETIRED.COND - BR_INST_RETIRED.NOT_TAKEN) - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
         "MetricName": "tma_info_branches_jump"
     },
@@ -752,9 +829,15 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -766,19 +849,12 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
     {
-        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
-        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
-        "MetricGroup": "Bad;BadSpec;BrMispredicts;TopdownL1;tma_L1_group",
-        "MetricName": "tma_info_core_ipmispredict",
-        "MetricgroupNoGroup": "TopdownL1"
-    },
-    {
         "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
         "MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
         "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
@@ -849,7 +925,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -857,7 +933,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -865,7 +941,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -873,7 +949,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -881,7 +957,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -889,7 +965,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -908,7 +984,7 @@
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -943,16 +1019,23 @@
         "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp"
     },
     {
+        "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_code_stlb_mpki",
+        "MetricGroup": "Fed;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_code_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
@@ -968,124 +1051,214 @@
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
+    },
+    {
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "tma_info_memory_latency_data_l2_mlp",
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_data_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of non silent evictions from the L2 cache per Kilo instruction",
+        "MetricExpr": "1e3 * L2_LINES_OUT.NON_SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_nonsilent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "Rate of silent evictions from the L2 cache per Kilo instruction where the evicted lines are dropped (no writeback to L3 or memory)",
+        "MetricExpr": "1e3 * L2_LINES_OUT.SILENT / INST_RETIRED.ANY",
+        "MetricGroup": "L2Evicts;Mem;Server;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l2_evictions_silent_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_access_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
+    },
+    {
+        "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / (duration_time * 1e3 / 1e3)",
+        "MetricGroup": "Mem;MemoryBW;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw_2t",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
         "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "Mem",
         "MetricName": "tma_info_memory_l3mpki"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "Average Parallel L2 cache miss data reads",
+        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "MetricGroup": "Memory_BW;Offcore",
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "Average Latency for L2 cache miss demand Loads",
+        "MetricExpr": "tma_info_memory_load_l2_miss_latency",
+        "MetricGroup": "Memory_Lat;Offcore",
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
-        "BriefDescription": "Average Parallel L2 cache miss data reads",
-        "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "BriefDescription": "Average Parallel L2 cache miss demand Loads",
+        "MetricExpr": "tma_info_memory_load_l2_mlp",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
-        "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricGroup": "Memory_Lat;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_miss_latency",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
-        "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricGroup": "Memory_BW;Offcore;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_l2_mlp",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_load_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_load_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "tma_info_memory_uc_load_pki",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+    },
+    {
+        "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
+        "MetricExpr": "tma_info_memory_tlb_page_walks_utilization",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_page_walks_utilization",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
+        "MetricExpr": "tma_info_memory_tlb_store_stlb_mpki",
+        "MetricGroup": "Mem;MemoryTLB;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_store_stlb_mpki",
+        "MetricgroupNoGroup": "TopdownL1"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -1114,55 +1287,78 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem;TopdownL1;tma_L1_group",
+        "MetricName": "tma_info_memory_uc_load_pki",
+        "MetricgroupNoGroup": "TopdownL1"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * (FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE) + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]",
         "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3) * 4 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_read_bw"
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_read_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Reads [GB / sec]. Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU"
     },
     {
         "BriefDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]",
         "MetricExpr": "(UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3) * 4 / 1e9 / duration_time",
-        "MetricGroup": "IoBW;Mem;Server;SoC",
-        "MetricName": "tma_info_system_io_write_bw"
+        "MetricGroup": "IoBW;MemOffcore;Server;SoC",
+        "MetricName": "tma_info_system_io_write_bw",
+        "PublicDescription": "Average IO (network or disk) Bandwidth Use for Writes [GB / sec]. Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -1187,7 +1383,7 @@
     {
         "BriefDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]",
         "MetricExpr": "1e9 * (UNC_M_RPQ_OCCUPANCY / UNC_M_RPQ_INSERTS) / imc_0@event\\=0x0@",
-        "MetricGroup": "Mem;MemoryLat;Server;SoC",
+        "MetricGroup": "MemOffcore;MemoryLat;Server;SoC",
         "MetricName": "tma_info_system_mem_dram_read_latency",
         "PublicDescription": "Average latency of data read request to external DRAM memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches"
     },
@@ -1247,6 +1443,12 @@
         "MetricName": "tma_info_system_turbo_utilization"
     },
     {
+        "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+        "MetricExpr": "tma_info_system_socket_clks / 1e9 / duration_time",
+        "MetricGroup": "SoC",
+        "MetricName": "tma_info_system_uncore_frequency"
+    },
+    {
         "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
         "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
         "MetricGroup": "Pipeline",
@@ -1293,8 +1495,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1303,7 +1505,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1313,7 +1515,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1322,24 +1524,24 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "17 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "17 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1353,7 +1555,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1384,10 +1586,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
-        "MetricExpr": "59.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "59.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_local_dram",
-        "MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_local_mem",
+        "MetricThreshold": "tma_local_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1412,21 +1614,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1454,7 +1656,7 @@
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1471,17 +1673,17 @@
         "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1490,13 +1692,13 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
         "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - UOPS_RETIRED.MACRO_FUSED) / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_non_fused_branches",
         "MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
@@ -1505,15 +1707,15 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / UOPS_RETIRED.RETIRE_SLOTS",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1521,6 +1723,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks",
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
@@ -1575,12 +1793,12 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED_PORT.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
@@ -1594,7 +1812,7 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1603,7 +1821,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_core_core_clks",
+        "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1633,13 +1851,13 @@
         "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / tma_info_core_core_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
-        "MetricExpr": "(89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + 89.5 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group",
         "MetricName": "tma_remote_cache",
         "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -1648,10 +1866,10 @@
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
-        "MetricExpr": "127 * tma_info_system_average_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "127 * tma_info_system_core_frequency * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
-        "MetricName": "tma_remote_dram",
-        "MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
+        "MetricName": "tma_remote_mem",
+        "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
         "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article. Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS",
         "ScaleUnit": "100%"
     },
@@ -1668,9 +1886,9 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
@@ -1699,7 +1917,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1757,10 +1975,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "9 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json
index c6254af7a468..ceef46046488 100644
--- a/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-power.json
@@ -144,6 +144,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x40",
         "Unit": "PCU"
     },
     {
@@ -152,6 +153,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x80",
         "Unit": "PCU"
     },
     {
@@ -160,6 +162,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "PerPkg": "1",
         "PublicDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0xc0",
         "Unit": "PCU"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json
index a61ffca2dfea..dcf268467db9 100644
--- a/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/snowridgex/uncore-power.json
@@ -150,6 +150,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "PerPkg": "1",
         "PublicDescription": "Number of cores in C-State : C0 and C1 : This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x40",
         "Unit": "PCU"
     },
     {
@@ -158,6 +159,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "PerPkg": "1",
         "PublicDescription": "Number of cores in C-State : C3 : This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0x80",
         "Unit": "PCU"
     },
     {
@@ -166,6 +168,7 @@
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "PerPkg": "1",
         "PublicDescription": "Number of cores in C-State : C6 and C7 : This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.",
+        "UMask": "0xc0",
         "Unit": "PCU"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json b/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json
index a151ba9cccb0..5452a1448ded 100644
--- a/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json
+++ b/tools/perf/pmu-events/arch/x86/tigerlake/metricgroups.json
@@ -2,10 +2,10 @@
     "Backend": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Bad": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BadSpec": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "BigFoot": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "BigFootprint": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "BrMispredicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Branches": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
-    "CacheMisses": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "CacheHits": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "CodeGen": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Compute": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Cor": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -25,7 +25,9 @@
     "L2Evicts": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "LSD": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MachineClears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "Machine_Clears": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "Mem": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
+    "MemOffcore": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBW": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryBound": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
     "MemoryLat": "Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet",
@@ -63,8 +65,10 @@
     "tma_L5_group": "Metrics for top-down breakdown at level 5",
     "tma_L6_group": "Metrics for top-down breakdown at level 6",
     "tma_alu_op_utilization_group": "Metrics contributing to tma_alu_op_utilization category",
+    "tma_assists_group": "Metrics contributing to tma_assists category",
     "tma_backend_bound_group": "Metrics contributing to tma_backend_bound category",
     "tma_bad_speculation_group": "Metrics contributing to tma_bad_speculation category",
+    "tma_branch_mispredicts_group": "Metrics contributing to tma_branch_mispredicts category",
     "tma_branch_resteers_group": "Metrics contributing to tma_branch_resteers category",
     "tma_core_bound_group": "Metrics contributing to tma_core_bound category",
     "tma_dram_bound_group": "Metrics contributing to tma_dram_bound category",
@@ -77,9 +81,9 @@
     "tma_frontend_bound_group": "Metrics contributing to tma_frontend_bound category",
     "tma_heavy_operations_group": "Metrics contributing to tma_heavy_operations category",
     "tma_issue2P": "Metrics related by the issue $issue2P",
-    "tma_issueBC": "Metrics related by the issue $issueBC",
     "tma_issueBM": "Metrics related by the issue $issueBM",
     "tma_issueBW": "Metrics related by the issue $issueBW",
+    "tma_issueComp": "Metrics related by the issue $issueComp",
     "tma_issueD0": "Metrics related by the issue $issueD0",
     "tma_issueFB": "Metrics related by the issue $issueFB",
     "tma_issueFL": "Metrics related by the issue $issueFL",
@@ -99,10 +103,12 @@
     "tma_l3_bound_group": "Metrics contributing to tma_l3_bound category",
     "tma_light_operations_group": "Metrics contributing to tma_light_operations category",
     "tma_load_op_utilization_group": "Metrics contributing to tma_load_op_utilization category",
+    "tma_machine_clears_group": "Metrics contributing to tma_machine_clears category",
     "tma_mem_latency_group": "Metrics contributing to tma_mem_latency category",
     "tma_memory_bound_group": "Metrics contributing to tma_memory_bound category",
     "tma_microcode_sequencer_group": "Metrics contributing to tma_microcode_sequencer category",
     "tma_mite_group": "Metrics contributing to tma_mite category",
+    "tma_other_light_ops_group": "Metrics contributing to tma_other_light_ops category",
     "tma_ports_utilization_group": "Metrics contributing to tma_ports_utilization category",
     "tma_ports_utilized_0_group": "Metrics contributing to tma_ports_utilized_0 category",
     "tma_ports_utilized_3m_group": "Metrics contributing to tma_ports_utilized_3m category",
diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/other.json b/tools/perf/pmu-events/arch/x86/tigerlake/other.json
index 55f3048bcfa6..117b18abcaaf 100644
--- a/tools/perf/pmu-events/arch/x86/tigerlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/tigerlake/other.json
@@ -19,7 +19,7 @@
         "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.",
         "EventCode": "0x28",
         "EventName": "CORE_POWER.LVL2_TURBO_LICENSE",
-        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture).  This includes high current AVX 512-bit instructions.",
+        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture).  This includes high current AVX 512-bit instructions.",
         "SampleAfterValue": "200003",
         "UMask": "0x20"
     },
diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json b/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json
index 541bf1dd1679..4f85d53edec2 100644
--- a/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/tigerlake/pipeline.json
@@ -537,7 +537,7 @@
         "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread",
         "EventCode": "0x5e",
         "EventName": "RS_EVENTS.EMPTY_CYCLES",
-        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)",
+        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)",
         "SampleAfterValue": "1000003",
         "UMask": "0x1"
     },
@@ -561,14 +561,6 @@
         "UMask": "0x2"
     },
     {
-        "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions",
-        "EventCode": "0xa4",
-        "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS",
-        "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by branch mispredictions. This event estimates number of operations that were issued but not retired from the speculative path as well as the out-of-order engine recovery past a branch misprediction.",
-        "SampleAfterValue": "10000003",
-        "UMask": "0x8"
-    },
-    {
         "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event",
         "EventName": "TOPDOWN.SLOTS",
         "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).",
diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json
index f11860f39c18..8ae4f2474b25 100644
--- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json
@@ -98,12 +98,12 @@
         "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5 + UOPS_DISPATCHED.PORT_6) / (4 * tma_info_core_core_clks)",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
         "MetricName": "tma_alu_op_utilization",
-        "MetricThreshold": "tma_alu_op_utilization > 0.6",
+        "MetricThreshold": "tma_alu_op_utilization > 0.4",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
-        "MetricExpr": "100 * ASSISTS.ANY / tma_info_thread_slots",
+        "MetricExpr": "34 * ASSISTS.ANY / tma_info_thread_slots",
         "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
         "MetricName": "tma_assists",
         "MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -113,7 +113,7 @@
     {
         "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
         "DefaultMetricgroupName": "TopdownL1",
-        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * cpu@INT_MISC.RECOVERY_CYCLES\\,cmask\\=1\\,edge@ / tma_info_thread_slots",
+        "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 5 * INT_MISC.CLEARS_COUNT / tma_info_thread_slots",
         "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
         "MetricName": "tma_backend_bound",
         "MetricThreshold": "tma_backend_bound > 0.2",
@@ -135,7 +135,7 @@
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions.",
         "MetricExpr": "tma_light_operations * BR_INST_RETIRED.ALL_BRANCHES / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_branch_instructions",
         "MetricThreshold": "tma_branch_instructions > 0.1 & tma_light_operations > 0.6",
         "ScaleUnit": "100%"
@@ -180,7 +180,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(49 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_system_average_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "(49 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 48 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_contested_accesses",
         "MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -200,7 +200,7 @@
     {
         "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "48 * tma_info_system_average_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "MetricExpr": "48 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
         "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
         "MetricName": "tma_data_sharing",
         "MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -212,7 +212,7 @@
         "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_issueD0;tma_mite_group",
         "MetricName": "tma_decoder0_alone",
-        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_decoder0_alone > 0.1 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder. Related metrics: tma_few_uops_instructions",
         "ScaleUnit": "100%"
     },
@@ -240,7 +240,7 @@
         "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_dsb",
-        "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline.  For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
         "ScaleUnit": "100%"
     },
@@ -259,7 +259,7 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
         "MetricName": "tma_dtlb_load",
         "MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
@@ -268,12 +268,12 @@
         "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
         "MetricName": "tma_dtlb_store",
         "MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs",
+        "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses.  As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead.  Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page.  Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
-        "MetricExpr": "54 * tma_info_system_average_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
+        "MetricExpr": "54 * tma_info_system_core_frequency * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / tma_info_thread_clks",
         "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
         "MetricName": "tma_false_sharing",
         "MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -286,7 +286,7 @@
         "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
         "MetricName": "tma_fb_full",
         "MetricThreshold": "tma_fb_full > 0.3",
-        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+        "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
         "ScaleUnit": "100%"
     },
     {
@@ -294,7 +294,7 @@
         "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
-        "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35",
+        "MetricThreshold": "tma_fetch_bandwidth > 0.2",
         "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
         "ScaleUnit": "100%"
@@ -328,6 +328,15 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+        "MetricExpr": "34 * ASSISTS.FP / tma_info_thread_slots",
+        "MetricGroup": "HPC;TopdownL5;tma_L5_group;tma_assists_group",
+        "MetricName": "tma_fp_assists",
+        "MetricThreshold": "tma_fp_assists > 0.1",
+        "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called Denormals).",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
         "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)",
         "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
@@ -390,13 +399,13 @@
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. ([ICL+] Note this may overcount due to approximation using indirect events; [ADL+] .)",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
-        "MetricExpr": "ICACHE_16B.IFDATA_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_DATA.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_icache_misses",
         "MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -405,7 +414,7 @@
     {
         "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "(tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricExpr": "tma_info_bottleneck_mispredictions * tma_info_thread_slots / BR_MISP_RETIRED.ALL_BRANCHES / 100",
         "MetricGroup": "Bad;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bad_spec_branch_misprediction_cost",
         "PublicDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear). Related metrics: tma_branch_mispredicts, tma_info_bottleneck_mispredictions, tma_mispredicts_resteers"
@@ -446,6 +455,12 @@
         "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
     },
     {
+        "BriefDescription": "Speculative to Retired ratio of all clears (covering mispredicts and nukes)",
+        "MetricExpr": "INT_MISC.CLEARS_COUNT / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)",
+        "MetricGroup": "BrMispredicts",
+        "MetricName": "tma_info_bad_spec_spec_clears_ratio"
+    },
+    {
         "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "(100 * (1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if tma_info_system_smt_2t_utilization > 0.5 else 0)",
@@ -472,67 +487,102 @@
         "PublicDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck. Related metrics: "
     },
     {
+        "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
+        "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Ret",
+        "MetricName": "tma_info_bottleneck_base_non_br",
+        "MetricThreshold": "tma_info_bottleneck_base_non_br > 20"
+    },
+    {
         "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
-        "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB;tma_issueBC",
+        "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
         "MetricName": "tma_info_bottleneck_big_code",
-        "MetricThreshold": "tma_info_bottleneck_big_code > 20",
-        "PublicDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses). Related metrics: tma_info_bottleneck_branching_overhead"
+        "MetricThreshold": "tma_info_bottleneck_big_code > 20"
     },
     {
         "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
-        "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / tma_info_thread_slots)",
-        "MetricGroup": "Ret;tma_issueBC",
+        "MetricExpr": "100 * ((BR_INST_RETIRED.ALL_BRANCHES + BR_INST_RETIRED.NEAR_CALL) / tma_info_thread_slots)",
+        "MetricGroup": "Ret",
         "MetricName": "tma_info_bottleneck_branching_overhead",
-        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 10",
-        "PublicDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls). Related metrics: tma_info_bottleneck_big_code"
+        "MetricThreshold": "tma_info_bottleneck_branching_overhead > 5"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+        "MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+    },
+    {
+        "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+        "MetricName": "tma_info_bottleneck_cache_memory_latency",
+        "MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
+        "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency"
+    },
+    {
+        "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+        "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+        "MetricGroup": "Cor;tma_issueComp",
+        "MetricName": "tma_info_bottleneck_compute_bound_est",
+        "MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
+        "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: "
     },
     {
         "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
+        "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_info_bottleneck_big_code",
         "MetricGroup": "Fed;FetchBW;Frontend",
         "MetricName": "tma_info_bottleneck_instruction_fetch_bw",
         "MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20"
     },
     {
-        "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
-        "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
-        "MetricName": "tma_info_bottleneck_memory_bandwidth",
-        "MetricThreshold": "tma_info_bottleneck_memory_bandwidth > 20",
-        "PublicDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full"
+        "BriefDescription": "Total pipeline cost of irregular execution (e.g",
+        "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+        "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+        "MetricName": "tma_info_bottleneck_irregular_overhead",
+        "MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
+        "PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches"
     },
     {
         "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
         "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
         "MetricName": "tma_info_bottleneck_memory_data_tlbs",
         "MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
-        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store"
+        "PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization"
     },
     {
-        "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
-        "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound))",
-        "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
-        "MetricName": "tma_info_bottleneck_memory_latency",
-        "MetricThreshold": "tma_info_bottleneck_memory_latency > 20",
-        "PublicDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches). Related metrics: tma_l3_hit_latency, tma_mem_latency"
+        "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
+        "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
+        "MetricGroup": "Mem;Offcore;tma_issueTLB",
+        "MetricName": "tma_info_bottleneck_memory_synchronization",
+        "MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
+        "PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs"
     },
     {
         "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+        "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
         "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
         "MetricName": "tma_info_bottleneck_mispredictions",
         "MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
         "PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers"
     },
     {
+        "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
+        "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
+        "MetricGroup": "Cor;Offcore",
+        "MetricName": "tma_info_bottleneck_other_bottlenecks",
+        "MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
+        "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls."
+    },
+    {
         "BriefDescription": "Fraction of branches that are CALL or RET",
         "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
         "MetricGroup": "Bad;Branches",
@@ -564,7 +614,7 @@
     },
     {
         "BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
-        "MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
+        "MetricExpr": "(CPU_CLK_UNHALTED.DISTRIBUTED if #SMT_on else tma_info_thread_clks)",
         "MetricGroup": "SMT",
         "MetricName": "tma_info_core_core_clks"
     },
@@ -575,8 +625,14 @@
         "MetricName": "tma_info_core_coreipc"
     },
     {
+        "BriefDescription": "uops Executed per Cycle",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / tma_info_thread_clks",
+        "MetricGroup": "Power",
+        "MetricName": "tma_info_core_epc"
+    },
+    {
         "BriefDescription": "Floating Point Operations Per Cycle",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
         "MetricGroup": "Flops;Ret",
         "MetricName": "tma_info_core_flopc"
     },
@@ -588,8 +644,8 @@
         "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per thread (logical-processor)",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
         "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
         "MetricName": "tma_info_core_ilp"
     },
@@ -669,7 +725,7 @@
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_iparith",
         "MetricThreshold": "tma_info_inst_mix_iparith < 10",
-        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
+        "PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting. Approximated prior to BDW."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
@@ -677,7 +733,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx128",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
@@ -685,7 +741,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx256",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate)",
@@ -693,7 +749,7 @@
         "MetricGroup": "Flops;FpVector;InsType",
         "MetricName": "tma_info_inst_mix_iparith_avx512",
         "MetricThreshold": "tma_info_inst_mix_iparith_avx512 < 10",
-        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic AVX 512-bit instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
@@ -701,7 +757,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_dp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
@@ -709,7 +765,7 @@
         "MetricGroup": "Flops;FpScalar;InsType",
         "MetricName": "tma_info_inst_mix_iparith_scalar_sp",
         "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
-        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
+        "PublicDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate). Values < 1 are possible due to intentional FMA double counting."
     },
     {
         "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
@@ -727,7 +783,7 @@
     },
     {
         "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
-        "MetricExpr": "INST_RETIRED.ANY / (cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
+        "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE)",
         "MetricGroup": "Flops;InsType",
         "MetricName": "tma_info_inst_mix_ipflop",
         "MetricThreshold": "tma_info_inst_mix_ipflop < 10"
@@ -740,6 +796,12 @@
         "MetricThreshold": "tma_info_inst_mix_ipload < 3"
     },
     {
+        "BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
+        "MetricExpr": "tma_info_inst_mix_instructions / MISC_RETIRED.PAUSE_INST",
+        "MetricGroup": "Flops;FpVector;InsType",
+        "MetricName": "tma_info_inst_mix_ippause"
+    },
+    {
         "BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
         "MetricExpr": "INST_RETIRED.ANY / MEM_INST_RETIRED.ALL_STORES",
         "MetricGroup": "InsType",
@@ -763,142 +825,154 @@
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l1d_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l1d_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l2_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l2_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_access_bw",
         "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_core_l3_cache_access_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_access_bw_2t"
     },
     {
         "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricExpr": "tma_info_memory_l3_cache_fill_bw",
         "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+        "MetricName": "tma_info_memory_core_l3_cache_fill_bw_2t"
     },
     {
         "BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_fb_hpki"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l1d_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki"
     },
     {
         "BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l1mpki_load"
     },
     {
+        "BriefDescription": "",
+        "MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l2_cache_fill_bw"
+    },
+    {
         "BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_all"
     },
     {
         "BriefDescription": "L2 cache hits per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2hpki_load"
     },
     {
         "BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
         "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "Backend;CacheMisses;Mem",
+        "MetricGroup": "Backend;CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem;Offcore",
+        "MetricGroup": "CacheHits;Mem;Offcore",
         "MetricName": "tma_info_memory_l2mpki_all"
     },
     {
         "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads  (including speculative)",
         "MetricExpr": "1e3 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
+        "MetricGroup": "CacheHits;Mem",
         "MetricName": "tma_info_memory_l2mpki_load"
     },
     {
-        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
-        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
-        "MetricGroup": "CacheMisses;Mem",
-        "MetricName": "tma_info_memory_l3mpki"
+        "BriefDescription": "",
+        "MetricExpr": "64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW;Offcore",
+        "MetricName": "tma_info_memory_l3_cache_access_bw"
     },
     {
-        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
-        "MetricGroup": "Mem;MemoryBound;MemoryLat",
-        "MetricName": "tma_info_memory_load_miss_real_latency"
+        "BriefDescription": "",
+        "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
+        "MetricGroup": "Mem;MemoryBW",
+        "MetricName": "tma_info_memory_l3_cache_fill_bw"
     },
     {
-        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
-        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
-        "MetricGroup": "Mem;MemoryBW;MemoryBound",
-        "MetricName": "tma_info_memory_mlp",
-        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+        "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+        "MetricExpr": "1e3 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_l3mpki"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss data reads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_data_l2_mlp"
+        "MetricName": "tma_info_memory_latency_data_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l2_miss_latency"
     },
     {
         "BriefDescription": "Average Parallel L2 cache miss demand Loads",
         "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,cmask\\=1@",
         "MetricGroup": "Memory_BW;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l2_mlp"
+        "MetricName": "tma_info_memory_latency_load_l2_mlp"
     },
     {
         "BriefDescription": "Average Latency for L3 cache miss demand Loads",
         "MetricExpr": "cpu@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD\\,umask\\=0x10@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
         "MetricGroup": "Memory_Lat;Offcore",
-        "MetricName": "tma_info_memory_oro_load_l3_miss_latency"
+        "MetricName": "tma_info_memory_latency_load_l3_miss_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
+        "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT)",
+        "MetricGroup": "Mem;MemoryBound;MemoryLat",
+        "MetricName": "tma_info_memory_load_miss_real_latency"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
+        "BriefDescription": "\"Bus lock\" per kilo instruction",
+        "MetricExpr": "1e3 * SQ_MISC.BUS_LOCK / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_bus_lock_pki"
     },
     {
-        "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_access_bw",
-        "MetricGroup": "Mem;MemoryBW;Offcore",
-        "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
+        "BriefDescription": "Un-cacheable retired load per kilo instruction",
+        "MetricExpr": "1e3 * MEM_LOAD_MISC_RETIRED.UC / INST_RETIRED.ANY",
+        "MetricGroup": "Mem",
+        "MetricName": "tma_info_memory_mix_uc_load_pki"
     },
     {
-        "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
-        "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
-        "MetricGroup": "Mem;MemoryBW",
-        "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
+        "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+        "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+        "MetricGroup": "Mem;MemoryBW;MemoryBound",
+        "MetricName": "tma_info_memory_mlp",
+        "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
     },
     {
         "BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
@@ -926,42 +1000,56 @@
         "MetricName": "tma_info_memory_tlb_store_stlb_mpki"
     },
     {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-thread",
-        "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+        "BriefDescription": "",
+        "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
         "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
         "MetricName": "tma_info_pipeline_execute"
     },
     {
+        "BriefDescription": "Instructions per a microcode Assist invocation",
+        "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY",
+        "MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
+        "MetricName": "tma_info_pipeline_ipassist",
+        "MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
+        "PublicDescription": "Instructions per a microcode Assist invocation. See Assists tree node for details (lower number means higher occurrence rate)"
+    },
+    {
         "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
         "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
         "MetricGroup": "Pipeline;Ret",
         "MetricName": "tma_info_pipeline_retire"
     },
     {
-        "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+        "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
         "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
         "MetricGroup": "Power;Summary",
-        "MetricName": "tma_info_system_average_frequency"
+        "MetricName": "tma_info_system_core_frequency"
     },
     {
-        "BriefDescription": "Average CPU Utilization",
+        "BriefDescription": "Average CPU Utilization (percentage)",
         "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "tma_info_system_cpu_utilization"
     },
     {
+        "BriefDescription": "Average number of utilized CPUs",
+        "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+        "MetricGroup": "Summary",
+        "MetricName": "tma_info_system_cpus_utilized"
+    },
+    {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
         "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1e6 / duration_time / 1e3",
-        "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+        "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
         "MetricName": "tma_info_system_dram_bw_use",
-        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
+        "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_mem_bandwidth, tma_sq_full"
     },
     {
         "BriefDescription": "Giga Floating Point Operations Per Second",
-        "MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
+        "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * FP_ARITH_INST_RETIRED.4_FLOPS + 8 * FP_ARITH_INST_RETIRED.8_FLOPS + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / 1e9 / duration_time",
         "MetricGroup": "Cor;Flops;HPC",
         "MetricName": "tma_info_system_gflops",
-        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width"
     },
     {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
@@ -998,12 +1086,6 @@
         "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
     },
     {
-        "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
-        "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.ALL + UNC_ARB_DAT_OCCUPANCY.RD) / arb@event\\=0x81\\,umask\\=0x1@",
-        "MetricGroup": "Mem;SoC",
-        "MetricName": "tma_info_system_mem_request_latency"
-    },
-    {
         "BriefDescription": "Fraction of Core cycles where the core was running with power-delivery for baseline license level 0",
         "MetricExpr": "CORE_POWER.LVL0_TURBO_LICENSE / tma_info_core_core_clks",
         "MetricGroup": "Power",
@@ -1097,8 +1179,8 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
-        "MetricExpr": "ICACHE_64B.IFTAG_STALL / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+        "MetricExpr": "ICACHE_TAG.STALLS / tma_info_thread_clks",
+        "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
         "MetricName": "tma_itlb_misses",
         "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -1107,7 +1189,7 @@
     {
         "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
         "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / tma_info_thread_clks, 0)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
         "MetricName": "tma_l1_bound",
         "MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache.  The L1 data cache typically has the shortest latency.  However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS. Related metrics: tma_clears_resteers, tma_machine_clears, tma_microcode_sequencer, tma_ms_switches, tma_ports_utilized_1",
@@ -1117,7 +1199,7 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
         "MetricConstraint": "NO_GROUP_EVENTS",
         "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / (MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + L1D_PEND_MISS.FB_FULL_PERIODS) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks)",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l2_bound",
         "MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads.  Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -1127,24 +1209,24 @@
         "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
         "MetricConstraint": "NO_GROUP_EVENTS_NMI",
         "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
-        "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+        "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
         "MetricName": "tma_l3_bound",
         "MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core.  Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
-        "MetricExpr": "17.5 * tma_info_system_average_frequency * MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks",
+        "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+        "MetricExpr": "17.5 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks",
         "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
         "MetricName": "tma_l3_hit_latency",
         "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_memory_latency, tma_mem_latency",
+        "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited).  Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance.  Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
-        "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
+        "MetricExpr": "DECODE.LCP / tma_info_thread_clks",
         "MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
         "MetricName": "tma_lcp",
         "MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -1158,7 +1240,7 @@
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
         "MetricgroupNoGroup": "TopdownL2",
-        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+        "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized code running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. ([ICL+] Note this may undercount due to approximation using indirect events; [ADL+] .). Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
     {
@@ -1201,7 +1283,7 @@
         "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_lsd",
-        "MetricThreshold": "tma_lsd > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit.  LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
         "ScaleUnit": "100%"
     },
@@ -1216,21 +1298,21 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
         "MetricName": "tma_mem_bandwidth",
         "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
+        "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM).  The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+        "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
         "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
         "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
         "MetricName": "tma_mem_latency",
         "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_memory_latency, tma_l3_hit_latency",
+        "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM).  This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
         "ScaleUnit": "100%"
     },
     {
@@ -1254,11 +1336,11 @@
     },
     {
         "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
-        "MetricExpr": "tma_retiring * tma_info_thread_slots / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
+        "MetricExpr": "UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
         "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
         "MetricName": "tma_microcode_sequencer",
         "MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
-        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_ms_switches",
+        "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit.  The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1275,7 +1357,7 @@
         "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / tma_info_core_core_clks / 2",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
         "MetricName": "tma_mite",
-        "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35)",
+        "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
         "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
         "ScaleUnit": "100%"
     },
@@ -1284,16 +1366,16 @@
         "MetricExpr": "(cpu@IDQ.MITE_UOPS\\,cmask\\=4@ - cpu@IDQ.MITE_UOPS\\,cmask\\=5@) / tma_info_thread_clks",
         "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_L4_group;tma_mite_group",
         "MetricName": "tma_mite_4wide",
-        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 5 > 0.35))",
+        "MetricThreshold": "tma_mite_4wide > 0.05 & (tma_mite > 0.1 & tma_fetch_bandwidth > 0.2)",
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+        "BriefDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles)",
         "MetricExpr": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
         "MetricGroup": "TopdownL5;tma_L5_group;tma_issueMV;tma_ports_utilized_0_group",
         "MetricName": "tma_mixing_vectors",
         "MetricThreshold": "tma_mixing_vectors > 0.05",
-        "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
+        "PublicDescription": "This metric estimates penalty in terms of percentage of([SKL+] injected blend uops out of all Uops Issued -- the Count Domain; [ADL+] cycles). Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
@@ -1302,22 +1384,22 @@
         "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
         "MetricName": "tma_ms_switches",
         "MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
-        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
+        "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES. Related metrics: tma_clears_resteers, tma_info_bottleneck_irregular_overhead, tma_l1_bound, tma_machine_clears, tma_microcode_sequencer, tma_mixing_vectors, tma_serializing_operation",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
         "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * tma_info_thread_slots)",
-        "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+        "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
         "MetricName": "tma_nop_instructions",
-        "MetricThreshold": "tma_nop_instructions > 0.1 & tma_light_operations > 0.6",
+        "MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
         "MetricConstraint": "NO_GROUP_EVENTS",
-        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions + tma_nop_instructions))",
+        "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_branch_instructions))",
         "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
         "MetricName": "tma_other_light_ops",
         "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -1325,6 +1407,22 @@
         "ScaleUnit": "100%"
     },
     {
+        "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
+        "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)",
+        "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+        "MetricName": "tma_other_mispredicts",
+        "MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
+        "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)",
+        "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+        "MetricName": "tma_other_nukes",
+        "MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
+        "ScaleUnit": "100%"
+    },
+    {
         "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_0 / tma_info_core_core_clks",
         "MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
@@ -1352,17 +1450,17 @@
         "ScaleUnit": "100%"
     },
     {
-        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
+        "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU)",
         "MetricExpr": "UOPS_DISPATCHED.PORT_6 / tma_info_core_core_clks",
         "MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
         "MetricName": "tma_port_6",
         "MetricThreshold": "tma_port_6 > 0.6",
-        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
+        "PublicDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+] Primary Branch and simple ALU). Sample with: UOPS_DISPATCHED.PORT_6. Related metrics: tma_fp_scalar, tma_fp_vector, tma_fp_vector_128b, tma_fp_vector_256b, tma_fp_vector_512b, tma_port_0, tma_port_1, tma_port_5, tma_ports_utilized_2",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
-        "MetricExpr": "((cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
+        "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)",
         "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
         "MetricName": "tma_ports_utilization",
         "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -1371,7 +1469,7 @@
     },
     {
         "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
-        "MetricExpr": "cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / tma_info_thread_clks + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
+        "MetricExpr": "(cpu@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + tma_core_bound * RS_EVENTS.EMPTY_CYCLES) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_0",
         "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -1401,7 +1499,7 @@
         "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks",
         "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
         "MetricName": "tma_ports_utilized_3m",
-        "MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
+        "MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
         "ScaleUnit": "100%"
     },
@@ -1419,18 +1517,18 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
         "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / tma_info_thread_clks",
-        "MetricGroup": "PortsUtil;TopdownL5;tma_L5_group;tma_issueSO;tma_ports_utilized_0_group",
+        "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
         "MetricName": "tma_serializing_operation",
-        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)))",
+        "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
         "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
         "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
         "MetricExpr": "140 * MISC_RETIRED.PAUSE_INST / tma_info_thread_clks",
-        "MetricGroup": "TopdownL6;tma_L6_group;tma_serializing_operation_group",
+        "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
         "MetricName": "tma_slow_pause",
-        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))))",
+        "MetricThreshold": "tma_slow_pause > 0.05 & (tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
         "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
         "ScaleUnit": "100%"
     },
@@ -1459,7 +1557,7 @@
         "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
         "MetricName": "tma_sq_full",
         "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
-        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+        "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
         "ScaleUnit": "100%"
     },
     {
@@ -1527,10 +1625,10 @@
     {
         "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
         "MetricExpr": "10 * BACLEARS.ANY / tma_info_thread_clks",
-        "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+        "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
         "MetricName": "tma_unknown_branches",
         "MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
-        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit). Sample with: BACLEARS.ANY",
+        "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: BACLEARS.ANY",
         "ScaleUnit": "100%"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json
index eed1b90a2779..48f23acc76c0 100644
--- a/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json
+++ b/tools/perf/pmu-events/arch/x86/tigerlake/uncore-interconnect.json
@@ -25,6 +25,7 @@
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event UNC_ARB_REQ_TRK_REQUEST.DRD",
+        "Deprecated": "1",
         "EventCode": "0x81",
         "EventName": "UNC_ARB_DAT_REQUESTS.RD",
         "PerPkg": "1",
@@ -33,6 +34,7 @@
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event UNC_ARB_DAT_OCCUPANCY.ALL",
+        "Deprecated": "1",
         "EventCode": "0x85",
         "EventName": "UNC_ARB_IFA_OCCUPANCY.ALL",
         "PerPkg": "1",
diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py
index 53ab050c8fa4..e42efc16723e 100755
--- a/tools/perf/pmu-events/jevents.py
+++ b/tools/perf/pmu-events/jevents.py
@@ -203,7 +203,7 @@ class JsonEvent:
 
     def llx(x: int) -> str:
       """Convert an int to a string similar to a printf modifier of %#llx."""
-      return '0' if x == 0 else hex(x)
+      return str(x) if x >= 0 and x < 10 else hex(x)
 
     def fixdesc(s: str) -> str:
       """Fix formatting issue for the desc string."""
@@ -294,6 +294,23 @@ class JsonEvent:
       }
       return table[unit] if unit in table else f'uncore_{unit.lower()}'
 
+    def is_zero(val: str) -> bool:
+        try:
+            if val.startswith('0x'):
+                return int(val, 16) == 0
+            else:
+                return int(val) == 0
+        except e:
+            return False
+
+    def canonicalize_value(val: str) -> str:
+        try:
+            if val.startswith('0x'):
+                return llx(int(val, 16))
+            return str(int(val))
+        except e:
+            return val
+
     eventcode = 0
     if 'EventCode' in jd:
       eventcode = int(jd['EventCode'].split(',', 1)[0], 0)
@@ -356,10 +373,14 @@ class JsonEvent:
         ('UMask', 'umask='),
         ('NodeType', 'type='),
         ('RdWrMask', 'rdwrmask='),
+        ('EnAllCores', 'enallcores='),
+        ('EnAllSlices', 'enallslices='),
+        ('SliceId', 'sliceid='),
+        ('ThreadMask', 'threadmask='),
     ]
     for key, value in event_fields:
-      if key in jd and jd[key] != '0':
-        event += ',' + value + jd[key]
+      if key in jd and not is_zero(jd[key]):
+        event += f',{value}{canonicalize_value(jd[key])}'
     if filter:
       event += f',{filter}'
     if msr:
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 53ba9c3e20e0..c7f9d9676095 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 perf-y += builtin-test.o
-perf-y += builtin-test-list.o
+perf-y += tests-scripts.o
 perf-y += parse-events.o
 perf-y += dso-data.o
 perf-y += attr.o
diff --git a/tools/perf/tests/builtin-test-list.c b/tools/perf/tests/builtin-test-list.c
deleted file mode 100644
index a65b9e547d82..000000000000
--- a/tools/perf/tests/builtin-test-list.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <dirent.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <linux/ctype.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/zalloc.h>
-#include <string.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <subcmd/exec-cmd.h>
-#include <subcmd/parse-options.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-#include "builtin.h"
-#include "builtin-test-list.h"
-#include "color.h"
-#include "debug.h"
-#include "hist.h"
-#include "intlist.h"
-#include "string2.h"
-#include "symbol.h"
-#include "tests.h"
-#include "util/rlimit.h"
-
-
-/*
- * As this is a singleton built once for the run of the process, there is
- * no value in trying to free it and just let it stay around until process
- * exits when it's cleaned up.
- */
-static size_t files_num = 0;
-static struct script_file *files = NULL;
-static int files_max_width = 0;
-
-static const char *shell_tests__dir(char *path, size_t size)
-{
-	const char *devel_dirs[] = { "./tools/perf/tests", "./tests", };
-	char *exec_path;
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(devel_dirs); ++i) {
-		struct stat st;
-
-		if (!lstat(devel_dirs[i], &st)) {
-			scnprintf(path, size, "%s/shell", devel_dirs[i]);
-			if (!lstat(devel_dirs[i], &st))
-				return path;
-		}
-	}
-
-	/* Then installed path. */
-	exec_path = get_argv_exec_path();
-	scnprintf(path, size, "%s/tests/shell", exec_path);
-	free(exec_path);
-	return path;
-}
-
-static const char *shell_test__description(char *description, size_t size,
-                                           const char *path, const char *name)
-{
-	FILE *fp;
-	char filename[PATH_MAX];
-	int ch;
-
-	path__join(filename, sizeof(filename), path, name);
-	fp = fopen(filename, "r");
-	if (!fp)
-		return NULL;
-
-	/* Skip first line - should be #!/bin/sh Shebang */
-	do {
-		ch = fgetc(fp);
-	} while (ch != EOF && ch != '\n');
-
-	description = fgets(description, size, fp);
-	fclose(fp);
-
-	/* Assume first char on line is omment everything after that desc */
-	return description ? strim(description + 1) : NULL;
-}
-
-/* Is this full file path a shell script */
-static bool is_shell_script(const char *path)
-{
-	const char *ext;
-
-	ext = strrchr(path, '.');
-	if (!ext)
-		return false;
-	if (!strcmp(ext, ".sh")) { /* Has .sh extension */
-		if (access(path, R_OK | X_OK) == 0) /* Is executable */
-			return true;
-	}
-	return false;
-}
-
-/* Is this file in this dir a shell script (for test purposes) */
-static bool is_test_script(const char *path, const char *name)
-{
-	char filename[PATH_MAX];
-
-	path__join(filename, sizeof(filename), path, name);
-	if (!is_shell_script(filename)) return false;
-	return true;
-}
-
-/* Duplicate a string and fall over and die if we run out of memory */
-static char *strdup_check(const char *str)
-{
-	char *newstr;
-
-	newstr = strdup(str);
-	if (!newstr) {
-		pr_err("Out of memory while duplicating test script string\n");
-		abort();
-	}
-	return newstr;
-}
-
-static void append_script(const char *dir, const char *file, const char *desc)
-{
-	struct script_file *files_tmp;
-	size_t files_num_tmp;
-	int width;
-
-	files_num_tmp = files_num + 1;
-	if (files_num_tmp >= SIZE_MAX) {
-		pr_err("Too many script files\n");
-		abort();
-	}
-	/* Realloc is good enough, though we could realloc by chunks, not that
-	 * anyone will ever measure performance here */
-	files_tmp = realloc(files,
-			    (files_num_tmp + 1) * sizeof(struct script_file));
-	if (files_tmp == NULL) {
-		pr_err("Out of memory while building test list\n");
-		abort();
-	}
-	/* Add file to end and NULL terminate the struct array */
-	files = files_tmp;
-	files_num = files_num_tmp;
-	files[files_num - 1].dir = strdup_check(dir);
-	files[files_num - 1].file = strdup_check(file);
-	files[files_num - 1].desc = strdup_check(desc);
-	files[files_num].dir = NULL;
-	files[files_num].file = NULL;
-	files[files_num].desc = NULL;
-
-	width = strlen(desc); /* Track max width of desc */
-	if (width > files_max_width)
-		files_max_width = width;
-}
-
-static void append_scripts_in_dir(const char *path)
-{
-	struct dirent **entlist;
-	struct dirent *ent;
-	int n_dirs, i;
-	char filename[PATH_MAX];
-
-	/* List files, sorted by alpha */
-	n_dirs = scandir(path, &entlist, NULL, alphasort);
-	if (n_dirs == -1)
-		return;
-	for (i = 0; i < n_dirs && (ent = entlist[i]); i++) {
-		if (ent->d_name[0] == '.')
-			continue; /* Skip hidden files */
-		if (is_test_script(path, ent->d_name)) { /* It's a test */
-			char bf[256];
-			const char *desc = shell_test__description
-				(bf, sizeof(bf), path, ent->d_name);
-
-			if (desc) /* It has a desc line - valid script */
-				append_script(path, ent->d_name, desc);
-		} else if (is_directory(path, ent)) { /* Scan the subdir */
-			path__join(filename, sizeof(filename),
-				   path, ent->d_name);
-			append_scripts_in_dir(filename);
-		}
-	}
-	for (i = 0; i < n_dirs; i++) /* Clean up */
-		zfree(&entlist[i]);
-	free(entlist);
-}
-
-const struct script_file *list_script_files(void)
-{
-	char path_dir[PATH_MAX];
-	const char *path;
-
-	if (files)
-		return files; /* Singleton - we already know our list */
-
-	path = shell_tests__dir(path_dir, sizeof(path_dir)); /* Walk  dir */
-	append_scripts_in_dir(path);
-
-	return files;
-}
-
-int list_script_max_width(void)
-{
-	list_script_files(); /* Ensure we have scanned all scripts */
-	return files_max_width;
-}
diff --git a/tools/perf/tests/builtin-test-list.h b/tools/perf/tests/builtin-test-list.h
deleted file mode 100644
index eb81f3aa6683..000000000000
--- a/tools/perf/tests/builtin-test-list.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-struct script_file {
-	char *dir;
-	char *file;
-	char *desc;
-};
-
-/* List available script tests to run - singleton - never freed */
-const struct script_file *list_script_files(void);
-/* Get maximum width of description string */
-int list_script_max_width(void);
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 4a5973f9bb9b..d13ee7683d9d 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -6,6 +6,7 @@
  */
 #include <fcntl.h>
 #include <errno.h>
+#include <poll.h>
 #include <unistd.h>
 #include <string.h>
 #include <stdlib.h>
@@ -21,17 +22,25 @@
 #include "debug.h"
 #include "color.h"
 #include <subcmd/parse-options.h>
+#include <subcmd/run-command.h>
 #include "string2.h"
 #include "symbol.h"
 #include "util/rlimit.h"
+#include "util/strbuf.h"
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <subcmd/exec-cmd.h>
 #include <linux/zalloc.h>
 
-#include "builtin-test-list.h"
+#include "tests-scripts.h"
 
+/*
+ * Command line option to not fork the test running in the same process and
+ * making them easier to debug.
+ */
 static bool dont_fork;
+/* Fork the tests in parallel and then wait for their completion. */
+static bool parallel;
 const char *dso_to_test;
 const char *test_objdump_path = "objdump";
 
@@ -130,6 +139,7 @@ static struct test_suite *generic_tests[] = {
 static struct test_suite **tests[] = {
 	generic_tests,
 	arch_tests,
+	NULL, /* shell tests created at runtime. */
 };
 
 static struct test_workload *workloads[] = {
@@ -208,76 +218,36 @@ static bool perf_test__matches(const char *desc, int curr, int argc, const char
 	return false;
 }
 
-static int run_test(struct test_suite *test, int subtest)
-{
-	int status, err = -1, child = dont_fork ? 0 : fork();
-	char sbuf[STRERR_BUFSIZE];
-
-	if (child < 0) {
-		pr_err("failed to fork test: %s\n",
-			str_error_r(errno, sbuf, sizeof(sbuf)));
-		return -1;
-	}
-
-	if (!child) {
-		if (!dont_fork) {
-			pr_debug("test child forked, pid %d\n", getpid());
-
-			if (verbose <= 0) {
-				int nullfd = open("/dev/null", O_WRONLY);
-
-				if (nullfd >= 0) {
-					close(STDERR_FILENO);
-					close(STDOUT_FILENO);
-
-					dup2(nullfd, STDOUT_FILENO);
-					dup2(STDOUT_FILENO, STDERR_FILENO);
-					close(nullfd);
-				}
-			} else {
-				signal(SIGSEGV, sighandler_dump_stack);
-				signal(SIGFPE, sighandler_dump_stack);
-			}
-		}
-
-		err = test_function(test, subtest)(test, subtest);
-		if (!dont_fork)
-			exit(err);
-	}
-
-	if (!dont_fork) {
-		wait(&status);
+struct child_test {
+	struct child_process process;
+	struct test_suite *test;
+	int test_num;
+	int subtest;
+};
 
-		if (WIFEXITED(status)) {
-			err = (signed char)WEXITSTATUS(status);
-			pr_debug("test child finished with %d\n", err);
-		} else if (WIFSIGNALED(status)) {
-			err = -1;
-			pr_debug("test child interrupted\n");
-		}
-	}
+static int run_test_child(struct child_process *process)
+{
+	struct child_test *child = container_of(process, struct child_test, process);
+	int err;
 
-	return err;
+	pr_debug("--- start ---\n");
+	pr_debug("test child forked, pid %d\n", getpid());
+	err = test_function(child->test, child->subtest)(child->test, child->subtest);
+	pr_debug("---- end(%d) ----\n", err);
+	fflush(NULL);
+	return -err;
 }
 
-#define for_each_test(j, k, t)			\
-	for (j = 0, k = 0; j < ARRAY_SIZE(tests); j++, k = 0)	\
-		while ((t = tests[j][k++]) != NULL)
-
-static int test_and_print(struct test_suite *t, int subtest)
+static int print_test_result(struct test_suite *t, int i, int subtest, int result, int width)
 {
-	int err;
-
-	pr_debug("\n--- start ---\n");
-	err = run_test(t, subtest);
-	pr_debug("---- end ----\n");
+	if (has_subtests(t)) {
+		int subw = width > 2 ? width - 2 : width;
 
-	if (!has_subtests(t))
-		pr_debug("%s:", t->desc);
-	else
-		pr_debug("%s subtest %d:", t->desc, subtest + 1);
+		pr_info("%3d.%1d: %-*s:", i + 1, subtest + 1, subw, test_description(t, subtest));
+	} else
+		pr_info("%3d: %-*s:", i + 1, width, test_description(t, subtest));
 
-	switch (err) {
+	switch (result) {
 	case TEST_OK:
 		pr_info(" Ok\n");
 		break;
@@ -296,99 +266,186 @@ static int test_and_print(struct test_suite *t, int subtest)
 		break;
 	}
 
-	return err;
+	return 0;
 }
 
-struct shell_test {
-	const char *dir;
-	const char *file;
-};
-
-static int shell_test__run(struct test_suite *test, int subdir __maybe_unused)
+static int finish_test(struct child_test *child_test, int width)
 {
-	int err;
-	char script[PATH_MAX];
-	struct shell_test *st = test->priv;
+	struct test_suite *t = child_test->test;
+	int i = child_test->test_num;
+	int subi = child_test->subtest;
+	int out = child_test->process.out;
+	int err = child_test->process.err;
+	bool out_done = out <= 0;
+	bool err_done = err <= 0;
+	struct strbuf out_output = STRBUF_INIT;
+	struct strbuf err_output = STRBUF_INIT;
+	int ret;
 
-	path__join(script, sizeof(script) - 3, st->dir, st->file);
+	/*
+	 * For test suites with subtests, display the suite name ahead of the
+	 * sub test names.
+	 */
+	if (has_subtests(t) && subi == 0)
+		pr_info("%3d: %-*s:\n", i + 1, width, test_description(t, -1));
+
+	/*
+	 * Busy loop reading from the child's stdout and stderr that are set to
+	 * be non-blocking until EOF.
+	 */
+	if (!out_done)
+		fcntl(out, F_SETFL, O_NONBLOCK);
+	if (!err_done)
+		fcntl(err, F_SETFL, O_NONBLOCK);
+	if (verbose > 1) {
+		if (has_subtests(t))
+			pr_info("%3d.%1d: %s:\n", i + 1, subi + 1, test_description(t, subi));
+		else
+			pr_info("%3d: %s:\n", i + 1, test_description(t, -1));
+	}
+	while (!out_done || !err_done) {
+		struct pollfd pfds[2] = {
+			{ .fd = out,
+			  .events = POLLIN | POLLERR | POLLHUP | POLLNVAL,
+			},
+			{ .fd = err,
+			  .events = POLLIN | POLLERR | POLLHUP | POLLNVAL,
+			},
+		};
+		char buf[512];
+		ssize_t len;
 
-	if (verbose > 0)
-		strncat(script, " -v", sizeof(script) - strlen(script) - 1);
+		/* Poll to avoid excessive spinning, timeout set for 1000ms. */
+		poll(pfds, ARRAY_SIZE(pfds), /*timeout=*/1000);
+		if (!out_done && pfds[0].revents) {
+			errno = 0;
+			len = read(out, buf, sizeof(buf) - 1);
 
-	err = system(script);
-	if (!err)
-		return TEST_OK;
+			if (len <= 0) {
+				out_done = errno != EAGAIN;
+			} else {
+				buf[len] = '\0';
+				if (verbose > 1)
+					fprintf(stdout, "%s", buf);
+				else
+					strbuf_addstr(&out_output, buf);
+			}
+		}
+		if (!err_done && pfds[1].revents) {
+			errno = 0;
+			len = read(err, buf, sizeof(buf) - 1);
 
-	return WEXITSTATUS(err) == 2 ? TEST_SKIP : TEST_FAIL;
+			if (len <= 0) {
+				err_done = errno != EAGAIN;
+			} else {
+				buf[len] = '\0';
+				if (verbose > 1)
+					fprintf(stdout, "%s", buf);
+				else
+					strbuf_addstr(&err_output, buf);
+			}
+		}
+	}
+	/* Clean up child process. */
+	ret = finish_command(&child_test->process);
+	if (verbose == 1 && ret == TEST_FAIL) {
+		/* Add header for test that was skipped above. */
+		if (has_subtests(t))
+			pr_info("%3d.%1d: %s:\n", i + 1, subi + 1, test_description(t, subi));
+		else
+			pr_info("%3d: %s:\n", i + 1, test_description(t, -1));
+		fprintf(stdout, "%s", out_output.buf);
+		fprintf(stderr, "%s", err_output.buf);
+	}
+	strbuf_release(&out_output);
+	strbuf_release(&err_output);
+	print_test_result(t, i, subi, ret, width);
+	if (out > 0)
+		close(out);
+	if (err > 0)
+		close(err);
+	return 0;
 }
 
-static int run_shell_tests(int argc, const char *argv[], int i, int width,
-				struct intlist *skiplist)
+static int start_test(struct test_suite *test, int i, int subi, struct child_test **child,
+		      int width)
 {
-	struct shell_test st;
-	const struct script_file *files, *file;
+	int err;
 
-	files = list_script_files();
-	if (!files)
+	*child = NULL;
+	if (dont_fork) {
+		pr_debug("--- start ---\n");
+		err = test_function(test, subi)(test, subi);
+		pr_debug("---- end ----\n");
+		print_test_result(test, i, subi, err, width);
 		return 0;
-	for (file = files; file->dir; file++) {
-		int curr = i++;
-		struct test_case test_cases[] = {
-			{
-				.desc = file->desc,
-				.run_case = shell_test__run,
-			},
-			{ .name = NULL, }
-		};
-		struct test_suite test_suite = {
-			.desc = test_cases[0].desc,
-			.test_cases = test_cases,
-			.priv = &st,
-		};
-		st.dir = file->dir;
-
-		if (test_suite.desc == NULL ||
-		    !perf_test__matches(test_suite.desc, curr, argc, argv))
-			continue;
-
-		st.file = file->file;
-		pr_info("%3d: %-*s:", i, width, test_suite.desc);
-
-		if (intlist__find(skiplist, i)) {
-			color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n");
-			continue;
-		}
+	}
 
-		test_and_print(&test_suite, 0);
+	*child = zalloc(sizeof(**child));
+	if (!*child)
+		return -ENOMEM;
+
+	(*child)->test = test;
+	(*child)->test_num = i;
+	(*child)->subtest = subi;
+	(*child)->process.pid = -1;
+	(*child)->process.no_stdin = 1;
+	if (verbose <= 0) {
+		(*child)->process.no_stdout = 1;
+		(*child)->process.no_stderr = 1;
+	} else {
+		(*child)->process.out = -1;
+		(*child)->process.err = -1;
 	}
-	return 0;
+	(*child)->process.no_exec_cmd = run_test_child;
+	err = start_command(&(*child)->process);
+	if (err || parallel)
+		return  err;
+	return finish_test(*child, width);
 }
 
+#define for_each_test(j, k, t)					\
+	for (j = 0, k = 0; j < ARRAY_SIZE(tests); j++, k = 0)	\
+		while ((t = tests[j][k++]) != NULL)
+
 static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
 {
 	struct test_suite *t;
 	unsigned int j, k;
 	int i = 0;
-	int width = list_script_max_width();
+	int width = 0;
+	size_t num_tests = 0;
+	struct child_test **child_tests;
+	int child_test_num = 0;
 
 	for_each_test(j, k, t) {
 		int len = strlen(test_description(t, -1));
 
 		if (width < len)
 			width = len;
+
+		if (has_subtests(t)) {
+			for (int subi = 0, subn = num_subtests(t); subi < subn; subi++) {
+				len = strlen(test_description(t, subi));
+				if (width < len)
+					width = len;
+				num_tests++;
+			}
+		} else {
+			num_tests++;
+		}
 	}
+	child_tests = calloc(num_tests, sizeof(*child_tests));
+	if (!child_tests)
+		return -ENOMEM;
 
 	for_each_test(j, k, t) {
 		int curr = i++;
-		int subi;
 
 		if (!perf_test__matches(test_description(t, -1), curr, argc, argv)) {
 			bool skip = true;
-			int subn;
-
-			subn = num_subtests(t);
 
-			for (subi = 0; subi < subn; subi++) {
+			for (int subi = 0, subn = num_subtests(t); subi < subn; subi++) {
 				if (perf_test__matches(test_description(t, subi),
 							curr, argc, argv))
 					skip = false;
@@ -398,74 +455,45 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
 				continue;
 		}
 
-		pr_info("%3d: %-*s:", i, width, test_description(t, -1));
-
 		if (intlist__find(skiplist, i)) {
+			pr_info("%3d: %-*s:", curr + 1, width, test_description(t, -1));
 			color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n");
 			continue;
 		}
 
 		if (!has_subtests(t)) {
-			test_and_print(t, -1);
-		} else {
-			int subn = num_subtests(t);
-			/*
-			 * minus 2 to align with normal testcases.
-			 * For subtest we print additional '.x' in number.
-			 * for example:
-			 *
-			 * 35: Test LLVM searching and compiling                        :
-			 * 35.1: Basic BPF llvm compiling test                          : Ok
-			 */
-			int subw = width > 2 ? width - 2 : width;
-
-			if (subn <= 0) {
-				color_fprintf(stderr, PERF_COLOR_YELLOW,
-					      " Skip (not compiled in)\n");
-				continue;
-			}
-			pr_info("\n");
-
-			for (subi = 0; subi < subn; subi++) {
-				int len = strlen(test_description(t, subi));
+			int err = start_test(t, curr, -1, &child_tests[child_test_num++], width);
 
-				if (subw < len)
-					subw = len;
+			if (err) {
+				/* TODO: if parallel waitpid the already forked children. */
+				free(child_tests);
+				return err;
 			}
+		} else {
+			for (int subi = 0, subn = num_subtests(t); subi < subn; subi++) {
+				int err;
 
-			for (subi = 0; subi < subn; subi++) {
 				if (!perf_test__matches(test_description(t, subi),
 							curr, argc, argv))
 					continue;
 
-				pr_info("%3d.%1d: %-*s:", i, subi + 1, subw,
-					test_description(t, subi));
-				test_and_print(t, subi);
+				err = start_test(t, curr, subi, &child_tests[child_test_num++],
+						 width);
+				if (err)
+					return err;
 			}
 		}
 	}
+	for (i = 0; i < child_test_num; i++) {
+		if (parallel) {
+			int ret  = finish_test(child_tests[i], width);
 
-	return run_shell_tests(argc, argv, i, width, skiplist);
-}
-
-static int perf_test__list_shell(int argc, const char **argv, int i)
-{
-	const struct script_file *files, *file;
-
-	files = list_script_files();
-	if (!files)
-		return 0;
-	for (file = files; file->dir; file++) {
-		int curr = i++;
-		struct test_suite t = {
-			.desc = file->desc
-		};
-
-		if (!perf_test__matches(t.desc, curr, argc, argv))
-			continue;
-
-		pr_info("%3d: %s\n", i, t.desc);
+			if (ret)
+				return ret;
+		}
+		free(child_tests[i]);
 	}
+	free(child_tests);
 	return 0;
 }
 
@@ -492,9 +520,6 @@ static int perf_test__list(int argc, const char **argv)
 					test_description(t, subi));
 		}
 	}
-
-	perf_test__list_shell(argc, argv, i);
-
 	return 0;
 }
 
@@ -536,6 +561,8 @@ int cmd_test(int argc, const char **argv)
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('F', "dont-fork", &dont_fork,
 		    "Do not fork for testcase"),
+	OPT_BOOLEAN('p', "parallel", &parallel,
+		    "Run the tests altogether in parallel"),
 	OPT_STRING('w', "workload", &workload, "work", "workload to run for testing"),
 	OPT_STRING(0, "dso", &dso_to_test, "dso", "dso to test"),
 	OPT_STRING(0, "objdump", &test_objdump_path, "path",
@@ -554,6 +581,7 @@ int cmd_test(int argc, const char **argv)
 	/* Unbuffered output */
 	setvbuf(stdout, NULL, _IONBF, 0);
 
+	tests[2] = create_script_test_suites();
 	argc = parse_options_subcommand(argc, argv, test_options, test_subcommands, test_usage, 0);
 	if (argc >= 1 && !strcmp(argv[0], "list"))
 		return perf_test__list(argc - 1, argv + 1);
diff --git a/tools/perf/tests/expand-cgroup.c b/tools/perf/tests/expand-cgroup.c
index 9c1a1f18db75..31966ff856f8 100644
--- a/tools/perf/tests/expand-cgroup.c
+++ b/tools/perf/tests/expand-cgroup.c
@@ -127,8 +127,7 @@ static int expand_group_events(void)
 	parse_events_error__init(&err);
 	ret = parse_events(evlist, event_str, &err);
 	if (ret < 0) {
-		pr_debug("failed to parse event '%s', err %d, str '%s'\n",
-			 event_str, ret, err.str);
+		pr_debug("failed to parse event '%s', err %d\n", event_str, ret);
 		parse_events_error__print(&err, event_str);
 		goto out;
 	}
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 8a4da7eb637a..a1f8adf85367 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -83,6 +83,7 @@ make_no_libelf      := NO_LIBELF=1
 make_no_libunwind   := NO_LIBUNWIND=1
 make_no_libdw_dwarf_unwind := NO_LIBDW_DWARF_UNWIND=1
 make_no_backtrace   := NO_BACKTRACE=1
+make_no_libcapstone := NO_CAPSTONE=1
 make_no_libnuma     := NO_LIBNUMA=1
 make_no_libaudit    := NO_LIBAUDIT=1
 make_no_libbionic   := NO_LIBBIONIC=1
@@ -122,7 +123,7 @@ make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
 make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1 NO_LIBBPF=1
 make_minimal        += NO_LIBCRYPTO=1 NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1
-make_minimal        += NO_LIBCAP=1 NO_SYSCALL_TABLE=1
+make_minimal        += NO_LIBCAP=1 NO_SYSCALL_TABLE=1 NO_CAPSTONE=1
 
 # $(run) contains all available tests
 run := make_pure
@@ -152,6 +153,7 @@ run += make_no_libelf
 run += make_no_libunwind
 run += make_no_libdw_dwarf_unwind
 run += make_no_backtrace
+run += make_no_libcapstone
 run += make_no_libnuma
 run += make_no_libaudit
 run += make_no_libbionic
diff --git a/tools/perf/tests/maps.c b/tools/perf/tests/maps.c
index bb3fbfe5a73e..b15417a0d617 100644
--- a/tools/perf/tests/maps.c
+++ b/tools/perf/tests/maps.c
@@ -156,6 +156,9 @@ static int test__maps__merge_in(struct test_suite *t __maybe_unused, int subtest
 	TEST_ASSERT_VAL("merge check failed", !ret);
 
 	maps__zput(maps);
+	map__zput(map_kcore1);
+	map__zput(map_kcore2);
+	map__zput(map_kcore3);
 	return TEST_OK;
 }
 
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index fbdf710d5eea..feb5727584d1 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -2506,11 +2506,10 @@ static int test_event(const struct evlist_test *e)
 	parse_events_error__init(&err);
 	ret = parse_events(evlist, e->name, &err);
 	if (ret) {
-		pr_debug("failed to parse event '%s', err %d, str '%s'\n",
-			 e->name, ret, err.str);
+		pr_debug("failed to parse event '%s', err %d\n", e->name, ret);
 		parse_events_error__print(&err, e->name);
 		ret = TEST_FAIL;
-		if (err.str && strstr(err.str, "can't access trace events"))
+		if (parse_events_error__contains(&err, "can't access trace events"))
 			ret = TEST_SKIP;
 	} else {
 		ret = e->check(evlist);
@@ -2535,8 +2534,8 @@ static int test_event_fake_pmu(const char *str)
 	ret = __parse_events(evlist, str, /*pmu_filter=*/NULL, &err,
 			     &perf_pmu__fake, /*warn_if_reordered=*/true);
 	if (ret) {
-		pr_debug("failed to parse event '%s', err %d, str '%s'\n",
-			 str, ret, err.str);
+		pr_debug("failed to parse event '%s', err %d\n",
+			 str, ret);
 		parse_events_error__print(&err, str);
 	}
 
diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c
index a56d32905743..47a7c3277540 100644
--- a/tools/perf/tests/pmu-events.c
+++ b/tools/perf/tests/pmu-events.c
@@ -70,7 +70,7 @@ static const struct perf_pmu_test_event segment_reg_loads_any = {
 	.event = {
 		.pmu = "default_core",
 		.name = "segment_reg_loads.any",
-		.event = "event=0x6,period=200000,umask=0x80",
+		.event = "event=6,period=200000,umask=0x80",
 		.desc = "Number of segment register loads",
 		.topic = "other",
 	},
@@ -82,7 +82,7 @@ static const struct perf_pmu_test_event dispatch_blocked_any = {
 	.event = {
 		.pmu = "default_core",
 		.name = "dispatch_blocked.any",
-		.event = "event=0x9,period=200000,umask=0x20",
+		.event = "event=9,period=200000,umask=0x20",
 		.desc = "Memory cluster signals to block micro-op dispatch for any reason",
 		.topic = "other",
 	},
@@ -94,11 +94,11 @@ static const struct perf_pmu_test_event eist_trans = {
 	.event = {
 		.pmu = "default_core",
 		.name = "eist_trans",
-		.event = "event=0x3a,period=200000,umask=0x0",
+		.event = "event=0x3a,period=200000",
 		.desc = "Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions",
 		.topic = "other",
 	},
-	.alias_str = "event=0x3a,period=0x30d40,umask=0",
+	.alias_str = "event=0x3a,period=0x30d40",
 	.alias_long_desc = "Number of Enhanced Intel SpeedStep(R) Technology (EIST) transitions",
 };
 
@@ -128,7 +128,7 @@ static const struct perf_pmu_test_event *core_events[] = {
 static const struct perf_pmu_test_event uncore_hisi_ddrc_flux_wcmd = {
 	.event = {
 		.name = "uncore_hisi_ddrc.flux_wcmd",
-		.event = "event=0x2",
+		.event = "event=2",
 		.desc = "DDRC write commands",
 		.topic = "uncore",
 		.long_desc = "DDRC write commands",
@@ -156,13 +156,13 @@ static const struct perf_pmu_test_event unc_cbo_xsnp_response_miss_eviction = {
 static const struct perf_pmu_test_event uncore_hyphen = {
 	.event = {
 		.name = "event-hyphen",
-		.event = "event=0xe0,umask=0x00",
+		.event = "event=0xe0",
 		.desc = "UNC_CBO_HYPHEN",
 		.topic = "uncore",
 		.long_desc = "UNC_CBO_HYPHEN",
 		.pmu = "uncore_cbox",
 	},
-	.alias_str = "event=0xe0,umask=0",
+	.alias_str = "event=0xe0",
 	.alias_long_desc = "UNC_CBO_HYPHEN",
 	.matching_pmu = "uncore_cbox_0",
 };
@@ -170,13 +170,13 @@ static const struct perf_pmu_test_event uncore_hyphen = {
 static const struct perf_pmu_test_event uncore_two_hyph = {
 	.event = {
 		.name = "event-two-hyph",
-		.event = "event=0xc0,umask=0x00",
+		.event = "event=0xc0",
 		.desc = "UNC_CBO_TWO_HYPH",
 		.topic = "uncore",
 		.long_desc = "UNC_CBO_TWO_HYPH",
 		.pmu = "uncore_cbox",
 	},
-	.alias_str = "event=0xc0,umask=0",
+	.alias_str = "event=0xc0",
 	.alias_long_desc = "UNC_CBO_TWO_HYPH",
 	.matching_pmu = "uncore_cbox_0",
 };
@@ -184,7 +184,7 @@ static const struct perf_pmu_test_event uncore_two_hyph = {
 static const struct perf_pmu_test_event uncore_hisi_l3c_rd_hit_cpipe = {
 	.event = {
 		.name = "uncore_hisi_l3c.rd_hit_cpipe",
-		.event = "event=0x7",
+		.event = "event=7",
 		.desc = "Total read hits",
 		.topic = "uncore",
 		.long_desc = "Total read hits",
@@ -265,7 +265,7 @@ static const struct perf_pmu_test_event sys_ccn_pmu_read_cycles = {
 static const struct perf_pmu_test_event sys_cmn_pmu_hnf_cache_miss = {
 	.event = {
 		.name = "sys_cmn_pmu.hnf_cache_miss",
-		.event = "eventid=0x1,type=0x5",
+		.event = "eventid=1,type=5",
 		.desc = "Counts total cache misses in first lookup result (high priority)",
 		.topic = "uncore",
 		.pmu = "uncore_sys_cmn_pmu",
diff --git a/tools/perf/tests/shell/base_probe/settings.sh b/tools/perf/tests/shell/base_probe/settings.sh
new file mode 100644
index 000000000000..123621c7f95e
--- /dev/null
+++ b/tools/perf/tests/shell/base_probe/settings.sh
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+#	settings.sh of perf_probe test
+#	Author: Michael Petlan <mpetlan@redhat.com>
+#	Author: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+#
+
+export TEST_NAME="perf_probe"
+
+export MY_ARCH=`arch`
+
+if [ -n "$PERFSUITE_RUN_DIR" ]; then
+	# when $PERFSUITE_RUN_DIR is set to something, all the logs and temp files will be placed there
+	# --> the $PERFSUITE_RUN_DIR/perf_something/examples and $PERFSUITE_RUN_DIR/perf_something/logs
+	#     dirs will be used for that
+	export PERFSUITE_RUN_DIR=`readlink -f $PERFSUITE_RUN_DIR`
+	export CURRENT_TEST_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME"
+	export MAKE_TARGET_DIR="$CURRENT_TEST_DIR/examples"
+	test -d "$MAKE_TARGET_DIR" || mkdir -p "$MAKE_TARGET_DIR"
+	export LOGS_DIR="$PERFSUITE_RUN_DIR/$TEST_NAME/logs"
+	test -d "$LOGS_DIR" || mkdir -p "$LOGS_DIR"
+else
+	# when $PERFSUITE_RUN_DIR is not set, logs will be placed here
+	export CURRENT_TEST_DIR="."
+	export LOGS_DIR="."
+fi
+
+check_kprobes_available()
+{
+	test -e /sys/kernel/debug/tracing/kprobe_events
+}
+
+check_uprobes_available()
+{
+	test -e /sys/kernel/debug/tracing/uprobe_events
+}
+
+clear_all_probes()
+{
+	echo 0 > /sys/kernel/debug/tracing/events/enable
+	check_kprobes_available && echo > /sys/kernel/debug/tracing/kprobe_events
+	check_uprobes_available && echo > /sys/kernel/debug/tracing/uprobe_events
+}
+
+check_sdt_support()
+{
+	$CMD_PERF list sdt | grep sdt > /dev/null 2> /dev/null
+}
diff --git a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh
new file mode 100755
index 000000000000..a5d707efad85
--- /dev/null
+++ b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh
@@ -0,0 +1,278 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+#
+#	test_adding_kernel of perf_probe test
+#	Author: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+#	Author: Michael Petlan <mpetlan@redhat.com>
+#
+#	Description:
+#
+#		This test tests adding of probes, their correct listing
+#		and removing.
+#
+
+# include working environment
+. ../common/init.sh
+. ./settings.sh
+
+# shellcheck disable=SC2034 # the variable is later used after the working environment is included
+THIS_TEST_NAME=`basename $0 .sh`
+TEST_RESULT=0
+
+TEST_PROBE=${TEST_PROBE:-"inode_permission"}
+
+check_kprobes_available
+if [ $? -ne 0 ]; then
+	print_overall_skipped
+	exit 0
+fi
+
+
+### basic probe adding
+
+for opt in "" "-a" "--add"; do
+	clear_all_probes
+	$CMD_PERF probe $opt $TEST_PROBE 2> $LOGS_DIR/adding_kernel_add$opt.err
+	PERF_EXIT_CODE=$?
+
+	../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_add$opt.err
+	CHECK_EXIT_CODE=$?
+
+	print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding probe $TEST_PROBE :: $opt"
+	(( TEST_RESULT += $? ))
+done
+
+
+### listing added probe :: perf list
+
+# any added probes should appear in perf-list output
+$CMD_PERF list probe:\* > $LOGS_DIR/adding_kernel_list.log
+PERF_EXIT_CODE=$?
+
+../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "probe:${TEST_PROBE}(?:_\d+)?\s+\[Tracepoint event\]" "Metric Groups:" < $LOGS_DIR/adding_kernel_list.log
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing added probe :: perf list"
+(( TEST_RESULT += $? ))
+
+
+### listing added probe :: perf probe -l
+
+# '-l' should list all the added probes as well
+$CMD_PERF probe -l > $LOGS_DIR/adding_kernel_list-l.log
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "\s*probe:${TEST_PROBE}(?:_\d+)?\s+\(on ${TEST_PROBE}(?:[:\+]$RE_NUMBER_HEX)?@.+\)" < $LOGS_DIR/adding_kernel_list-l.log
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing added probe :: perf probe -l"
+(( TEST_RESULT += $? ))
+
+
+### using added probe
+
+$CMD_PERF stat -e probe:$TEST_PROBE\* -o $LOGS_DIR/adding_kernel_using_probe.log -- cat /proc/uptime > /dev/null
+PERF_EXIT_CODE=$?
+
+REGEX_STAT_HEADER="\s*Performance counter stats for \'cat /proc/uptime\':"
+REGEX_STAT_VALUES="\s*\d+\s+probe:$TEST_PROBE"
+# the value should be greater than 1
+REGEX_STAT_VALUE_NONZERO="\s*[1-9][0-9]*\s+probe:$TEST_PROBE"
+REGEX_STAT_TIME="\s*$RE_NUMBER\s+seconds (?:time elapsed|user|sys)"
+../common/check_all_lines_matched.pl "$REGEX_STAT_HEADER" "$REGEX_STAT_VALUES" "$REGEX_STAT_TIME" "$RE_LINE_COMMENT" "$RE_LINE_EMPTY" < $LOGS_DIR/adding_kernel_using_probe.log
+CHECK_EXIT_CODE=$?
+../common/check_all_patterns_found.pl "$REGEX_STAT_HEADER" "$REGEX_STAT_VALUE_NONZERO" "$REGEX_STAT_TIME" < $LOGS_DIR/adding_kernel_using_probe.log
+(( CHECK_EXIT_CODE += $? ))
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "using added probe"
+(( TEST_RESULT += $? ))
+
+
+### removing added probe
+
+# '-d' should remove the probe
+$CMD_PERF probe -d $TEST_PROBE\* 2> $LOGS_DIR/adding_kernel_removing.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_lines_matched.pl "Removed event: probe:$TEST_PROBE" < $LOGS_DIR/adding_kernel_removing.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "deleting added probe"
+(( TEST_RESULT += $? ))
+
+
+### listing removed probe
+
+# removed probes should NOT appear in perf-list output
+$CMD_PERF list probe:\* > $LOGS_DIR/adding_kernel_list_removed.log
+PERF_EXIT_CODE=$?
+
+../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" < $LOGS_DIR/adding_kernel_list_removed.log
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing removed probe (should NOT be listed)"
+(( TEST_RESULT += $? ))
+
+
+### dry run
+
+# the '-n' switch should run it in dry mode
+$CMD_PERF probe -n --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_dryrun.err
+PERF_EXIT_CODE=$?
+
+# check for the output (should be the same as usual)
+../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_dryrun.err
+CHECK_EXIT_CODE=$?
+
+# check that no probe was added in real
+! ( $CMD_PERF probe -l | grep "probe:$TEST_PROBE" )
+(( CHECK_EXIT_CODE += $? ))
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "dry run :: adding probe"
+(( TEST_RESULT += $? ))
+
+
+### force-adding probes
+
+# when using '--force' a probe should be added even if it is already there
+$CMD_PERF probe --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_forceadd_01.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_forceadd_01.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: first probe adding"
+(( TEST_RESULT += $? ))
+
+# adding existing probe without '--force' should fail
+! $CMD_PERF probe --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_forceadd_02.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "Error: event \"$TEST_PROBE\" already exists." "Error: Failed to add events." < $LOGS_DIR/adding_kernel_forceadd_02.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: second probe adding (without force)"
+(( TEST_RESULT += $? ))
+
+# adding existing probe with '--force' should pass
+NO_OF_PROBES=`$CMD_PERF probe -l | wc -l`
+$CMD_PERF probe --force --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_forceadd_03.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "Added new events?:" "probe:${TEST_PROBE}_${NO_OF_PROBES}" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_forceadd_03.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: second probe adding (with force)"
+(( TEST_RESULT += $? ))
+
+
+### using doubled probe
+
+# since they are the same, they should produce the same results
+$CMD_PERF stat -e probe:$TEST_PROBE -e probe:${TEST_PROBE}_${NO_OF_PROBES} -x';' -o $LOGS_DIR/adding_kernel_using_two.log -- bash -c 'cat /proc/cpuinfo > /dev/null'
+PERF_EXIT_CODE=$?
+
+REGEX_LINE="$RE_NUMBER;+probe:${TEST_PROBE}_?(?:$NO_OF_PROBES)?;$RE_NUMBER;$RE_NUMBER"
+../common/check_all_lines_matched.pl "$REGEX_LINE" "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" < $LOGS_DIR/adding_kernel_using_two.log
+CHECK_EXIT_CODE=$?
+
+VALUE_1=`grep "$TEST_PROBE;" $LOGS_DIR/adding_kernel_using_two.log | awk -F';' '{print $1}'`
+VALUE_2=`grep "${TEST_PROBE}_${NO_OF_PROBES};" $LOGS_DIR/adding_kernel_using_two.log | awk -F';' '{print $1}'`
+
+test $VALUE_1 -eq $VALUE_2
+(( CHECK_EXIT_CODE += $? ))
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "using doubled probe"
+
+
+### removing multiple probes
+
+# using wildcards should remove all matching probes
+$CMD_PERF probe --del \* 2> $LOGS_DIR/adding_kernel_removing_wildcard.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_lines_matched.pl "Removed event: probe:$TEST_PROBE" "Removed event: probe:${TEST_PROBE}_1" < $LOGS_DIR/adding_kernel_removing_wildcard.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "removing multiple probes"
+(( TEST_RESULT += $? ))
+
+
+### wildcard adding support
+
+$CMD_PERF probe -nf --max-probes=512 -a 'vfs_* $params' 2> $LOGS_DIR/adding_kernel_adding_wildcard.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "probe:vfs_mknod" "probe:vfs_create" "probe:vfs_rmdir" "probe:vfs_link" "probe:vfs_write" < $LOGS_DIR/adding_kernel_adding_wildcard.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "wildcard adding support"
+(( TEST_RESULT += $? ))
+
+
+### non-existing variable
+
+# perf probe should survive a non-existing variable probing attempt
+{ $CMD_PERF probe 'vfs_read somenonexistingrandomstuffwhichisalsoprettylongorevenlongertoexceed64' ; } 2> $LOGS_DIR/adding_kernel_nonexisting.err
+PERF_EXIT_CODE=$?
+
+# the exitcode should not be 0 or segfault
+test $PERF_EXIT_CODE -ne 139 -a $PERF_EXIT_CODE -ne 0
+PERF_EXIT_CODE=$?
+
+# check that the error message is reasonable
+../common/check_all_patterns_found.pl "Failed to find" "somenonexistingrandomstuffwhichisalsoprettylongorevenlongertoexceed64" < $LOGS_DIR/adding_kernel_nonexisting.err
+CHECK_EXIT_CODE=$?
+../common/check_all_patterns_found.pl "in this function|at this address" "Error" "Failed to add events" < $LOGS_DIR/adding_kernel_nonexisting.err
+(( CHECK_EXIT_CODE += $? ))
+../common/check_all_lines_matched.pl "Failed to find" "Error" "Probe point .+ not found" "optimized out" "Use.+\-\-range option to show.+location range" < $LOGS_DIR/adding_kernel_nonexisting.err
+(( CHECK_EXIT_CODE += $? ))
+../common/check_no_patterns_found.pl "$RE_SEGFAULT" < $LOGS_DIR/adding_kernel_nonexisting.err
+(( CHECK_EXIT_CODE += $? ))
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "non-existing variable"
+(( TEST_RESULT += $? ))
+
+
+### function with return value
+
+# adding probe with return value
+$CMD_PERF probe --add "$TEST_PROBE%return \$retval" 2> $LOGS_DIR/adding_kernel_func_retval_add.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE%return with \\\$retval" < $LOGS_DIR/adding_kernel_func_retval_add.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function with retval :: add"
+(( TEST_RESULT += $? ))
+
+# recording some data
+$CMD_PERF record -e probe:$TEST_PROBE\* -o $CURRENT_TEST_DIR/perf.data -- cat /proc/cpuinfo > /dev/null 2> $LOGS_DIR/adding_kernel_func_retval_record.err
+PERF_EXIT_CODE=$?
+
+../common/check_all_patterns_found.pl "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/adding_kernel_func_retval_record.err
+CHECK_EXIT_CODE=$?
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function with retval :: record"
+(( TEST_RESULT += $? ))
+
+# perf script should report the function calls with the correct arg values
+$CMD_PERF script -i $CURRENT_TEST_DIR/perf.data > $LOGS_DIR/adding_kernel_func_retval_script.log
+PERF_EXIT_CODE=$?
+
+REGEX_SCRIPT_LINE="\s*cat\s+$RE_NUMBER\s+\[$RE_NUMBER\]\s+$RE_NUMBER:\s+probe:$TEST_PROBE\w*:\s+\($RE_NUMBER_HEX\s+<\-\s+$RE_NUMBER_HEX\)\s+arg1=$RE_NUMBER_HEX"
+../common/check_all_lines_matched.pl "$REGEX_SCRIPT_LINE" < $LOGS_DIR/adding_kernel_func_retval_script.log
+CHECK_EXIT_CODE=$?
+../common/check_all_patterns_found.pl "$REGEX_SCRIPT_LINE" < $LOGS_DIR/adding_kernel_func_retval_script.log
+(( CHECK_EXIT_CODE += $? ))
+
+print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function argument probing :: script"
+(( TEST_RESULT += $? ))
+
+
+clear_all_probes
+
+# print overall results
+print_overall_results "$TEST_RESULT"
+exit $?
diff --git a/tools/perf/tests/shell/common/check_all_lines_matched.pl b/tools/perf/tests/shell/common/check_all_lines_matched.pl
new file mode 100755
index 000000000000..fded48959a3f
--- /dev/null
+++ b/tools/perf/tests/shell/common/check_all_lines_matched.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+# SPDX-License-Identifier: GPL-2.0
+
+@regexps = @ARGV;
+
+$max_printed_lines = 20;
+$max_printed_lines = $ENV{TESTLOG_ERR_MSG_MAX_LINES} if (defined $ENV{TESTLOG_ERR_MSG_MAX_LINES});
+
+$quiet = 1;
+$quiet = 0 if (defined $ENV{TESTLOG_VERBOSITY} && $ENV{TESTLOG_VERBOSITY} ge 2);
+
+$passed = 1;
+$lines_printed = 0;
+
+while (<STDIN>)
+{
+	s/\n//;
+
+	$line_matched = 0;
+	for $r (@regexps)
+	{
+		if (/$r/)
+		{
+			$line_matched = 1;
+			last;
+		}
+	}
+
+	unless ($line_matched)
+	{
+		if ($lines_printed++ < $max_printed_lines)
+		{
+			print "Line did not match any pattern: \"$_\"\n" unless $quiet;
+		}
+		$passed = 0;
+	}
+}
+
+exit ($passed == 0);
diff --git a/tools/perf/tests/shell/common/check_all_patterns_found.pl b/tools/perf/tests/shell/common/check_all_patterns_found.pl
new file mode 100755
index 000000000000..11bdf1d3460a
--- /dev/null
+++ b/tools/perf/tests/shell/common/check_all_patterns_found.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+# SPDX-License-Identifier: GPL-2.0
+
+@regexps = @ARGV;
+
+$quiet = 1;
+$quiet = 0 if (defined $ENV{TESTLOG_VERBOSITY} && $ENV{TESTLOG_VERBOSITY} ge 2);
+
+%found = ();
+$passed = 1;
+
+while (<STDIN>)
+{
+	s/\n//;
+
+	for $r (@regexps)
+	{
+		if (/$r/)
+		{
+			$found{$r} = 1;	# FIXME: maybe add counters -- how many times was the regexp matched
+		}
+	}
+}
+
+for $r (@regexps)
+{
+	unless (exists $found{$r})
+	{
+		print "Regexp not found: \"$r\"\n" unless $quiet;
+		$passed = 0;
+	}
+}
+
+exit ($passed == 0);
diff --git a/tools/perf/tests/shell/common/check_no_patterns_found.pl b/tools/perf/tests/shell/common/check_no_patterns_found.pl
new file mode 100755
index 000000000000..770999e87a5f
--- /dev/null
+++ b/tools/perf/tests/shell/common/check_no_patterns_found.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+# SPDX-License-Identifier: GPL-2.0
+
+@regexps = @ARGV;
+
+$quiet = 1;
+$quiet = 0 if (defined $ENV{TESTLOG_VERBOSITY} && $ENV{TESTLOG_VERBOSITY} ge 2);
+
+%found = ();
+$passed = 1;
+
+while (<STDIN>)
+{
+	s/\n//;
+
+	for $r (@regexps)
+	{
+		if (/$r/)
+		{
+			$found{$r} = 1;
+		}
+	}
+}
+
+for $r (@regexps)
+{
+	if (exists $found{$r})
+	{
+		print "Regexp found: \"$r\"\n" unless $quiet;
+		$passed = 0;
+	}
+}
+
+exit ($passed == 0);
diff --git a/tools/perf/tests/shell/common/init.sh b/tools/perf/tests/shell/common/init.sh
new file mode 100644
index 000000000000..aadeaf782e03
--- /dev/null
+++ b/tools/perf/tests/shell/common/init.sh
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+#	init.sh
+#	Author: Michael Petlan <mpetlan@redhat.com>
+#
+#	Description:
+#
+#		This file should be used for initialization of basic functions
+#	for checking, reporting results etc.
+#
+#
+
+
+. ../common/settings.sh
+. ../common/patterns.sh
+
+THIS_TEST_NAME=`basename $0 .sh`
+
+_echo()
+{
+	test "$TESTLOG_VERBOSITY" -ne 0 && echo -e "$@"
+}
+
+print_results()
+{
+	PERF_RETVAL="$1"; shift
+	CHECK_RETVAL="$1"; shift
+	FAILURE_REASON=""
+	TASK_COMMENT="$@"
+	if [ $PERF_RETVAL -eq 0 -a $CHECK_RETVAL -eq 0 ]; then
+		_echo "$MPASS-- [ PASS ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT"
+		return 0
+	else
+		if [ $PERF_RETVAL -ne 0 ]; then
+			FAILURE_REASON="command exitcode"
+		fi
+		if [ $CHECK_RETVAL -ne 0 ]; then
+			test -n "$FAILURE_REASON" && FAILURE_REASON="$FAILURE_REASON + "
+			FAILURE_REASON="$FAILURE_REASON""output regexp parsing"
+		fi
+		_echo "$MFAIL-- [ FAIL ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT ($FAILURE_REASON)"
+		return 1
+	fi
+}
+
+print_overall_results()
+{
+	RETVAL="$1"; shift
+	if [ $RETVAL -eq 0 ]; then
+		_echo "$MALLPASS## [ PASS ] ##$MEND $TEST_NAME :: $THIS_TEST_NAME SUMMARY"
+	else
+		_echo "$MALLFAIL## [ FAIL ] ##$MEND $TEST_NAME :: $THIS_TEST_NAME SUMMARY :: $RETVAL failures found"
+	fi
+	return $RETVAL
+}
+
+print_testcase_skipped()
+{
+	TASK_COMMENT="$@"
+	_echo "$MSKIP-- [ SKIP ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $TASK_COMMENT :: testcase skipped"
+	return 0
+}
+
+print_overall_skipped()
+{
+	_echo "$MSKIP## [ SKIP ] ##$MEND $TEST_NAME :: $THIS_TEST_NAME :: testcase skipped"
+	return 0
+}
+
+print_warning()
+{
+	WARN_COMMENT="$@"
+	_echo "$MWARN-- [ WARN ] --$MEND $TEST_NAME :: $THIS_TEST_NAME :: $WARN_COMMENT"
+	return 0
+}
+
+# this function should skip a testcase if the testsuite is not run in
+# a runmode that fits the testcase --> if the suite runs in BASIC mode
+# all STANDARD and EXPERIMENTAL testcases will be skipped; if the suite
+# runs in STANDARD mode, all EXPERIMENTAL testcases will be skipped and
+# if the suite runs in EXPERIMENTAL mode, nothing is skipped
+consider_skipping()
+{
+	TESTCASE_RUNMODE="$1"
+	# the runmode of a testcase needs to be at least the current suite's runmode
+	if [ $PERFTOOL_TESTSUITE_RUNMODE -lt $TESTCASE_RUNMODE ]; then
+		print_overall_skipped
+		exit 0
+	fi
+}
+
+detect_baremetal()
+{
+	# return values:
+	# 0 = bare metal
+	# 1 = virtualization detected
+	# 2 = unknown state
+	VIRT=`systemd-detect-virt 2>/dev/null`
+	test $? -eq 127 && return 2
+	test "$VIRT" = "none"
+}
+
+detect_intel()
+{
+	# return values:
+	# 0 = is Intel
+	# 1 = is not Intel or unknown
+	grep "vendor_id" < /proc/cpuinfo | grep -q "GenuineIntel"
+}
+
+detect_amd()
+{
+	# return values:
+	# 0 = is AMD
+	# 1 = is not AMD or unknown
+	grep "vendor_id" < /proc/cpuinfo | grep -q "AMD"
+}
diff --git a/tools/perf/tests/shell/common/patterns.sh b/tools/perf/tests/shell/common/patterns.sh
new file mode 100644
index 000000000000..21dab25c7b7f
--- /dev/null
+++ b/tools/perf/tests/shell/common/patterns.sh
@@ -0,0 +1,268 @@
+# SPDX-License-Identifier: GPL-2.0
+
+export RE_NUMBER="[0-9\.]+"
+# Number
+# Examples:
+#    123.456
+
+
+export RE_NUMBER_HEX="[0-9A-Fa-f]+"
+# Hexadecimal number
+# Examples:
+#    1234
+#    a58d
+#    aBcD
+#    deadbeef
+
+
+export RE_DATE_YYYYMMDD="[0-9]{4}-(?:(?:01|03|05|07|08|10|12)-(?:[0-2][0-9]|3[0-1])|02-[0-2][0-9]|(?:(?:04|06|09|11)-(?:[0-2][0-9]|30)))"
+# Date in YYYY-MM-DD form
+# Examples:
+#    1990-02-29
+#    0015-07-31
+#    2456-12-31
+#!   2012-13-01
+#!   1963-09-31
+
+
+export RE_TIME="(?:[0-1][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]"
+# Time
+# Examples:
+#    15:12:27
+#    23:59:59
+#!   24:00:00
+#!   11:25:60
+#!   17:60:15
+
+
+export RE_DATE_TIME="\w+\s+\w+\s+$RE_NUMBER\s+$RE_TIME\s+$RE_NUMBER"
+# Time and date
+# Examples:
+#    Wed Feb 12 10:46:26 2020
+#    Mon Mar  2 13:27:06 2020
+#!   St úno 12 10:57:21 CET 2020
+#!   Po úno 14 15:17:32 2010
+
+
+export RE_ADDRESS="0x$RE_NUMBER_HEX"
+# Memory address
+# Examples:
+#    0x123abc
+#    0xffffffff9abe8ae8
+#    0x0
+
+
+export RE_ADDRESS_NOT_NULL="0x[0-9A-Fa-f]*[1-9A-Fa-f]+[0-9A-Fa-f]*"
+# Memory address (not NULL)
+# Examples:
+#    0xffffffff9abe8ae8
+#!   0x0
+#!   0x0000000000000000
+
+export RE_PROCESS_PID="[^\/]+\/\d+"
+# A process with PID
+# Example:
+#    sleep/4102
+#    test_overhead./866185
+#    in:imjournal/1096
+#    random#$& test/866607
+
+export RE_EVENT_ANY="[\w\-\:\/_=,]+"
+# Name of any event (universal)
+# Examples:
+#    cpu-cycles
+#    cpu/event=12,umask=34/
+#    r41e1
+#    nfs:nfs_getattr_enter
+
+
+export RE_EVENT="[\w\-:_]+"
+# Name of an usual event
+# Examples:
+#    cpu-cycles
+
+
+export RE_EVENT_RAW="r$RE_NUMBER_HEX"
+# Specification of a raw event
+# Examples:
+#    r41e1
+#    r1a
+
+
+export RE_EVENT_CPU="cpu/(\w+=$RE_NUMBER_HEX,?)+/p*"
+# Specification of a CPU event
+# Examples:
+#    cpu/event=12,umask=34/pp
+
+
+export RE_EVENT_UNCORE="uncore/[\w_]+/"
+# Specification of an uncore event
+# Examples:
+#    uncore/qhl_request_local_reads/
+
+
+export RE_EVENT_SUBSYSTEM="[\w\-]+:[\w\-]+"
+# Name of an event from subsystem
+# Examples:
+#    ext4:ext4_ordered_write_end
+#    sched:sched_switch
+
+
+export RE_FILE_NAME="[\w\+\.-]+"
+# A filename
+# Examples:
+#    libstdc++.so.6
+#!   some/path
+
+
+export RE_PATH_ABSOLUTE="(?:\/$RE_FILE_NAME)+"
+# A full filepath
+# Examples:
+#    /usr/lib64/somelib.so.5.4.0
+#    /lib/modules/4.3.0-rc5/kernel/fs/xfs/xfs.ko
+#    /usr/bin/mv
+#!   some/relative/path
+#!   ./some/relative/path
+
+
+export RE_PATH="(?:$RE_FILE_NAME)?$RE_PATH_ABSOLUTE"
+# A filepath
+# Examples:
+#    /usr/lib64/somelib.so.5.4.0
+#    /lib/modules/4.3.0-rc5/kernel/fs/xfs/xfs.ko
+#    ./.emacs
+#    src/fs/file.c
+
+
+export RE_DSO="(?:$RE_PATH_ABSOLUTE(?: \(deleted\))?|\[kernel\.kallsyms\]|\[unknown\]|\[vdso\]|\[kernel\.vmlinux\][\.\w]*)"
+# A DSO name in various result tables
+# Examples:
+#    /usr/lib64/somelib.so.5.4.0
+#    /usr/bin/somebinart (deleted)
+#    /lib/modules/4.3.0-rc5/kernel/fs/xfs/xfs.ko
+#    [kernel.kallsyms]
+#    [kernel.vmlinux]
+#    [vdso]
+#    [unknown]
+
+
+export RE_LINE_COMMENT="^#.*"
+# A comment line
+# Examples:
+#    # Started on Thu Sep 10 11:43:00 2015
+
+
+export RE_LINE_EMPTY="^\s*$"
+# An empty line with possible whitespaces
+# Examples:
+#
+
+
+export RE_LINE_RECORD1="^\[\s+perf\s+record:\s+Woken up $RE_NUMBER times? to write data\s+\].*$"
+# The first line of perf-record "OK" output
+# Examples:
+#    [ perf record: Woken up 1 times to write data ]
+
+
+export RE_LINE_RECORD2="^\[\s+perf\s+record:\s+Captured and wrote $RE_NUMBER\s*MB\s+(?:[\w\+\.-]*(?:$RE_PATH)?\/)?perf\.data(?:\.\d+)?\s*\(~?$RE_NUMBER samples\)\s+\].*$"
+# The second line of perf-record "OK" output
+# Examples:
+#    [ perf record: Captured and wrote 0.405 MB perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB perf.data (~109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB /some/temp/dir/perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB ./perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB ./perf.data.3 (109 samples) ]
+
+
+export RE_LINE_RECORD2_TOLERANT="^\[\s+perf\s+record:\s+Captured and wrote $RE_NUMBER\s*MB\s+(?:[\w\+\.-]*(?:$RE_PATH)?\/)?perf\.data(?:\.\d+)?\s*(?:\(~?$RE_NUMBER samples\))?\s+\].*$"
+# The second line of perf-record "OK" output, even no samples is OK here
+# Examples:
+#    [ perf record: Captured and wrote 0.405 MB perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB perf.data (~109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB /some/temp/dir/perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB ./perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB ./perf.data.3 (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB perf.data ]
+
+
+export RE_LINE_RECORD2_TOLERANT_FILENAME="^\[\s+perf\s+record:\s+Captured and wrote $RE_NUMBER\s*MB\s+(?:[\w\+\.-]*(?:$RE_PATH)?\/)?perf\w*\.data(?:\.\d+)?\s*\(~?$RE_NUMBER samples\)\s+\].*$"
+# The second line of perf-record "OK" output
+# Examples:
+#    [ perf record: Captured and wrote 0.405 MB perf.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB perf_ls.data (~109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB perf_aNyCaSe.data (109 samples) ]
+#    [ perf record: Captured and wrote 0.405 MB ./perfdata.data.3 (109 samples) ]
+#!    [ perf record: Captured and wrote 0.405 MB /some/temp/dir/my_own.data (109 samples) ]
+#!    [ perf record: Captured and wrote 0.405 MB ./UPPERCASE.data (109 samples) ]
+#!    [ perf record: Captured and wrote 0.405 MB ./aNyKiNDoF.data.3 (109 samples) ]
+#!    [ perf record: Captured and wrote 0.405 MB perf.data ]
+
+
+export RE_LINE_TRACE_FULL="^\s*$RE_NUMBER\s*\(\s*$RE_NUMBER\s*ms\s*\):\s*$RE_PROCESS_PID\s+.*\)\s+=\s+(:?\-?$RE_NUMBER|0x$RE_NUMBER_HEX).*$"
+# A line of perf-trace output
+# Examples:
+#    0.115 ( 0.005 ms): sleep/4102 open(filename: 0xd09e2ab2, flags: CLOEXEC                             ) = 3
+#    0.157 ( 0.005 ms): sleep/4102 mmap(len: 3932736, prot: EXEC|READ, flags: PRIVATE|DENYWRITE, fd: 3   ) = 0x7f89d0605000
+#!    0.115 ( 0.005 ms): sleep/4102 open(filename: 0xd09e2ab2, flags: CLOEXEC                             ) =
+
+export RE_LINE_TRACE_ONE_PROC="^\s*$RE_NUMBER\s*\(\s*$RE_NUMBER\s*ms\s*\):\s*\w+\(.*\)\s+=\s+(?:\-?$RE_NUMBER|0x$RE_NUMBER_HEX).*$"
+# A line of perf-trace output
+# Examples:
+#    0.115 ( 0.005 ms): open(filename: 0xd09e2ab2, flags: CLOEXEC                             ) = 3
+#    0.157 ( 0.005 ms): mmap(len: 3932736, prot: EXEC|READ, flags: PRIVATE|DENYWRITE, fd: 3   ) = 0x7f89d0605000
+#!    0.115 ( 0.005 ms): open(filename: 0xd09e2ab2, flags: CLOEXEC                             ) =
+
+export RE_LINE_TRACE_CONTINUED="^\s*(:?$RE_NUMBER|\?)\s*\(\s*($RE_NUMBER\s*ms\s*)?\):\s*($RE_PROCESS_PID\s*)?\.\.\.\s*\[continued\]:\s+\w+\(\).*\s+=\s+(?:\-?$RE_NUMBER|0x$RE_NUMBER_HEX).*$"
+# A line of perf-trace output
+# Examples:
+#    0.000 ( 0.000 ms):  ... [continued]: nanosleep()) = 0
+#    0.000 ( 0.000 ms):  ... [continued]: nanosleep()) = 0x00000000
+#    ? (         ): packagekitd/94838  ... [continued]: poll())                                             = 0 (Timeout)
+#!    0.000 ( 0.000 ms):  ... [continued]: nanosleep()) =
+
+export RE_LINE_TRACE_UNFINISHED="^\s*$RE_NUMBER\s*\(\s*\):\s*$RE_PROCESS_PID\s+.*\)\s+\.\.\.\s*$"
+# A line of perf-trace output
+# Examples:
+#    901.040 (         ): in:imjournal/1096 ppoll(ufds: 0x7f701a5adb70, nfds: 1, tsp: 0x7f701a5adaf0, sigsetsize: 8) ...
+#    613.727 (         ): gmain/1099 poll(ufds: 0x56248f6b64b0, nfds: 2, timeout_msecs: 3996)           ...
+
+export RE_LINE_TRACE_SUMMARY_HEADER="\s*syscall\s+calls\s+(?:errors\s+)?total\s+min\s+avg\s+max\s+stddev"
+# A header of a perf-trace summary table
+# Example:
+#    syscall            calls    total       min       avg       max      stddev
+#    syscall            calls  errors  total       min       avg       max       stddev
+
+
+export RE_LINE_TRACE_SUMMARY_CONTENT="^\s*\w+\s+(?:$RE_NUMBER\s+){5,6}$RE_NUMBER%"
+# A line of a perf-trace summary table
+# Example:
+#    open                   3     0.017     0.005     0.006     0.007     10.90%
+#    openat                 2      0     0.017     0.008     0.009     0.010     12.29%
+
+
+export RE_LINE_REPORT_CONTENT="^\s+$RE_NUMBER%\s+\w+\s+\S+\s+\S+\s+\S+" # FIXME
+# A line from typicap perf report --stdio output
+# Example:
+#     100.00%  sleep    [kernel.vmlinux]  [k] syscall_return_slowpath
+
+
+export RE_TASK="\s+[\w~\/ \.\+:#-]+(?:\[-1(?:\/\d+)?\]|\[\d+(?:\/\d+)?\])"
+# A name of a task used for perf sched timehist -s
+# Example:
+#     sleep[62755]
+#     runtest.sh[62762]
+#     gmain[705/682]
+#     xfsaild/dm-0[495]
+#     kworker/u8:1-ev[62714]
+#     :-1[-1/62756]
+#     :-1[-1]
+#     :-1[62756]
+
+
+export RE_SEGFAULT=".*(?:Segmentation\sfault|SIGSEGV|\score\s|dumped|segfault).*"
+# Possible variations of the segfault message
+# Example:
+#     /bin/bash: line 1:    32 Segmentation fault      timeout 15s
+#     Segmentation fault (core dumped)
+#     Program terminated with signal SIGSEGV
+#!     WARNING: 12323431 isn't a 'cpu_core', please use a CPU list in the 'cpu_core' range (0-15)
diff --git a/tools/perf/tests/shell/common/settings.sh b/tools/perf/tests/shell/common/settings.sh
new file mode 100644
index 000000000000..361641dbaaad
--- /dev/null
+++ b/tools/perf/tests/shell/common/settings.sh
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+#	settings.sh
+#	Author: Michael Petlan <mpetlan@redhat.com>
+#
+#	Description:
+#
+#		This file contains global settings for the whole testsuite.
+#	Its purpose is to make it easier when it is necessary i.e. to
+#	change the usual sample command which is used in all of the tests
+#	in many files.
+#
+#		This file is intended to be sourced in the tests.
+#
+
+#### which perf to use in the testing
+export CMD_PERF=${CMD_PERF:-`which perf`}
+
+#### basic programs examinated by perf
+export CMD_BASIC_SLEEP="sleep 0.1"
+export CMD_QUICK_SLEEP="sleep 0.01"
+export CMD_LONGER_SLEEP="sleep 2"
+export CMD_DOUBLE_LONGER_SLEEP="sleep 4"
+export CMD_VERY_LONG_SLEEP="sleep 30"
+export CMD_SIMPLE="true"
+
+#### testsuite run mode
+# define constants:
+export RUNMODE_BASIC=0
+export RUNMODE_STANDARD=1
+export RUNMODE_EXPERIMENTAL=2
+# default runmode is STANDARD
+export PERFTOOL_TESTSUITE_RUNMODE=${PERFTOOL_TESTSUITE_RUNMODE:-$RUNMODE_STANDARD}
+
+#### common settings
+export TESTLOG_VERBOSITY=${TESTLOG_VERBOSITY:-2}
+export TESTLOG_FORCE_COLOR=${TESTLOG_FORCE_COLOR:-n}
+export TESTLOG_ERR_MSG_MAX_LINES=${TESTLOG_ERR_MSG_MAX_LINES:-20}
+export TESTLOG_CLEAN=${TESTLOG_CLEAN:-y}
+
+#### other environment-related settings
+export TEST_IGNORE_MISSING_PMU=${TEST_IGNORE_MISSING_PMU:-n}
+
+#### clear locale
+export LC_ALL=C
+
+#### colors
+if [ -t 1 -o "$TESTLOG_FORCE_COLOR" = "yes" ]; then
+	export MPASS="\e[32m"
+	export MALLPASS="\e[1;32m"
+	export MFAIL="\e[31m"
+	export MALLFAIL="\e[1;31m"
+	export MWARN="\e[1;35m"
+	export MSKIP="\e[33m"
+	export MHIGH="\e[1;33m"
+	export MEND="\e[m"
+else
+	export MPASS=""
+	export MALLPASS=""
+	export MFAIL=""
+	export MALLFAIL=""
+	export MWARN=""
+	export MSKIP=""
+	export MHIGH=""
+	export MEND=""
+fi
+
+
+#### test parametrization
+if [ ! -d ./common ]; then
+	# set parameters based on runmode
+	if [ -f ../common/parametrization.$PERFTOOL_TESTSUITE_RUNMODE.sh ]; then
+		. ../common/parametrization.$PERFTOOL_TESTSUITE_RUNMODE.sh
+	fi
+	# if some parameters haven't been set until now, set them to default
+	if [ -f ../common/parametrization.sh ]; then
+		. ../common/parametrization.sh
+	fi
+fi
diff --git a/tools/perf/tests/shell/lib/perf_has_symbol.sh b/tools/perf/tests/shell/lib/perf_has_symbol.sh
index 5d59c32ae3e7..561c93b75d77 100644
--- a/tools/perf/tests/shell/lib/perf_has_symbol.sh
+++ b/tools/perf/tests/shell/lib/perf_has_symbol.sh
@@ -3,7 +3,7 @@
 
 perf_has_symbol()
 {
-	if perf test -vv "Symbols" 2>&1 | grep "[[:space:]]$1$"; then
+	if perf test -vv -F "Symbols" 2>&1 | grep "[[:space:]]$1$"; then
 		echo "perf does have symbol '$1'"
 		return 0
 	fi
diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py
index ea55d5ea1ced..abc1fd737782 100644
--- a/tools/perf/tests/shell/lib/perf_json_output_lint.py
+++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py
@@ -15,6 +15,7 @@ ap.add_argument('--event', action='store_true')
 ap.add_argument('--per-core', action='store_true')
 ap.add_argument('--per-thread', action='store_true')
 ap.add_argument('--per-cache', action='store_true')
+ap.add_argument('--per-cluster', action='store_true')
 ap.add_argument('--per-die', action='store_true')
 ap.add_argument('--per-node', action='store_true')
 ap.add_argument('--per-socket', action='store_true')
@@ -49,6 +50,7 @@ def check_json_output(expected_items):
       'cgroup': lambda x: True,
       'cpu': lambda x: isint(x),
       'cache': lambda x: True,
+      'cluster': lambda x: True,
       'die': lambda x: True,
       'event': lambda x: True,
       'event-runtime': lambda x: isfloat(x),
@@ -88,7 +90,7 @@ try:
     expected_items = 7
   elif args.interval or args.per_thread or args.system_wide_no_aggr:
     expected_items = 8
-  elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cache:
+  elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cluster or args.per_cache:
     expected_items = 9
   else:
     # If no option is specified, don't check the number of items.
diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py
index 50a34a9cc040..a2d235252183 100644
--- a/tools/perf/tests/shell/lib/perf_metric_validation.py
+++ b/tools/perf/tests/shell/lib/perf_metric_validation.py
@@ -1,4 +1,4 @@
-#SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0
 import re
 import csv
 import json
@@ -6,36 +6,61 @@ import argparse
 from pathlib import Path
 import subprocess
 
+
+class TestError:
+    def __init__(self, metric: list[str], wl: str, value: list[float], low: float, up=float('nan'), description=str()):
+        self.metric: list = metric  # multiple metrics in relationship type tests
+        self.workloads = [wl]  # multiple workloads possible
+        self.collectedValue: list = value
+        self.valueLowBound = low
+        self.valueUpBound = up
+        self.description = description
+
+    def __repr__(self) -> str:
+        if len(self.metric) > 1:
+            return "\nMetric Relationship Error: \tThe collected value of metric {0}\n\
+                \tis {1} in workload(s): {2} \n\
+                \tbut expected value range is [{3}, {4}]\n\
+                \tRelationship rule description: \'{5}\'".format(self.metric, self.collectedValue, self.workloads,
+                                                                 self.valueLowBound, self.valueUpBound, self.description)
+        elif len(self.collectedValue) == 0:
+            return "\nNo Metric Value Error: \tMetric {0} returns with no value \n\
+                    \tworkload(s): {1}".format(self.metric, self.workloads)
+        else:
+            return "\nWrong Metric Value Error: \tThe collected value of metric {0}\n\
+                    \tis {1} in workload(s): {2}\n\
+                    \tbut expected value range is [{3}, {4}]"\
+                        .format(self.metric, self.collectedValue, self.workloads,
+                                self.valueLowBound, self.valueUpBound)
+
+
 class Validator:
     def __init__(self, rulefname, reportfname='', t=5, debug=False, datafname='', fullrulefname='', workload='true', metrics=''):
         self.rulefname = rulefname
         self.reportfname = reportfname
         self.rules = None
-        self.collectlist:str = metrics
+        self.collectlist: str = metrics
         self.metrics = self.__set_metrics(metrics)
         self.skiplist = set()
         self.tolerance = t
 
         self.workloads = [x for x in workload.split(",") if x]
-        self.wlidx = 0 # idx of current workloads
-        self.allresults = dict() # metric results of all workload
-        self.allignoremetrics = dict() # metrics with no results or negative results
-        self.allfailtests = dict()
+        self.wlidx = 0  # idx of current workloads
+        self.allresults = dict()  # metric results of all workload
         self.alltotalcnt = dict()
         self.allpassedcnt = dict()
-        self.allerrlist = dict()
 
-        self.results = dict() # metric results of current workload
+        self.results = dict()  # metric results of current workload
         # vars for test pass/failure statistics
-        self.ignoremetrics= set() # metrics with no results or negative results, neg result counts as a failed test
-        self.failtests = dict()
+        # metrics with no results or negative results, neg result counts failed tests
+        self.ignoremetrics = set()
         self.totalcnt = 0
         self.passedcnt = 0
         # vars for errors
         self.errlist = list()
 
         # vars for Rule Generator
-        self.pctgmetrics = set() # Percentage rule
+        self.pctgmetrics = set()  # Percentage rule
 
         # vars for debug
         self.datafname = datafname
@@ -69,10 +94,10 @@ class Validator:
                       ensure_ascii=True,
                       indent=4)
 
-    def get_results(self, idx:int = 0):
+    def get_results(self, idx: int = 0):
         return self.results[idx]
 
-    def get_bounds(self, lb, ub, error, alias={}, ridx:int = 0) -> list:
+    def get_bounds(self, lb, ub, error, alias={}, ridx: int = 0) -> list:
         """
         Get bounds and tolerance from lb, ub, and error.
         If missing lb, use 0.0; missing ub, use float('inf); missing error, use self.tolerance.
@@ -85,7 +110,7 @@ class Validator:
                   tolerance, denormalized base on upper bound value
         """
         # init ubv and lbv to invalid values
-        def get_bound_value (bound, initval, ridx):
+        def get_bound_value(bound, initval, ridx):
             val = initval
             if isinstance(bound, int) or isinstance(bound, float):
                 val = bound
@@ -113,10 +138,10 @@ class Validator:
 
         return lbv, ubv, denormerr
 
-    def get_value(self, name:str, ridx:int = 0) -> list:
+    def get_value(self, name: str, ridx: int = 0) -> list:
         """
         Get value of the metric from self.results.
-        If result of this metric is not provided, the metric name will be added into self.ignoremetics and self.errlist.
+        If result of this metric is not provided, the metric name will be added into self.ignoremetics.
         All future test(s) on this metric will fail.
 
         @param name: name of the metric
@@ -142,7 +167,7 @@ class Validator:
         Check if metrics value are non-negative.
         One metric is counted as one test.
         Failure: when metric value is negative or not provided.
-        Metrics with negative value will be added into the self.failtests['PositiveValueTest'] and self.ignoremetrics.
+        Metrics with negative value will be added into self.ignoremetrics.
         """
         negmetric = dict()
         pcnt = 0
@@ -155,25 +180,27 @@ class Validator:
             else:
                 pcnt += 1
             tcnt += 1
+        # The first round collect_perf() run these metrics with simple workload
+        # "true". We give metrics a second chance with a longer workload if less
+        # than 20 metrics failed positive test.
         if len(rerun) > 0 and len(rerun) < 20:
             second_results = dict()
             self.second_test(rerun, second_results)
             for name, val in second_results.items():
-                if name not in negmetric: continue
+                if name not in negmetric:
+                    continue
                 if val >= 0:
                     del negmetric[name]
                     pcnt += 1
 
-        self.failtests['PositiveValueTest']['Total Tests'] = tcnt
-        self.failtests['PositiveValueTest']['Passed Tests'] = pcnt
         if len(negmetric.keys()):
             self.ignoremetrics.update(negmetric.keys())
-            negmessage = ["{0}(={1:.4f})".format(name, val) for name, val in negmetric.items()]
-            self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue': negmessage})
+            self.errlist.extend(
+                [TestError([m], self.workloads[self.wlidx], negmetric[m], 0) for m in negmetric.keys()])
 
         return
 
-    def evaluate_formula(self, formula:str, alias:dict, ridx:int = 0):
+    def evaluate_formula(self, formula: str, alias: dict, ridx: int = 0):
         """
         Evaluate the value of formula.
 
@@ -187,10 +214,11 @@ class Validator:
         sign = "+"
         f = str()
 
-        #TODO: support parenthesis?
+        # TODO: support parenthesis?
         for i in range(len(formula)):
             if i+1 == len(formula) or formula[i] in ('+', '-', '*', '/'):
-                s = alias[formula[b:i]] if i+1 < len(formula) else alias[formula[b:]]
+                s = alias[formula[b:i]] if i + \
+                    1 < len(formula) else alias[formula[b:]]
                 v = self.get_value(s, ridx)
                 if not v:
                     errs.append(s)
@@ -228,49 +256,49 @@ class Validator:
         alias = dict()
         for m in rule['Metrics']:
             alias[m['Alias']] = m['Name']
-        lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
-        val, f = self.evaluate_formula(rule['Formula'], alias, ridx=rule['RuleIndex'])
+        lbv, ubv, t = self.get_bounds(
+            rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
+        val, f = self.evaluate_formula(
+            rule['Formula'], alias, ridx=rule['RuleIndex'])
+
+        lb = rule['RangeLower']
+        ub = rule['RangeUpper']
+        if isinstance(lb, str):
+            if lb in alias:
+                lb = alias[lb]
+        if isinstance(ub, str):
+            if ub in alias:
+                ub = alias[ub]
+
         if val == -1:
-            self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Description':f})
+            self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [],
+                                lb, ub, rule['Description']))
         elif not self.check_bound(val, lbv, ubv, t):
-            lb = rule['RangeLower']
-            ub = rule['RangeUpper']
-            if isinstance(lb, str):
-                if lb in alias:
-                    lb = alias[lb]
-            if isinstance(ub, str):
-                if ub in alias:
-                    ub = alias[ub]
-            self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Formula':f,
-                                                                       'RangeLower': lb, 'LowerBoundValue': self.get_value(lb),
-                                                                       'RangeUpper': ub, 'UpperBoundValue':self.get_value(ub),
-                                                                       'ErrorThreshold': t, 'CollectedValue': val})
+            self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [val],
+                                lb, ub, rule['Description']))
         else:
             self.passedcnt += 1
-            self.failtests['RelationshipTest']['Passed Tests'] += 1
         self.totalcnt += 1
-        self.failtests['RelationshipTest']['Total Tests'] += 1
 
         return
 
-
     # Single Metric Test
-    def single_test(self, rule:dict):
+    def single_test(self, rule: dict):
         """
         Validate if the metrics are in the required value range.
         eg. lower_bound <= metrics_value <= upper_bound
         One metric is counted as one test in this type of test.
         One rule may include one or more metrics.
         Failure: when the metric value not provided or the value is outside the bounds.
-        This test updates self.total_cnt and records failed tests in self.failtest['SingleMetricTest'].
+        This test updates self.total_cnt.
 
         @param rule: dict with metrics to validate and the value range requirement
         """
-        lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
+        lbv, ubv, t = self.get_bounds(
+            rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
         metrics = rule['Metrics']
         passcnt = 0
         totalcnt = 0
-        faillist = list()
         failures = dict()
         rerun = list()
         for m in metrics:
@@ -286,25 +314,20 @@ class Validator:
             second_results = dict()
             self.second_test(rerun, second_results)
             for name, val in second_results.items():
-                if name not in failures: continue
+                if name not in failures:
+                    continue
                 if self.check_bound(val, lbv, ubv, t):
                     passcnt += 1
                     del failures[name]
                 else:
-                    failures[name] = val
+                    failures[name] = [val]
                     self.results[0][name] = val
 
         self.totalcnt += totalcnt
         self.passedcnt += passcnt
-        self.failtests['SingleMetricTest']['Total Tests'] += totalcnt
-        self.failtests['SingleMetricTest']['Passed Tests'] += passcnt
         if len(failures.keys()) != 0:
-            faillist = [{'MetricName':name, 'CollectedValue':val} for name, val in failures.items()]
-            self.failtests['SingleMetricTest']['Failed Tests'].append({'RuleIndex':rule['RuleIndex'],
-                                                                       'RangeLower': rule['RangeLower'],
-                                                                       'RangeUpper': rule['RangeUpper'],
-                                                                       'ErrorThreshold':rule['ErrorThreshold'],
-                                                                       'Failure':faillist})
+            self.errlist.extend([TestError([name], self.workloads[self.wlidx], val,
+                                rule['RangeLower'], rule['RangeUpper']) for name, val in failures.items()])
 
         return
 
@@ -312,19 +335,11 @@ class Validator:
         """
         Create final report and write into a JSON file.
         """
-        alldata = list()
-        for i in range(0, len(self.workloads)):
-            reportstas = {"Total Rule Count": self.alltotalcnt[i], "Passed Rule Count": self.allpassedcnt[i]}
-            data = {"Metric Validation Statistics": reportstas, "Tests in Category": self.allfailtests[i],
-                    "Errors":self.allerrlist[i]}
-            alldata.append({"Workload": self.workloads[i], "Report": data})
-
-        json_str = json.dumps(alldata, indent=4)
-        print("Test validation finished. Final report: ")
-        print(json_str)
+        print(self.errlist)
 
         if self.debug:
-            allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]} for i in range(0, len(self.workloads))]
+            allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]}
+                      for i in range(0, len(self.workloads))]
             self.json_dump(allres, self.datafname)
 
     def check_rule(self, testtype, metric_list):
@@ -342,13 +357,13 @@ class Validator:
         return True
 
     # Start of Collector and Converter
-    def convert(self, data: list, metricvalues:dict):
+    def convert(self, data: list, metricvalues: dict):
         """
         Convert collected metric data from the -j output to dict of {metric_name:value}.
         """
         for json_string in data:
             try:
-                result =json.loads(json_string)
+                result = json.loads(json_string)
                 if "metric-unit" in result and result["metric-unit"] != "(null)" and result["metric-unit"] != "":
                     name = result["metric-unit"].split("  ")[1] if len(result["metric-unit"].split("  ")) > 1 \
                         else result["metric-unit"]
@@ -365,9 +380,10 @@ class Validator:
         print(" ".join(command))
         cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8')
         data = [x+'}' for x in cmd.stderr.split('}\n') if x]
+        if data[0][0] != '{':
+            data[0] = data[0][data[0].find('{'):]
         return data
 
-
     def collect_perf(self, workload: str):
         """
         Collect metric data with "perf stat -M" on given workload with -a and -j.
@@ -385,14 +401,18 @@ class Validator:
             if rule["TestType"] == "RelationshipTest":
                 metrics = [m["Name"] for m in rule["Metrics"]]
                 if not any(m not in collectlist[0] for m in metrics):
-                    collectlist[rule["RuleIndex"]] = [",".join(list(set(metrics)))]
+                    collectlist[rule["RuleIndex"]] = [
+                        ",".join(list(set(metrics)))]
 
         for idx, metrics in collectlist.items():
-            if idx == 0: wl = "true"
-            else: wl = workload
+            if idx == 0:
+                wl = "true"
+            else:
+                wl = workload
             for metric in metrics:
                 data = self._run_perf(metric, wl)
-                if idx not in self.results: self.results[idx] = dict()
+                if idx not in self.results:
+                    self.results[idx] = dict()
                 self.convert(data, self.results[idx])
         return
 
@@ -412,7 +432,8 @@ class Validator:
         2) create metric name list
         """
         command = ['perf', 'list', '-j', '--details', 'metrics']
-        cmd = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
+        cmd = subprocess.run(command, stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE, encoding='utf-8')
         try:
             data = json.loads(cmd.stdout)
             for m in data:
@@ -453,12 +474,12 @@ class Validator:
         rules = data['RelationshipRules']
         self.skiplist = set([name.lower() for name in data['SkipList']])
         self.rules = self.remove_unsupported_rules(rules)
-        pctgrule = {'RuleIndex':0,
-                    'TestType':'SingleMetricTest',
-                    'RangeLower':'0',
+        pctgrule = {'RuleIndex': 0,
+                    'TestType': 'SingleMetricTest',
+                    'RangeLower': '0',
                     'RangeUpper': '100',
                     'ErrorThreshold': self.tolerance,
-                    'Description':'Metrics in percent unit have value with in [0, 100]',
+                    'Description': 'Metrics in percent unit have value with in [0, 100]',
                     'Metrics': [{'Name': m.lower()} for m in self.pctgmetrics]}
         self.rules.append(pctgrule)
 
@@ -469,8 +490,9 @@ class Validator:
             idx += 1
 
         if self.debug:
-            #TODO: need to test and generate file name correctly
-            data = {'RelationshipRules':self.rules, 'SupportedMetrics': [{"MetricName": name} for name in self.metrics]}
+            # TODO: need to test and generate file name correctly
+            data = {'RelationshipRules': self.rules, 'SupportedMetrics': [
+                {"MetricName": name} for name in self.metrics]}
             self.json_dump(data, self.fullrulefname)
 
         return
@@ -482,20 +504,17 @@ class Validator:
         @param key: key to the dictionaries (index of self.workloads).
         '''
         self.allresults[key] = self.results
-        self.allignoremetrics[key] = self.ignoremetrics
-        self.allfailtests[key] = self.failtests
         self.alltotalcnt[key] = self.totalcnt
         self.allpassedcnt[key] = self.passedcnt
-        self.allerrlist[key] = self.errlist
 
-    #Initialize data structures before data validation of each workload
+    # Initialize data structures before data validation of each workload
     def _init_data(self):
 
-        testtypes = ['PositiveValueTest', 'RelationshipTest', 'SingleMetricTest']
+        testtypes = ['PositiveValueTest',
+                     'RelationshipTest', 'SingleMetricTest']
         self.results = dict()
-        self.ignoremetrics= set()
+        self.ignoremetrics = set()
         self.errlist = list()
-        self.failtests = {k:{'Total Tests':0, 'Passed Tests':0, 'Failed Tests':[]} for k in testtypes}
         self.totalcnt = 0
         self.passedcnt = 0
 
@@ -525,32 +544,33 @@ class Validator:
                 testtype = r['TestType']
                 if not self.check_rule(testtype, r['Metrics']):
                     continue
-                if  testtype == 'RelationshipTest':
+                if testtype == 'RelationshipTest':
                     self.relationship_test(r)
                 elif testtype == 'SingleMetricTest':
                     self.single_test(r)
                 else:
                     print("Unsupported Test Type: ", testtype)
-                    self.errlist.append("Unsupported Test Type from rule: " + r['RuleIndex'])
-            self._storewldata(i)
             print("Workload: ", self.workloads[i])
-            print("Total metrics collected: ", self.failtests['PositiveValueTest']['Total Tests'])
-            print("Non-negative metric count: ", self.failtests['PositiveValueTest']['Passed Tests'])
             print("Total Test Count: ", self.totalcnt)
             print("Passed Test Count: ", self.passedcnt)
-
+            self._storewldata(i)
         self.create_report()
-        return sum(self.alltotalcnt.values()) != sum(self.allpassedcnt.values())
+        return len(self.errlist) > 0
 # End of Class Validator
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(description="Launch metric value validation")
-
-    parser.add_argument("-rule", help="Base validation rule file", required=True)
-    parser.add_argument("-output_dir", help="Path for validator output file, report file", required=True)
-    parser.add_argument("-debug", help="Debug run, save intermediate data to files", action="store_true", default=False)
-    parser.add_argument("-wl", help="Workload to run while data collection", default="true")
+    parser = argparse.ArgumentParser(
+        description="Launch metric value validation")
+
+    parser.add_argument(
+        "-rule", help="Base validation rule file", required=True)
+    parser.add_argument(
+        "-output_dir", help="Path for validator output file, report file", required=True)
+    parser.add_argument("-debug", help="Debug run, save intermediate data to files",
+                        action="store_true", default=False)
+    parser.add_argument(
+        "-wl", help="Workload to run while data collection", default="true")
     parser.add_argument("-m", help="Metric list to validate", default="")
     args = parser.parse_args()
     outpath = Path(args.output_dir)
@@ -559,8 +579,8 @@ def main() -> None:
     datafile = Path.joinpath(outpath, 'perf_data.json')
 
     validator = Validator(args.rule, reportf, debug=args.debug,
-                        datafname=datafile, fullrulefname=fullrule, workload=args.wl,
-                        metrics=args.m)
+                          datafname=datafile, fullrulefname=fullrule, workload=args.wl,
+                          metrics=args.m)
     ret = validator.test()
 
     return ret
@@ -569,6 +589,3 @@ def main() -> None:
 if __name__ == "__main__":
     import sys
     sys.exit(main())
-
-
-
diff --git a/tools/perf/tests/shell/lib/stat_output.sh b/tools/perf/tests/shell/lib/stat_output.sh
index 3cc158a64326..c81d6a9f7983 100644
--- a/tools/perf/tests/shell/lib/stat_output.sh
+++ b/tools/perf/tests/shell/lib/stat_output.sh
@@ -97,6 +97,18 @@ check_per_cache_instance()
 	echo "[Success]"
 }
 
+check_per_cluster()
+{
+	echo -n "Checking $1 output: per cluster "
+	if ParanoidAndNotRoot 0
+	then
+		echo "[Skip] paranoid and not root"
+		return
+	fi
+	perf stat --per-cluster -a $2 true
+	echo "[Success]"
+}
+
 check_per_die()
 {
 	echo -n "Checking $1 output: per die "
diff --git a/tools/perf/tests/shell/perftool-testsuite_probe.sh b/tools/perf/tests/shell/perftool-testsuite_probe.sh
new file mode 100755
index 000000000000..a0fec33a0358
--- /dev/null
+++ b/tools/perf/tests/shell/perftool-testsuite_probe.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# perftool-testsuite_probe
+# SPDX-License-Identifier: GPL-2.0
+
+test -d "$(dirname "$0")/base_probe" || exit 2
+cd "$(dirname "$0")/base_probe" || exit 2
+status=0
+
+PERFSUITE_RUN_DIR=$(mktemp -d /tmp/"$(basename "$0" .sh)".XXX)
+export PERFSUITE_RUN_DIR
+
+for testcase in setup.sh test_*; do                  # skip setup.sh if not present or not executable
+     test -x "$testcase" || continue
+     ./"$testcase"
+     (( status += $? ))
+done
+
+if ! [ "$PERFTEST_KEEP_LOGS" = "y" ]; then
+	rm -rf "$PERFSUITE_RUN_DIR"
+fi
+
+test $status -ne 0 && exit 1
+exit 0
diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh
index f1818fa6d9ce..fc2d8cc6e5e0 100755
--- a/tools/perf/tests/shell/stat+csv_output.sh
+++ b/tools/perf/tests/shell/stat+csv_output.sh
@@ -42,6 +42,7 @@ function commachecker()
 	;; "--per-socket")	exp=8
 	;; "--per-node")	exp=8
 	;; "--per-die")		exp=8
+	;; "--per-cluster")	exp=8
 	;; "--per-cache")	exp=8
 	esac
 
@@ -79,6 +80,7 @@ then
 	check_system_wide_no_aggr "CSV" "$perf_cmd"
 	check_per_core "CSV" "$perf_cmd"
 	check_per_cache_instance "CSV" "$perf_cmd"
+	check_per_cluster "CSV" "$perf_cmd"
 	check_per_die "CSV" "$perf_cmd"
 	check_per_socket "CSV" "$perf_cmd"
 else
diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh
index 3bc900533a5d..2b9c6212dffc 100755
--- a/tools/perf/tests/shell/stat+json_output.sh
+++ b/tools/perf/tests/shell/stat+json_output.sh
@@ -122,6 +122,18 @@ check_per_cache_instance()
 	echo "[Success]"
 }
 
+check_per_cluster()
+{
+	echo -n "Checking json output: per cluster "
+	if ParanoidAndNotRoot 0
+	then
+		echo "[Skip] paranoia and not root"
+		return
+	fi
+	perf stat -j --per-cluster -a true 2>&1 | $PYTHON $pythonchecker --per-cluster
+	echo "[Success]"
+}
+
 check_per_die()
 {
 	echo -n "Checking json output: per die "
@@ -200,6 +212,7 @@ then
 	check_system_wide_no_aggr
 	check_per_core
 	check_per_cache_instance
+	check_per_cluster
 	check_per_die
 	check_per_socket
 else
diff --git a/tools/perf/tests/shell/stat+std_output.sh b/tools/perf/tests/shell/stat+std_output.sh
index 4fcdd1a9142c..cbf2894b2c84 100755
--- a/tools/perf/tests/shell/stat+std_output.sh
+++ b/tools/perf/tests/shell/stat+std_output.sh
@@ -13,7 +13,7 @@ stat_output=$(mktemp /tmp/__perf_test.stat_output.std.XXXXX)
 
 event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults stalled-cycles-frontend stalled-cycles-backend cycles instructions branches branch-misses)
 event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "frontend cycles idle" "backend cycles idle" "GHz" "insn per cycle" "/sec" "of all branches")
-skip_metric=("stalled cycles per insn" "tma_")
+skip_metric=("stalled cycles per insn" "tma_" "retiring" "frontend_bound" "bad_speculation" "backend_bound")
 
 cleanup() {
   rm -f "${stat_output}"
@@ -40,6 +40,7 @@ function commachecker()
 	;; "--per-node")	prefix=3
 	;; "--per-die")		prefix=3
 	;; "--per-cache")	prefix=3
+	;; "--per-cluster")	prefix=3
 	esac
 
 	while read line
@@ -99,6 +100,7 @@ then
 	check_system_wide_no_aggr "STD" "$perf_cmd"
 	check_per_core "STD" "$perf_cmd"
 	check_per_cache_instance "STD" "$perf_cmd"
+	check_per_cluster "STD" "$perf_cmd"
 	check_per_die "STD" "$perf_cmd"
 	check_per_socket "STD" "$perf_cmd"
 else
diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh
index a87bb2814b4c..2d9209874774 100755
--- a/tools/perf/tests/shell/stat_bpf_counters.sh
+++ b/tools/perf/tests/shell/stat_bpf_counters.sh
@@ -4,19 +4,19 @@
 
 set -e
 
-# check whether $2 is within +/- 10% of $1
+# check whether $2 is within +/- 20% of $1
 compare_number()
 {
        first_num=$1
        second_num=$2
 
-       # upper bound is first_num * 110%
-       upper=$(expr $first_num + $first_num / 10 )
-       # lower bound is first_num * 90%
-       lower=$(expr $first_num - $first_num / 10 )
+       # upper bound is first_num * 120%
+       upper=$(expr $first_num + $first_num / 5 )
+       # lower bound is first_num * 80%
+       lower=$(expr $first_num - $first_num / 5 )
 
        if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then
-               echo "The difference between $first_num and $second_num are greater than 10%."
+               echo "The difference between $first_num and $second_num are greater than 20%."
                exit 1
        fi
 }
diff --git a/tools/perf/tests/shell/stat_metrics_values.sh b/tools/perf/tests/shell/stat_metrics_values.sh
index 7ca172599aa6..279f19c5919a 100755
--- a/tools/perf/tests/shell/stat_metrics_values.sh
+++ b/tools/perf/tests/shell/stat_metrics_values.sh
@@ -19,6 +19,8 @@ echo "Output will be stored in: $tmpdir"
 $PYTHON $pythonvalidator -rule $rulefile -output_dir $tmpdir -wl "${workload}"
 ret=$?
 rm -rf $tmpdir
-
+if [ $ret -ne 0 ]; then
+	echo "Metric validation return with erros. Please check metrics reported with errors."
+fi
 exit $ret
 
diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh
index e342e6c8aa50..83b53591b1ea 100755
--- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh
+++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh
@@ -8,6 +8,12 @@ shelldir=$(dirname "$0")
 
 lscpu | grep -q "aarch64" || exit 2
 
+if perf version --build-options | grep HAVE_DWARF_UNWIND_SUPPORT | grep -q OFF
+then
+  echo "Skipping, no dwarf unwind support"
+  exit 2
+fi
+
 skip_test_missing_symbol leafloop
 
 PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
diff --git a/tools/perf/tests/symbols.c b/tools/perf/tests/symbols.c
index 16e1c5502b09..d208105919ed 100644
--- a/tools/perf/tests/symbols.c
+++ b/tools/perf/tests/symbols.c
@@ -41,6 +41,30 @@ static void exit_test_info(struct test_info *ti)
 	machine__delete(ti->machine);
 }
 
+struct dso_map {
+	struct dso *dso;
+	struct map *map;
+};
+
+static int find_map_cb(struct map *map, void *d)
+{
+	struct dso_map *data = d;
+
+	if (map__dso(map) != data->dso)
+		return 0;
+	data->map = map;
+	return 1;
+}
+
+static struct map *find_module_map(struct machine *machine, struct dso *dso)
+{
+	struct dso_map data = { .dso = dso };
+
+	machine__for_each_kernel_map(machine, find_map_cb, &data);
+
+	return data.map;
+}
+
 static void get_test_dso_filename(char *filename, size_t max_sz)
 {
 	if (dso_to_test)
@@ -51,6 +75,26 @@ static void get_test_dso_filename(char *filename, size_t max_sz)
 
 static int create_map(struct test_info *ti, char *filename, struct map **map_p)
 {
+	struct dso *dso = machine__findnew_dso(ti->machine, filename);
+
+	/*
+	 * If 'filename' matches a current kernel module, must use a kernel
+	 * map. Find the one that already exists.
+	 */
+	if (dso && dso->kernel) {
+		*map_p = find_module_map(ti->machine, dso);
+		dso__put(dso);
+		if (!*map_p) {
+			pr_debug("Failed to find map for current kernel module %s",
+				 filename);
+			return TEST_FAIL;
+		}
+		map__get(*map_p);
+		return TEST_OK;
+	}
+
+	dso__put(dso);
+
 	/* Create a dummy map at 0x100000 */
 	*map_p = map__new(ti->machine, 0x100000, 0xffffffff, 0, NULL,
 			  PROT_EXEC, 0, NULL, filename, ti->thread);
@@ -97,6 +141,26 @@ static int test_dso(struct dso *dso)
 	return ret;
 }
 
+static int subdivided_dso_cb(struct dso *dso, struct machine *machine __maybe_unused, void *d)
+{
+	struct dso *text_dso = d;
+
+	if (dso != text_dso && strstarts(dso->short_name, text_dso->short_name))
+		if (test_dso(dso) != TEST_OK)
+			return -1;
+
+	return 0;
+}
+
+static int process_subdivided_dso(struct machine *machine, struct dso *dso)
+{
+	int ret;
+
+	ret = machine__for_each_dso(machine, subdivided_dso_cb, dso);
+
+	return ret < 0 ? TEST_FAIL : TEST_OK;
+}
+
 static int test_file(struct test_info *ti, char *filename)
 {
 	struct map *map = NULL;
@@ -124,6 +188,10 @@ static int test_file(struct test_info *ti, char *filename)
 	}
 
 	ret = test_dso(dso);
+
+	/* Module dso is split into many dsos by section */
+	if (ret == TEST_OK && dso->kernel)
+		ret = process_subdivided_dso(ti->machine, dso);
 out_put:
 	map__put(map);
 
diff --git a/tools/perf/tests/tests-scripts.c b/tools/perf/tests/tests-scripts.c
new file mode 100644
index 000000000000..e2042b368269
--- /dev/null
+++ b/tools/perf/tests/tests-scripts.c
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <subcmd/exec-cmd.h>
+#include <subcmd/parse-options.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <api/io.h>
+#include "builtin.h"
+#include "tests-scripts.h"
+#include "color.h"
+#include "debug.h"
+#include "hist.h"
+#include "intlist.h"
+#include "string2.h"
+#include "symbol.h"
+#include "tests.h"
+#include "util/rlimit.h"
+#include "util/util.h"
+
+static int shell_tests__dir_fd(void)
+{
+	char path[PATH_MAX], *exec_path;
+	static const char * const devel_dirs[] = { "./tools/perf/tests/shell", "./tests/shell", };
+
+	for (size_t i = 0; i < ARRAY_SIZE(devel_dirs); ++i) {
+		int fd = open(devel_dirs[i], O_PATH);
+
+		if (fd >= 0)
+			return fd;
+	}
+
+	/* Then installed path. */
+	exec_path = get_argv_exec_path();
+	scnprintf(path, sizeof(path), "%s/tests/shell", exec_path);
+	free(exec_path);
+	return open(path, O_PATH);
+}
+
+static char *shell_test__description(int dir_fd, const char *name)
+{
+	struct io io;
+	char buf[128], desc[256];
+	int ch, pos = 0;
+
+	io__init(&io, openat(dir_fd, name, O_RDONLY), buf, sizeof(buf));
+	if (io.fd < 0)
+		return NULL;
+
+	/* Skip first line - should be #!/bin/sh Shebang */
+	if (io__get_char(&io) != '#')
+		goto err_out;
+	if (io__get_char(&io) != '!')
+		goto err_out;
+	do {
+		ch = io__get_char(&io);
+		if (ch < 0)
+			goto err_out;
+	} while (ch != '\n');
+
+	do {
+		ch = io__get_char(&io);
+		if (ch < 0)
+			goto err_out;
+	} while (ch == '#' || isspace(ch));
+	while (ch > 0 && ch != '\n') {
+		desc[pos++] = ch;
+		if (pos >= (int)sizeof(desc) - 1)
+			break;
+		ch = io__get_char(&io);
+	}
+	while (pos > 0 && isspace(desc[--pos]))
+		;
+	desc[++pos] = '\0';
+	close(io.fd);
+	return strdup(desc);
+err_out:
+	close(io.fd);
+	return NULL;
+}
+
+/* Is this full file path a shell script */
+static bool is_shell_script(int dir_fd, const char *path)
+{
+	const char *ext;
+
+	ext = strrchr(path, '.');
+	if (!ext)
+		return false;
+	if (!strcmp(ext, ".sh")) { /* Has .sh extension */
+		if (faccessat(dir_fd, path, R_OK | X_OK, 0) == 0) /* Is executable */
+			return true;
+	}
+	return false;
+}
+
+/* Is this file in this dir a shell script (for test purposes) */
+static bool is_test_script(int dir_fd, const char *name)
+{
+	return is_shell_script(dir_fd, name);
+}
+
+/* Duplicate a string and fall over and die if we run out of memory */
+static char *strdup_check(const char *str)
+{
+	char *newstr;
+
+	newstr = strdup(str);
+	if (!newstr) {
+		pr_err("Out of memory while duplicating test script string\n");
+		abort();
+	}
+	return newstr;
+}
+
+static int shell_test__run(struct test_suite *test, int subtest __maybe_unused)
+{
+	const char *file = test->priv;
+	int err;
+	char *cmd = NULL;
+
+	if (asprintf(&cmd, "%s%s", file, verbose ? " -v" : "") < 0)
+		return TEST_FAIL;
+	err = system(cmd);
+	free(cmd);
+	if (!err)
+		return TEST_OK;
+
+	return WEXITSTATUS(err) == 2 ? TEST_SKIP : TEST_FAIL;
+}
+
+static void append_script(int dir_fd, const char *name, char *desc,
+			  struct test_suite ***result,
+			  size_t *result_sz)
+{
+	char filename[PATH_MAX], link[128];
+	struct test_suite *test_suite, **result_tmp;
+	struct test_case *tests;
+	size_t len;
+
+	snprintf(link, sizeof(link), "/proc/%d/fd/%d", getpid(), dir_fd);
+	len = readlink(link, filename, sizeof(filename));
+	if (len < 0) {
+		pr_err("Failed to readlink %s", link);
+		return;
+	}
+	filename[len++] = '/';
+	strcpy(&filename[len], name);
+
+	tests = calloc(2, sizeof(*tests));
+	if (!tests) {
+		pr_err("Out of memory while building script test suite list\n");
+		return;
+	}
+	tests[0].name = strdup_check(name);
+	tests[0].desc = strdup_check(desc);
+	tests[0].run_case = shell_test__run;
+
+	test_suite = zalloc(sizeof(*test_suite));
+	if (!test_suite) {
+		pr_err("Out of memory while building script test suite list\n");
+		free(tests);
+		return;
+	}
+	test_suite->desc = desc;
+	test_suite->test_cases = tests;
+	test_suite->priv = strdup_check(filename);
+	/* Realloc is good enough, though we could realloc by chunks, not that
+	 * anyone will ever measure performance here */
+	result_tmp = realloc(*result, (*result_sz + 1) * sizeof(*result_tmp));
+	if (result_tmp == NULL) {
+		pr_err("Out of memory while building script test suite list\n");
+		free(tests);
+		free(test_suite);
+		return;
+	}
+	/* Add file to end and NULL terminate the struct array */
+	*result = result_tmp;
+	(*result)[*result_sz] = test_suite;
+	(*result_sz)++;
+}
+
+static void append_scripts_in_dir(int dir_fd,
+				  struct test_suite ***result,
+				  size_t *result_sz)
+{
+	struct dirent **entlist;
+	struct dirent *ent;
+	int n_dirs, i;
+
+	/* List files, sorted by alpha */
+	n_dirs = scandirat(dir_fd, ".", &entlist, NULL, alphasort);
+	if (n_dirs == -1)
+		return;
+	for (i = 0; i < n_dirs && (ent = entlist[i]); i++) {
+		int fd;
+
+		if (ent->d_name[0] == '.')
+			continue; /* Skip hidden files */
+		if (is_test_script(dir_fd, ent->d_name)) { /* It's a test */
+			char *desc = shell_test__description(dir_fd, ent->d_name);
+
+			if (desc) /* It has a desc line - valid script */
+				append_script(dir_fd, ent->d_name, desc, result, result_sz);
+			continue;
+		}
+		if (ent->d_type != DT_DIR) {
+			struct stat st;
+
+			if (ent->d_type != DT_UNKNOWN)
+				continue;
+			fstatat(dir_fd, ent->d_name, &st, 0);
+			if (!S_ISDIR(st.st_mode))
+				continue;
+		}
+		fd = openat(dir_fd, ent->d_name, O_PATH);
+		append_scripts_in_dir(fd, result, result_sz);
+	}
+	for (i = 0; i < n_dirs; i++) /* Clean up */
+		zfree(&entlist[i]);
+	free(entlist);
+}
+
+struct test_suite **create_script_test_suites(void)
+{
+	struct test_suite **result = NULL, **result_tmp;
+	size_t result_sz = 0;
+	int dir_fd = shell_tests__dir_fd(); /* Walk  dir */
+
+	/*
+	 * Append scripts if fd is good, otherwise return a NULL terminated zero
+	 * length array.
+	 */
+	if (dir_fd >= 0)
+		append_scripts_in_dir(dir_fd, &result, &result_sz);
+
+	result_tmp = realloc(result, (result_sz + 1) * sizeof(*result_tmp));
+	if (result_tmp == NULL) {
+		pr_err("Out of memory while building script test suite list\n");
+		abort();
+	}
+	/* NULL terminate the test suite array. */
+	result = result_tmp;
+	result[result_sz] = NULL;
+	if (dir_fd >= 0)
+		close(dir_fd);
+	return result;
+}
diff --git a/tools/perf/tests/tests-scripts.h b/tools/perf/tests/tests-scripts.h
new file mode 100644
index 000000000000..b553ad26ea17
--- /dev/null
+++ b/tools/perf/tests/tests-scripts.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef TESTS_SCRIPTS_H
+#define TESTS_SCRIPTS_H
+
+#include "tests.h"
+
+struct test_suite **create_script_test_suites(void);
+
+#endif /* TESTS_SCRIPTS_H */
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index dad3d7414142..3aa7701ee0e9 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -4,11 +4,17 @@
 
 #include <stdbool.h>
 
+enum {
+	TEST_OK   =  0,
+	TEST_FAIL = -1,
+	TEST_SKIP = -2,
+};
+
 #define TEST_ASSERT_VAL(text, cond)					 \
 do {									 \
 	if (!(cond)) {							 \
 		pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \
-		return -1;						 \
+		return TEST_FAIL;					 \
 	}								 \
 } while (0)
 
@@ -17,16 +23,10 @@ do {									 \
 	if (val != expected) {						 \
 		pr_debug("FAILED %s:%d %s (%d != %d)\n",		 \
 			 __FILE__, __LINE__, text, val, expected);	 \
-		return -1;						 \
+		return TEST_FAIL;						 \
 	}								 \
 } while (0)
 
-enum {
-	TEST_OK   =  0,
-	TEST_FAIL = -1,
-	TEST_SKIP = -2,
-};
-
 struct test_suite;
 
 typedef int (*test_fnptr)(struct test_suite *, int);
diff --git a/tools/perf/tests/thread-maps-share.c b/tools/perf/tests/thread-maps-share.c
index 7fa6f7c568e2..e9ecd30a5c05 100644
--- a/tools/perf/tests/thread-maps-share.c
+++ b/tools/perf/tests/thread-maps-share.c
@@ -46,9 +46,9 @@ static int test__thread_maps_share(struct test_suite *test __maybe_unused, int s
 	TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(maps__refcnt(maps)), 4);
 
 	/* test the maps pointer is shared */
-	TEST_ASSERT_VAL("maps don't match", RC_CHK_EQUAL(maps, thread__maps(t1)));
-	TEST_ASSERT_VAL("maps don't match", RC_CHK_EQUAL(maps, thread__maps(t2)));
-	TEST_ASSERT_VAL("maps don't match", RC_CHK_EQUAL(maps, thread__maps(t3)));
+	TEST_ASSERT_VAL("maps don't match", maps__equal(maps, thread__maps(t1)));
+	TEST_ASSERT_VAL("maps don't match", maps__equal(maps, thread__maps(t2)));
+	TEST_ASSERT_VAL("maps don't match", maps__equal(maps, thread__maps(t3)));
 
 	/*
 	 * Verify the other leader was created by previous call.
@@ -73,7 +73,7 @@ static int test__thread_maps_share(struct test_suite *test __maybe_unused, int s
 	other_maps = thread__maps(other);
 	TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(maps__refcnt(other_maps)), 2);
 
-	TEST_ASSERT_VAL("maps don't match", RC_CHK_EQUAL(other_maps, thread__maps(other_leader)));
+	TEST_ASSERT_VAL("maps don't match", maps__equal(other_maps, thread__maps(other_leader)));
 
 	/* release thread group */
 	thread__put(t3);
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 822f893e67d5..fecbf851bb2e 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -131,9 +131,10 @@ static int test__vmlinux_matches_kallsyms_cb1(struct map *map, void *data)
 	struct map *pair = maps__find_by_name(args->kallsyms.kmaps,
 					(dso->kernel ? dso->short_name : dso->name));
 
-	if (pair)
+	if (pair) {
 		map__set_priv(pair, 1);
-	else {
+		map__put(pair);
+	} else {
 		if (!args->header_printed) {
 			pr_info("WARN: Maps only in vmlinux:\n");
 			args->header_printed = true;
@@ -151,10 +152,8 @@ static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data)
 	u64 mem_end = map__unmap_ip(args->vmlinux_map, map__end(map));
 
 	pair = maps__find(args->kallsyms.kmaps, mem_start);
-	if (pair == NULL || map__priv(pair))
-		return 0;
 
-	if (map__start(pair) == mem_start) {
+	if (pair != NULL && !map__priv(pair) && map__start(pair) == mem_start) {
 		struct dso *dso = map__dso(map);
 
 		if (!args->header_printed) {
@@ -170,6 +169,7 @@ static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data)
 		pr_info(" %s\n", dso->name);
 		map__set_priv(pair, 1);
 	}
+	map__put(pair);
 	return 0;
 }
 
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index cfcb7e2c3813..139c330ccf2c 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -422,13 +422,6 @@ extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
 			       struct user_msghdr __user *umsg,
 			       struct sockaddr __user *uaddr,
 			       unsigned int flags);
-extern int sendmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct iovec **iov);
-extern int recvmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct sockaddr __user **uaddr,
-			       struct iovec **iov);
 extern int __copy_msghdr(struct msghdr *kmsg,
 			 struct user_msghdr *umsg,
 			 struct sockaddr __user **save_addr);
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index ec5e21932876..4790c735599b 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -970,7 +970,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel,
 	if (dso->annotate_warned)
 		return -1;
 
-	if (not_annotated) {
+	if (not_annotated || !sym->annotate2) {
 		err = symbol__annotate2(ms, evsel, &browser.arch);
 		if (err) {
 			char msg[BUFSIZ];
diff --git a/tools/perf/ui/browsers/res_sample.c b/tools/perf/ui/browsers/res_sample.c
index 7cb2d6678039..5f60e515b12e 100644
--- a/tools/perf/ui/browsers/res_sample.c
+++ b/tools/perf/ui/browsers/res_sample.c
@@ -83,7 +83,7 @@ int res_sample_browse(struct res_sample *res_samples, int num_res,
 		     r->tid ? "--tid " : "",
 		     r->tid ? (sprintf(tidbuf, "%d", r->tid), tidbuf) : "",
 		     extra_format,
-		     rstype == A_ASM ? "-F +insn --xed" :
+		     rstype == A_ASM ? "-F +disasm" :
 		     rstype == A_SOURCE ? "-F +srcline,+srccode" : "",
 		     symbol_conf.inline_name ? "--inline" : "",
 		     "--show-lost-events ",
diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c
index 50d45054ed6c..e437d7889de6 100644
--- a/tools/perf/ui/browsers/scripts.c
+++ b/tools/perf/ui/browsers/scripts.c
@@ -107,7 +107,7 @@ static int list_scripts(char *script_name, bool *custom,
 	if (evsel)
 		attr_to_script(scriptc.extra_format, &evsel->core.attr);
 	add_script_option("Show individual samples", "", &scriptc);
-	add_script_option("Show individual samples with assembler", "-F +insn --xed",
+	add_script_option("Show individual samples with assembler", "-F +disasm",
 			  &scriptc);
 	add_script_option("Show individual samples with source", "-F +srcline,+srccode",
 			  &scriptc);
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index 394861245fd3..93ce3d47e47e 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -28,21 +28,29 @@ static const char *const col_names[] = {
 static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
 				 struct disasm_line *dl, int evidx)
 {
+	struct annotation *notes;
 	struct sym_hist *symhist;
+	struct sym_hist_entry *entry;
 	double percent = 0.0;
 	const char *markup;
 	int ret = 0;
+	u64 nr_samples = 0;
 
 	strcpy(buf, "");
 
 	if (dl->al.offset == (s64) -1)
 		return 0;
 
-	symhist = annotation__histogram(symbol__annotation(sym), evidx);
-	if (!symbol_conf.event_group && !symhist->addr[dl->al.offset].nr_samples)
+	notes = symbol__annotation(sym);
+	symhist = annotation__histogram(notes, evidx);
+	entry = annotated_source__hist_entry(notes->src, evidx, dl->al.offset);
+	if (entry)
+		nr_samples = entry->nr_samples;
+
+	if (!symbol_conf.event_group && nr_samples == 0)
 		return 0;
 
-	percent = 100.0 * symhist->addr[dl->al.offset].nr_samples / symhist->nr_samples;
+	percent = 100.0 * nr_samples / symhist->nr_samples;
 
 	markup = perf_gtk__get_percent_color(percent);
 	if (markup)
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 8027f450fa3e..e0a723e24503 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -32,6 +32,7 @@ perf-y += perf_regs.o
 perf-y += perf-regs-arch/
 perf-y += path.o
 perf-y += print_binary.o
+perf-y += print_insn.o
 perf-y += rlimit.o
 perf-y += argv_split.o
 perf-y += rbtree.o
@@ -71,6 +72,7 @@ perf-y += ordered-events.o
 perf-y += namespaces.o
 perf-y += comm.o
 perf-y += thread.o
+perf-y += threads.o
 perf-y += thread_map.o
 perf-y += parse-events-flex.o
 perf-y += parse-events-bison.o
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index f22b4f18271c..30c4d19fcf11 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -9,10 +9,12 @@
 #include <stdlib.h>
 #include <inttypes.h>
 
+#include "annotate.h"
 #include "annotate-data.h"
 #include "debuginfo.h"
 #include "debug.h"
 #include "dso.h"
+#include "dwarf-regs.h"
 #include "evsel.h"
 #include "evlist.h"
 #include "map.h"
@@ -192,7 +194,8 @@ static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die)
 }
 
 /* The type info will be saved in @type_die */
-static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset)
+static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset,
+			  bool is_pointer)
 {
 	Dwarf_Word size;
 
@@ -204,14 +207,18 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset)
 	}
 
 	/*
-	 * It expects a pointer type for a memory access.
-	 * Convert to a real type it points to.
+	 * Usually it expects a pointer type for a memory access.
+	 * Convert to a real type it points to.  But global variables
+	 * and local variables are accessed directly without a pointer.
 	 */
-	if (dwarf_tag(type_die) != DW_TAG_pointer_type ||
-	    die_get_real_type(type_die, type_die) == NULL) {
-		pr_debug("no pointer or no type\n");
-		ann_data_stat.no_typeinfo++;
-		return -1;
+	if (is_pointer) {
+		if ((dwarf_tag(type_die) != DW_TAG_pointer_type &&
+		     dwarf_tag(type_die) != DW_TAG_array_type) ||
+		    die_get_real_type(type_die, type_die) == NULL) {
+			pr_debug("no pointer or no type\n");
+			ann_data_stat.no_typeinfo++;
+			return -1;
+		}
 	}
 
 	/* Get the size of the actual type */
@@ -232,13 +239,18 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset)
 }
 
 /* The result will be saved in @type_die */
-static int find_data_type_die(struct debuginfo *di, u64 pc,
-			      int reg, int offset, Dwarf_Die *type_die)
+static int find_data_type_die(struct debuginfo *di, u64 pc, u64 addr,
+			      const char *var_name, struct annotated_op_loc *loc,
+			      Dwarf_Die *type_die)
 {
 	Dwarf_Die cu_die, var_die;
 	Dwarf_Die *scopes = NULL;
+	int reg, offset;
 	int ret = -1;
 	int i, nr_scopes;
+	int fbreg = -1;
+	bool is_fbreg = false;
+	int fb_offset = 0;
 
 	/* Get a compile_unit for this address */
 	if (!find_cu_die(di, pc, &cu_die)) {
@@ -247,19 +259,81 @@ static int find_data_type_die(struct debuginfo *di, u64 pc,
 		return -1;
 	}
 
+	reg = loc->reg1;
+	offset = loc->offset;
+
+	if (reg == DWARF_REG_PC) {
+		if (die_find_variable_by_addr(&cu_die, pc, addr, &var_die, &offset)) {
+			ret = check_variable(&var_die, type_die, offset,
+					     /*is_pointer=*/false);
+			loc->offset = offset;
+			goto out;
+		}
+
+		if (var_name && die_find_variable_at(&cu_die, var_name, pc,
+						     &var_die)) {
+			ret = check_variable(&var_die, type_die, 0,
+					     /*is_pointer=*/false);
+			/* loc->offset will be updated by the caller */
+			goto out;
+		}
+	}
+
 	/* Get a list of nested scopes - i.e. (inlined) functions and blocks. */
 	nr_scopes = die_get_scopes(&cu_die, pc, &scopes);
 
+	if (reg != DWARF_REG_PC && dwarf_hasattr(&scopes[0], DW_AT_frame_base)) {
+		Dwarf_Attribute attr;
+		Dwarf_Block block;
+
+		/* Check if the 'reg' is assigned as frame base register */
+		if (dwarf_attr(&scopes[0], DW_AT_frame_base, &attr) != NULL &&
+		    dwarf_formblock(&attr, &block) == 0 && block.length == 1) {
+			switch (*block.data) {
+			case DW_OP_reg0 ... DW_OP_reg31:
+				fbreg = *block.data - DW_OP_reg0;
+				break;
+			case DW_OP_call_frame_cfa:
+				if (die_get_cfa(di->dbg, pc, &fbreg,
+						&fb_offset) < 0)
+					fbreg = -1;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+retry:
+	is_fbreg = (reg == fbreg);
+	if (is_fbreg)
+		offset = loc->offset - fb_offset;
+
 	/* Search from the inner-most scope to the outer */
 	for (i = nr_scopes - 1; i >= 0; i--) {
-		/* Look up variables/parameters in this scope */
-		if (!die_find_variable_by_reg(&scopes[i], pc, reg, &var_die))
-			continue;
+		if (reg == DWARF_REG_PC) {
+			if (!die_find_variable_by_addr(&scopes[i], pc, addr,
+						       &var_die, &offset))
+				continue;
+		} else {
+			/* Look up variables/parameters in this scope */
+			if (!die_find_variable_by_reg(&scopes[i], pc, reg,
+						      &offset, is_fbreg, &var_die))
+				continue;
+		}
 
 		/* Found a variable, see if it's correct */
-		ret = check_variable(&var_die, type_die, offset);
+		ret = check_variable(&var_die, type_die, offset,
+				     reg != DWARF_REG_PC && !is_fbreg);
+		loc->offset = offset;
 		goto out;
 	}
+
+	if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) {
+		reg = loc->reg2;
+		goto retry;
+	}
+
 	if (ret < 0)
 		ann_data_stat.no_var++;
 
@@ -272,15 +346,22 @@ out:
  * find_data_type - Return a data type at the location
  * @ms: map and symbol at the location
  * @ip: instruction address of the memory access
- * @reg: register that holds the base address
- * @offset: offset from the base address
+ * @loc: instruction operand location
+ * @addr: data address of the memory access
+ * @var_name: global variable name
  *
  * This functions searches the debug information of the binary to get the data
- * type it accesses.  The exact location is expressed by (ip, reg, offset).
+ * type it accesses.  The exact location is expressed by (@ip, reg, offset)
+ * for pointer variables or (@ip, @addr) for global variables.  Note that global
+ * variables might update the @loc->offset after finding the start of the variable.
+ * If it cannot find a global variable by address, it tried to fine a declaration
+ * of the variable using @var_name.  In that case, @loc->offset won't be updated.
+ *
  * It return %NULL if not found.
  */
 struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip,
-					   int reg, int offset)
+					   struct annotated_op_loc *loc, u64 addr,
+					   const char *var_name)
 {
 	struct annotated_data_type *result = NULL;
 	struct dso *dso = map__dso(ms->map);
@@ -300,7 +381,7 @@ struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip,
 	 * a file address for DWARF processing.
 	 */
 	pc = map__rip_2objdump(ms->map, ip);
-	if (find_data_type_die(di, pc, reg, offset, &type_die) < 0)
+	if (find_data_type_die(di, pc, addr, var_name, loc, &type_die) < 0)
 		goto out;
 
 	result = dso__findnew_data_type(dso, &type_die);
diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h
index 8e73096c01d1..1b0db8e8c40e 100644
--- a/tools/perf/util/annotate-data.h
+++ b/tools/perf/util/annotate-data.h
@@ -7,6 +7,7 @@
 #include <linux/rbtree.h>
 #include <linux/types.h>
 
+struct annotated_op_loc;
 struct evsel;
 struct map_symbol;
 
@@ -69,6 +70,7 @@ struct annotated_data_type {
 };
 
 extern struct annotated_data_type unknown_type;
+extern struct annotated_data_type stackop_type;
 
 /**
  * struct annotated_data_stat - Debug statistics
@@ -105,7 +107,8 @@ extern struct annotated_data_stat ann_data_stat;
 
 /* Returns data type at the location (ip, reg, offset) */
 struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip,
-					   int reg, int offset);
+					   struct annotated_op_loc *loc, u64 addr,
+					   const char *var_name);
 
 /* Update type access histogram at the given offset */
 int annotated_data_type__update_samples(struct annotated_data_type *adt,
@@ -119,7 +122,8 @@ void annotated_data_type__tree_delete(struct rb_root *root);
 
 static inline struct annotated_data_type *
 find_data_type(struct map_symbol *ms __maybe_unused, u64 ip __maybe_unused,
-	       int reg __maybe_unused, int offset __maybe_unused)
+	       struct annotated_op_loc *loc __maybe_unused,
+	       u64 addr __maybe_unused, const char *var_name __maybe_unused)
 {
 	return NULL;
 }
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 9b70ab110ce7..50ca92255ff6 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -37,6 +37,8 @@
 #include "util/sharded_mutex.h"
 #include "arch/common.h"
 #include "namespaces.h"
+#include "thread.h"
+#include "hashmap.h"
 #include <regex.h>
 #include <linux/bitops.h>
 #include <linux/kernel.h>
@@ -107,6 +109,14 @@ static struct ins_ops ret_ops;
 struct annotated_data_stat ann_data_stat;
 LIST_HEAD(ann_insn_stat);
 
+/* Pseudo data types */
+struct annotated_data_type stackop_type = {
+	.self = {
+		.type_name = (char *)"(stack operation)",
+		.children = LIST_HEAD_INIT(stackop_type.self.children),
+	},
+};
+
 static int arch__grow_instructions(struct arch *arch)
 {
 	struct ins *new_instructions;
@@ -854,6 +864,17 @@ bool arch__is(struct arch *arch, const char *name)
 	return !strcmp(arch->name, name);
 }
 
+/* symbol histogram: key = offset << 16 | evsel->core.idx */
+static size_t sym_hist_hash(long key, void *ctx __maybe_unused)
+{
+	return (key >> 16) + (key & 0xffff);
+}
+
+static bool sym_hist_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+	return key1 == key2;
+}
+
 static struct annotated_source *annotated_source__new(void)
 {
 	struct annotated_source *src = zalloc(sizeof(*src));
@@ -868,38 +889,25 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src
 {
 	if (src == NULL)
 		return;
+
+	hashmap__free(src->samples);
 	zfree(&src->histograms);
 	free(src);
 }
 
 static int annotated_source__alloc_histograms(struct annotated_source *src,
-					      size_t size, int nr_hists)
+					      int nr_hists)
 {
-	size_t sizeof_sym_hist;
-
-	/*
-	 * Add buffer of one element for zero length symbol.
-	 * When sample is taken from first instruction of
-	 * zero length symbol, perf still resolves it and
-	 * shows symbol name in perf report and allows to
-	 * annotate it.
-	 */
-	if (size == 0)
-		size = 1;
+	src->nr_histograms   = nr_hists;
+	src->histograms	     = calloc(nr_hists, sizeof(*src->histograms));
 
-	/* Check for overflow when calculating sizeof_sym_hist */
-	if (size > (SIZE_MAX - sizeof(struct sym_hist)) / sizeof(struct sym_hist_entry))
+	if (src->histograms == NULL)
 		return -1;
 
-	sizeof_sym_hist = (sizeof(struct sym_hist) + size * sizeof(struct sym_hist_entry));
-
-	/* Check for overflow in zalloc argument */
-	if (sizeof_sym_hist > SIZE_MAX / nr_hists)
-		return -1;
+	src->samples = hashmap__new(sym_hist_hash, sym_hist_equal, NULL);
+	if (src->samples == NULL)
+		zfree(&src->histograms);
 
-	src->sizeof_sym_hist = sizeof_sym_hist;
-	src->nr_histograms   = nr_hists;
-	src->histograms	     = calloc(nr_hists, sizeof_sym_hist) ;
 	return src->histograms ? 0 : -1;
 }
 
@@ -910,7 +918,8 @@ void symbol__annotate_zero_histograms(struct symbol *sym)
 	annotation__lock(notes);
 	if (notes->src != NULL) {
 		memset(notes->src->histograms, 0,
-		       notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+		       notes->src->nr_histograms * sizeof(*notes->src->histograms));
+		hashmap__clear(notes->src->samples);
 	}
 	if (notes->branch && notes->branch->cycles_hist) {
 		memset(notes->branch->cycles_hist, 0,
@@ -974,8 +983,10 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
 				      struct perf_sample *sample)
 {
 	struct symbol *sym = ms->sym;
-	unsigned offset;
+	long hash_key;
+	u64 offset;
 	struct sym_hist *h;
+	struct sym_hist_entry *entry;
 
 	pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map__unmap_ip(ms->map, addr));
 
@@ -993,15 +1004,26 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms,
 			 __func__, __LINE__, sym->name, sym->start, addr, sym->end, sym->type == STT_FUNC);
 		return -ENOMEM;
 	}
+
+	hash_key = offset << 16 | evidx;
+	if (!hashmap__find(src->samples, hash_key, &entry)) {
+		entry = zalloc(sizeof(*entry));
+		if (entry == NULL)
+			return -ENOMEM;
+
+		if (hashmap__add(src->samples, hash_key, entry) < 0)
+			return -ENOMEM;
+	}
+
 	h->nr_samples++;
-	h->addr[offset].nr_samples++;
 	h->period += sample->period;
-	h->addr[offset].period += sample->period;
+	entry->nr_samples++;
+	entry->period += sample->period;
 
 	pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
 		  ", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
 		  sym->start, sym->name, addr, addr - sym->start, evidx,
-		  h->addr[offset].nr_samples, h->addr[offset].period);
+		  entry->nr_samples, entry->period);
 	return 0;
 }
 
@@ -1047,8 +1069,7 @@ struct annotated_source *symbol__hists(struct symbol *sym, int nr_hists)
 
 	if (notes->src->histograms == NULL) {
 alloc_histograms:
-		annotated_source__alloc_histograms(notes->src, symbol__size(sym),
-						   nr_hists);
+		annotated_source__alloc_histograms(notes->src, nr_hists);
 	}
 
 	return notes->src;
@@ -2321,17 +2342,25 @@ out_remove_tmp:
 	return err;
 }
 
-static void calc_percent(struct sym_hist *sym_hist,
-			 struct hists *hists,
+static void calc_percent(struct annotation *notes,
+			 struct evsel *evsel,
 			 struct annotation_data *data,
 			 s64 offset, s64 end)
 {
+	struct hists *hists = evsel__hists(evsel);
+	int evidx = evsel->core.idx;
+	struct sym_hist *sym_hist = annotation__histogram(notes, evidx);
 	unsigned int hits = 0;
 	u64 period = 0;
 
 	while (offset < end) {
-		hits   += sym_hist->addr[offset].nr_samples;
-		period += sym_hist->addr[offset].period;
+		struct sym_hist_entry *entry;
+
+		entry = annotated_source__hist_entry(notes->src, evidx, offset);
+		if (entry) {
+			hits   += entry->nr_samples;
+			period += entry->period;
+		}
 		++offset;
 	}
 
@@ -2368,16 +2397,13 @@ static void annotation__calc_percent(struct annotation *notes,
 		end  = next ? next->offset : len;
 
 		for_each_group_evsel(evsel, leader) {
-			struct hists *hists = evsel__hists(evsel);
 			struct annotation_data *data;
-			struct sym_hist *sym_hist;
 
 			BUG_ON(i >= al->data_nr);
 
-			sym_hist = annotation__histogram(notes, evsel->core.idx);
 			data = &al->data[i++];
 
-			calc_percent(sym_hist, hists, data, al->offset, end);
+			calc_percent(notes, evsel, data, al->offset, end);
 		}
 	}
 }
@@ -2435,6 +2461,9 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel,
 	if (parch)
 		*parch = arch;
 
+	if (!list_empty(&notes->src->source))
+		return 0;
+
 	args.arch = arch;
 	args.ms = *ms;
 	if (annotate_opts.full_addr)
@@ -2572,14 +2601,19 @@ static void print_summary(struct rb_root *root, const char *filename)
 
 static void symbol__annotate_hits(struct symbol *sym, struct evsel *evsel)
 {
+	int evidx = evsel->core.idx;
 	struct annotation *notes = symbol__annotation(sym);
-	struct sym_hist *h = annotation__histogram(notes, evsel->core.idx);
+	struct sym_hist *h = annotation__histogram(notes, evidx);
 	u64 len = symbol__size(sym), offset;
 
-	for (offset = 0; offset < len; ++offset)
-		if (h->addr[offset].nr_samples != 0)
+	for (offset = 0; offset < len; ++offset) {
+		struct sym_hist_entry *entry;
+
+		entry = annotated_source__hist_entry(notes->src, evidx, offset);
+		if (entry && entry->nr_samples != 0)
 			printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
-			       sym->start + offset, h->addr[offset].nr_samples);
+			       sym->start + offset, entry->nr_samples);
+	}
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
 }
 
@@ -2797,7 +2831,7 @@ void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
 	struct annotation *notes = symbol__annotation(sym);
 	struct sym_hist *h = annotation__histogram(notes, evidx);
 
-	memset(h, 0, notes->src->sizeof_sym_hist);
+	memset(h, 0, sizeof(*notes->src->histograms) * notes->src->nr_histograms);
 }
 
 void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
@@ -2808,8 +2842,14 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
 
 	h->nr_samples = 0;
 	for (offset = 0; offset < len; ++offset) {
-		h->addr[offset].nr_samples = h->addr[offset].nr_samples * 7 / 8;
-		h->nr_samples += h->addr[offset].nr_samples;
+		struct sym_hist_entry *entry;
+
+		entry = annotated_source__hist_entry(notes->src, evidx, offset);
+		if (entry == NULL)
+			continue;
+
+		entry->nr_samples = entry->nr_samples * 7 / 8;
+		h->nr_samples += entry->nr_samples;
 	}
 }
 
@@ -3563,8 +3603,22 @@ static int extract_reg_offset(struct arch *arch, const char *str,
 	if (regname == NULL)
 		return -1;
 
-	op_loc->reg = get_dwarf_regnum(regname, 0);
+	op_loc->reg1 = get_dwarf_regnum(regname, 0);
 	free(regname);
+
+	/* Get the second register */
+	if (op_loc->multi_regs) {
+		p = strchr(p + 1, arch->objdump.register_char);
+		if (p == NULL)
+			return -1;
+
+		regname = strdup(p);
+		if (regname == NULL)
+			return -1;
+
+		op_loc->reg2 = get_dwarf_regnum(regname, 0);
+		free(regname);
+	}
 	return 0;
 }
 
@@ -3577,14 +3631,20 @@ static int extract_reg_offset(struct arch *arch, const char *str,
  * Get detailed location info (register and offset) in the instruction.
  * It needs both source and target operand and whether it accesses a
  * memory location.  The offset field is meaningful only when the
- * corresponding mem flag is set.
+ * corresponding mem flag is set.  The reg2 field is meaningful only
+ * when multi_regs flag is set.
  *
  * Some examples on x86:
  *
- *   mov  (%rax), %rcx   # src_reg = rax, src_mem = 1, src_offset = 0
- *                       # dst_reg = rcx, dst_mem = 0
+ *   mov  (%rax), %rcx   # src_reg1 = rax, src_mem = 1, src_offset = 0
+ *                       # dst_reg1 = rcx, dst_mem = 0
  *
- *   mov  0x18, %r8      # src_reg = -1, dst_reg = r8
+ *   mov  0x18, %r8      # src_reg1 = -1, src_mem = 0
+ *                       # dst_reg1 = r8, dst_mem = 0
+ *
+ *   mov  %rsi, 8(%rbx,%rcx,4)  # src_reg1 = rsi, src_mem = 0, dst_multi_regs = 0
+ *                              # dst_reg1 = rbx, dst_reg2 = rcx, dst_mem = 1
+ *                              # dst_multi_regs = 1, dst_offset = 8
  */
 int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
 			       struct annotated_insn_loc *loc)
@@ -3605,24 +3665,29 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
 
 	for_each_insn_op_loc(loc, i, op_loc) {
 		const char *insn_str = ops->source.raw;
+		bool multi_regs = ops->source.multi_regs;
 
-		if (i == INSN_OP_TARGET)
+		if (i == INSN_OP_TARGET) {
 			insn_str = ops->target.raw;
+			multi_regs = ops->target.multi_regs;
+		}
 
 		/* Invalidate the register by default */
-		op_loc->reg = -1;
+		op_loc->reg1 = -1;
+		op_loc->reg2 = -1;
 
 		if (insn_str == NULL)
 			continue;
 
 		if (strchr(insn_str, arch->objdump.memory_ref_char)) {
 			op_loc->mem_ref = true;
+			op_loc->multi_regs = multi_regs;
 			extract_reg_offset(arch, insn_str, op_loc);
 		} else {
 			char *s = strdup(insn_str);
 
 			if (s) {
-				op_loc->reg = get_dwarf_regnum(s, 0);
+				op_loc->reg1 = get_dwarf_regnum(s, 0);
 				free(s);
 			}
 		}
@@ -3660,8 +3725,17 @@ static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip)
 	notes = symbol__annotation(sym);
 
 	list_for_each_entry(dl, &notes->src->source, al.node) {
-		if (sym->start + dl->al.offset == ip)
+		if (sym->start + dl->al.offset == ip) {
+			/*
+			 * llvm-objdump places "lock" in a separate line and
+			 * in that case, we want to get the next line.
+			 */
+			if (!strcmp(dl->ins.name, "lock") && *dl->ops.raw == '\0') {
+				ip++;
+				continue;
+			}
 			return dl;
+		}
 	}
 	return NULL;
 }
@@ -3690,6 +3764,42 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head,
 	return istat;
 }
 
+static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
+{
+	if (arch__is(arch, "x86")) {
+		if (!strncmp(dl->ins.name, "push", 4) ||
+		    !strncmp(dl->ins.name, "pop", 3) ||
+		    !strncmp(dl->ins.name, "ret", 3))
+			return true;
+	}
+
+	return false;
+}
+
+u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset,
+			struct disasm_line *dl)
+{
+	struct annotation *notes;
+	struct disasm_line *next;
+	u64 addr;
+
+	notes = symbol__annotation(ms->sym);
+	/*
+	 * PC-relative addressing starts from the next instruction address
+	 * But the IP is for the current instruction.  Since disasm_line
+	 * doesn't have the instruction size, calculate it using the next
+	 * disasm_line.  If it's the last one, we can use symbol's end
+	 * address directly.
+	 */
+	if (&dl->al.node == notes->src->source.prev)
+		addr = ms->sym->end + offset;
+	else {
+		next = list_next_entry(dl, al.node);
+		addr = ip + (next->al.offset - dl->al.offset) + offset;
+	}
+	return map__rip_2objdump(ms->map, addr);
+}
+
 /**
  * hist_entry__get_data_type - find data type for given hist entry
  * @he: hist entry
@@ -3709,7 +3819,9 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he)
 	struct annotated_op_loc *op_loc;
 	struct annotated_data_type *mem_type;
 	struct annotated_item_stat *istat;
-	u64 ip = he->ip;
+	u64 ip = he->ip, addr = 0;
+	const char *var_name = NULL;
+	int var_offset;
 	int i;
 
 	ann_data_stat.total++;
@@ -3742,6 +3854,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he)
 		return NULL;
 	}
 
+retry:
 	istat = annotate_data_stat(&ann_insn_stat, dl->ins.name);
 	if (istat == NULL) {
 		ann_data_stat.no_insn++;
@@ -3754,16 +3867,51 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he)
 		return NULL;
 	}
 
+	if (is_stack_operation(arch, dl)) {
+		istat->good++;
+		he->mem_type_off = 0;
+		return &stackop_type;
+	}
+
 	for_each_insn_op_loc(&loc, i, op_loc) {
 		if (!op_loc->mem_ref)
 			continue;
 
-		mem_type = find_data_type(ms, ip, op_loc->reg, op_loc->offset);
+		/* Recalculate IP because of LOCK prefix or insn fusion */
+		ip = ms->sym->start + dl->al.offset;
+
+		var_offset = op_loc->offset;
+
+		/* PC-relative addressing */
+		if (op_loc->reg1 == DWARF_REG_PC) {
+			struct addr_location al;
+			struct symbol *var;
+			u64 map_addr;
+
+			addr = annotate_calc_pcrel(ms, ip, op_loc->offset, dl);
+			/* Kernel symbols might be relocated */
+			map_addr = addr + map__reloc(ms->map);
+
+			addr_location__init(&al);
+			var = thread__find_symbol_fb(he->thread, he->cpumode,
+						     map_addr, &al);
+			if (var) {
+				var_name = var->name;
+				/* Calculate type offset from the start of variable */
+				var_offset = map_addr - map__unmap_ip(al.map, var->start);
+			}
+			addr_location__exit(&al);
+		}
+
+		mem_type = find_data_type(ms, ip, op_loc, addr, var_name);
 		if (mem_type)
 			istat->good++;
 		else
 			istat->bad++;
 
+		if (mem_type && var_name)
+			op_loc->offset = var_offset;
+
 		if (symbol_conf.annotate_data_sample) {
 			annotated_data_type__update_samples(mem_type, evsel,
 							    op_loc->offset,
@@ -3774,6 +3922,20 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he)
 		return mem_type;
 	}
 
+	/*
+	 * Some instructions can be fused and the actual memory access came
+	 * from the previous instruction.
+	 */
+	if (dl->al.offset > 0) {
+		struct disasm_line *prev_dl;
+
+		prev_dl = list_prev_entry(dl, al.node);
+		if (ins__is_fused(arch, prev_dl->ins.name, dl->ins.name)) {
+			dl = prev_dl;
+			goto retry;
+		}
+	}
+
 	ann_data_stat.no_mem_ops++;
 	istat->bad++;
 	return NULL;
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index dba50762c6e8..13cc659e508c 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -12,6 +12,7 @@
 #include "symbol_conf.h"
 #include "mutex.h"
 #include "spark.h"
+#include "hashmap.h"
 
 struct hist_browser_timer;
 struct hist_entry;
@@ -238,12 +239,42 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r
 size_t disasm__fprintf(struct list_head *head, FILE *fp);
 void symbol__calc_percent(struct symbol *sym, struct evsel *evsel);
 
+/**
+ * struct sym_hist - symbol histogram information for an event
+ *
+ * @nr_samples: Total number of samples.
+ * @period: Sum of sample periods.
+ */
 struct sym_hist {
 	u64		      nr_samples;
 	u64		      period;
-	struct sym_hist_entry addr[];
 };
 
+/**
+ * struct cyc_hist - (CPU) cycle histogram for a basic block
+ *
+ * @start: Start address of current block (if known).
+ * @cycles: Sum of cycles for the longest basic block.
+ * @cycles_aggr: Total cycles for this address.
+ * @cycles_max: Max cycles for this address.
+ * @cycles_min: Min cycles for this address.
+ * @cycles_spark: History of cycles for the longest basic block.
+ * @num: Number of samples for the longest basic block.
+ * @num_aggr: Total number of samples for this address.
+ * @have_start: Whether the current branch info has a start address.
+ * @reset: Number of resets due to a different start address.
+ *
+ * If sample has branch_stack and cycles info, it can construct basic blocks
+ * between two adjacent branches.  It'd have start and end addresses but
+ * sometimes the start address may not be available.  So the cycles are
+ * accounted at the end address.  If multiple basic blocks end at the same
+ * address, it will take the longest one.
+ *
+ * The @start, @cycles, @cycles_spark and @num fields are used for the longest
+ * block only.  Other fields are used for all cases.
+ *
+ * See __symbol__account_cycles().
+ */
 struct cyc_hist {
 	u64	start;
 	u64	cycles;
@@ -258,18 +289,24 @@ struct cyc_hist {
 	u16	reset;
 };
 
-/** struct annotated_source - symbols with hits have this attached as in sannotation
+/**
+ * struct annotated_source - symbols with hits have this attached as in annotation
  *
- * @histograms: Array of addr hit histograms per event being monitored
- * nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if
+ * @source: List head for annotated_line (embeded in disasm_line).
+ * @histograms: Array of symbol histograms per event to maintain the total number
+ * 		of samples and period.
+ * @nr_histograms: This may not be the same as evsel->evlist->core.nr_entries if
  * 		  we have more than a group in a evlist, where we will want
  * 		  to see each group separately, that is why symbol__annotate2()
  * 		  sets src->nr_histograms to evsel->nr_members.
- * @lines: If 'print_lines' is specified, per source code line percentages
- * @source: source parsed from a disassembler like objdump -dS
- * @cyc_hist: Average cycles per basic block
+ * @offsets: Array of annotation_line to be accessed by offset.
+ * @samples: Hash map of sym_hist_entry.  Keyed by event index and offset in symbol.
+ * @nr_entries: Number of annotated_line in the source list.
+ * @nr_asm_entries: Number of annotated_line with actual asm instruction in the
+ * 		    source list.
+ * @max_line_len: Maximum length of objdump output in an annotated_line.
  *
- * lines is allocated, percentages calculated and all sorted by percentage
+ * disasm_lines are allocated, percentages calculated and all sorted by percentage
  * when the annotation is about to be presented, so the percentages are for
  * one of the entries in the histogram array, i.e. for the event/counter being
  * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate
@@ -277,15 +314,33 @@ struct cyc_hist {
  */
 struct annotated_source {
 	struct list_head	source;
-	size_t			sizeof_sym_hist;
 	struct sym_hist		*histograms;
 	struct annotation_line	**offsets;
+	struct hashmap	   	*samples;
 	int    			nr_histograms;
 	int			nr_entries;
 	int			nr_asm_entries;
 	u16			max_line_len;
 };
 
+/**
+ * struct annotated_branch - basic block and IPC information for a symbol.
+ *
+ * @hit_cycles: Total executed cycles.
+ * @hit_insn: Total number of instructions executed.
+ * @total_insn: Number of instructions in the function.
+ * @cover_insn: Number of distinct, actually executed instructions.
+ * @cycles_hist: Array of cyc_hist for each instruction.
+ * @max_coverage: Maximum number of covered basic block (used for block-range).
+ *
+ * This struct is used by two different codes when the sample has branch stack
+ * and cycles information.  annotation__compute_ipc() calculates average IPC
+ * using @hit_insn / @hit_cycles.  The actual coverage can be calculated using
+ * @cover_insn / @total_insn.  The @cycles_hist can give IPC for each (longest)
+ * basic block ends at the given address.
+ * process_basic_block() calculates coverage of instructions (or basic blocks)
+ * in the function.
+ */
 struct annotated_branch {
 	u64			hit_cycles;
 	u64			hit_insn;
@@ -346,7 +401,7 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m
 
 static inline struct sym_hist *annotated_source__histogram(struct annotated_source *src, int idx)
 {
-	return ((void *)src->histograms) + (src->sizeof_sym_hist * idx);
+	return &src->histograms[idx];
 }
 
 static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx)
@@ -354,6 +409,17 @@ static inline struct sym_hist *annotation__histogram(struct annotation *notes, i
 	return annotated_source__histogram(notes->src, idx);
 }
 
+static inline struct sym_hist_entry *
+annotated_source__hist_entry(struct annotated_source *src, int idx, u64 offset)
+{
+	struct sym_hist_entry *entry;
+	long key = offset << 16 | idx;
+
+	if (!hashmap__find(src->samples, key, &entry))
+		return NULL;
+	return entry;
+}
+
 static inline struct annotation *symbol__annotation(struct symbol *sym)
 {
 	return (void *)sym - symbol_conf.priv_size;
@@ -442,14 +508,18 @@ int annotate_check_args(void);
 
 /**
  * struct annotated_op_loc - Location info of instruction operand
- * @reg: Register in the operand
+ * @reg1: First register in the operand
+ * @reg2: Second register in the operand
  * @offset: Memory access offset in the operand
  * @mem_ref: Whether the operand accesses memory
+ * @multi_regs: Whether the second register is used
  */
 struct annotated_op_loc {
-	int reg;
+	int reg1;
+	int reg2;
 	int offset;
 	bool mem_ref;
+	bool multi_regs;
 };
 
 enum annotated_insn_ops {
@@ -487,4 +557,8 @@ struct annotated_item_stat {
 };
 extern struct list_head ann_insn_stat;
 
+/* Calculate PC-relative address */
+u64 annotate_calc_pcrel(struct map_symbol *ms, u64 ip, int offset,
+			struct disasm_line *dl);
+
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 3573e0b7ef3e..83709146a48a 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -63,6 +63,7 @@ static int machine__process_bpf_event_load(struct machine *machine,
 			dso->bpf_prog.id = id;
 			dso->bpf_prog.sub_id = i;
 			dso->bpf_prog.env = env;
+			map__put(map);
 		}
 	}
 	return 0;
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 31ff19afc20c..b4cb3fe5cc25 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -179,6 +179,123 @@ int lock_contention_prepare(struct lock_contention *con)
 	return 0;
 }
 
+/*
+ * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end
+ * timestamp in ktime so that it can calculate delta easily.
+ */
+static void mark_end_timestamp(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.flags = BPF_F_TEST_RUN_ON_CPU,
+	);
+	int prog_fd = bpf_program__fd(skel->progs.end_timestamp);
+
+	bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static void update_lock_stat(int map_fd, int pid, u64 end_ts,
+			     enum lock_aggr_mode aggr_mode,
+			     struct tstamp_data *ts_data)
+{
+	u64 delta;
+	struct contention_key stat_key = {};
+	struct contention_data stat_data;
+
+	if (ts_data->timestamp >= end_ts)
+		return;
+
+	delta = end_ts - ts_data->timestamp;
+
+	switch (aggr_mode) {
+	case LOCK_AGGR_CALLER:
+		stat_key.stack_id = ts_data->stack_id;
+		break;
+	case LOCK_AGGR_TASK:
+		stat_key.pid = pid;
+		break;
+	case LOCK_AGGR_ADDR:
+		stat_key.lock_addr_or_cgroup = ts_data->lock;
+		break;
+	case LOCK_AGGR_CGROUP:
+		/* TODO */
+		return;
+	default:
+		return;
+	}
+
+	if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0)
+		return;
+
+	stat_data.total_time += delta;
+	stat_data.count++;
+
+	if (delta > stat_data.max_time)
+		stat_data.max_time = delta;
+	if (delta < stat_data.min_time)
+		stat_data.min_time = delta;
+
+	bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST);
+}
+
+/*
+ * Account entries in the tstamp map (which didn't see the corresponding
+ * lock:contention_end tracepoint) using end_ts.
+ */
+static void account_end_timestamp(struct lock_contention *con)
+{
+	int ts_fd, stat_fd;
+	int *prev_key, key;
+	u64 end_ts = skel->bss->end_ts;
+	int total_cpus;
+	enum lock_aggr_mode aggr_mode = con->aggr_mode;
+	struct tstamp_data ts_data, *cpu_data;
+
+	/* Iterate per-task tstamp map (key = TID) */
+	ts_fd = bpf_map__fd(skel->maps.tstamp);
+	stat_fd = bpf_map__fd(skel->maps.lock_stat);
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
+		if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) {
+			int pid = key;
+
+			if (aggr_mode == LOCK_AGGR_TASK && con->owner)
+				pid = ts_data.flags;
+
+			update_lock_stat(stat_fd, pid, end_ts, aggr_mode,
+					 &ts_data);
+		}
+
+		prev_key = &key;
+	}
+
+	/* Now it'll check per-cpu tstamp map which doesn't have TID. */
+	if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP)
+		return;
+
+	total_cpus = cpu__max_cpu().cpu;
+	ts_fd = bpf_map__fd(skel->maps.tstamp_cpu);
+
+	cpu_data = calloc(total_cpus, sizeof(*cpu_data));
+	if (cpu_data == NULL)
+		return;
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) {
+		if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0)
+			goto next;
+
+		for (int i = 0; i < total_cpus; i++) {
+			update_lock_stat(stat_fd, -1, end_ts, aggr_mode,
+					 &cpu_data[i]);
+		}
+
+next:
+		prev_key = &key;
+	}
+	free(cpu_data);
+}
+
 int lock_contention_start(void)
 {
 	skel->bss->enabled = 1;
@@ -188,6 +305,7 @@ int lock_contention_start(void)
 int lock_contention_stop(void)
 {
 	skel->bss->enabled = 0;
+	mark_end_timestamp();
 	return 0;
 }
 
@@ -210,7 +328,7 @@ static const char *lock_contention_get_name(struct lock_contention *con,
 
 		/* do not update idle comm which contains CPU number */
 		if (pid) {
-			struct thread *t = __machine__findnew_thread(machine, /*pid=*/-1, pid);
+			struct thread *t = machine__findnew_thread(machine, /*pid=*/-1, pid);
 
 			if (t == NULL)
 				return name;
@@ -301,8 +419,10 @@ int lock_contention_read(struct lock_contention *con)
 	if (stack_trace == NULL)
 		return -1;
 
+	account_end_timestamp(con);
+
 	if (con->aggr_mode == LOCK_AGGR_TASK) {
-		struct thread *idle = __machine__findnew_thread(machine,
+		struct thread *idle = machine__findnew_thread(machine,
 								/*pid=*/0,
 								/*tid=*/0);
 		thread__set_comm(idle, "swapper", /*timestamp=*/0);
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 52c270330ae0..2872f9bc0785 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -6,7 +6,7 @@
  * payload expected by the 'perf trace' beautifiers.
  */
 
-#include <linux/bpf.h>
+#include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <linux/limits.h>
 
@@ -22,19 +22,6 @@
 
 #define MAX_CPUS  4096
 
-// FIXME: These should come from system headers
-#ifndef bool
-typedef char bool;
-#endif
-typedef int pid_t;
-typedef long long int __s64;
-typedef __s64 time64_t;
-
-struct timespec64 {
-	time64_t	tv_sec;
-	long int	tv_nsec;
-};
-
 /* bpf-output associated map */
 struct __augmented_syscalls__ {
 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 95cd8414f6ef..d931a898c434 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -19,13 +19,6 @@
 #define LCB_F_PERCPU	(1U << 4)
 #define LCB_F_MUTEX	(1U << 5)
 
-struct tstamp_data {
-	__u64 timestamp;
-	__u64 lock;
-	__u32 flags;
-	__s32 stack_id;
-};
-
 /* callstack storage  */
 struct {
 	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
@@ -140,6 +133,8 @@ int perf_subsys_id = -1;
 /* determine the key of lock stat */
 int aggr_mode;
 
+__u64 end_ts;
+
 /* error stat */
 int task_fail;
 int stack_fail;
@@ -289,6 +284,7 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
 	struct task_struct *curr;
 	struct mm_struct___old *mm_old;
 	struct mm_struct___new *mm_new;
+	struct sighand_struct *sighand;
 
 	switch (flags) {
 	case LCB_F_READ:  /* rwsem */
@@ -310,7 +306,9 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
 		break;
 	case LCB_F_SPIN:  /* spinlock */
 		curr = bpf_get_current_task_btf();
-		if (&curr->sighand->siglock == (void *)lock)
+		sighand = curr->sighand;
+
+		if (sighand && &sighand->siglock == (void *)lock)
 			return LCD_F_SIGHAND_LOCK;
 		break;
 	default:
@@ -559,4 +557,11 @@ int BPF_PROG(collect_lock_syms)
 	return 0;
 }
 
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(end_timestamp)
+{
+	end_ts = bpf_ktime_get_ns();
+	return 0;
+}
+
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 08482daf61be..36af11faad03 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -3,6 +3,13 @@
 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
+struct tstamp_data {
+	u64 timestamp;
+	u64 lock;
+	u32 flags;
+	u32 stack_id;
+};
+
 struct contention_key {
 	u32 stack_id;
 	u32 pid;
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
index ab84a6e1da5e..e9028235d771 100644
--- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -20,6 +20,13 @@ typedef __s64 s64;
 
 typedef int pid_t;
 
+typedef __s64 time64_t;
+
+struct timespec64 {
+        time64_t        tv_sec;
+        long int        tv_nsec;
+};
+
 enum cgroup_subsys_id {
 	perf_event_cgrp_id  = 8,
 };
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 8262f69118db..7517d16c02ec 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1157,7 +1157,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *
 		if (al->map == NULL)
 			goto out;
 	}
-	if (RC_CHK_EQUAL(al->maps, machine__kernel_maps(machine))) {
+	if (maps__equal(al->maps, machine__kernel_maps(machine))) {
 		if (machine__is_host(machine)) {
 			al->cpumode = PERF_RECORD_MISC_KERNEL;
 			al->level = 'k';
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 0581ee0fa5f2..356e30c42cd8 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -222,6 +222,8 @@ static int aggr_cpu_id__cmp(const void *a_pointer, const void *b_pointer)
 		return a->socket - b->socket;
 	else if (a->die != b->die)
 		return a->die - b->die;
+	else if (a->cluster != b->cluster)
+		return a->cluster - b->cluster;
 	else if (a->cache_lvl != b->cache_lvl)
 		return a->cache_lvl - b->cache_lvl;
 	else if (a->cache != b->cache)
@@ -309,6 +311,30 @@ struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data)
 	return id;
 }
 
+int cpu__get_cluster_id(struct perf_cpu cpu)
+{
+	int value, ret = cpu__get_topology_int(cpu.cpu, "cluster_id", &value);
+
+	return ret ?: value;
+}
+
+struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data)
+{
+	int cluster = cpu__get_cluster_id(cpu);
+	struct aggr_cpu_id id;
+
+	/* There is no cluster_id on legacy system. */
+	if (cluster == -1)
+		cluster = 0;
+
+	id = aggr_cpu_id__die(cpu, data);
+	if (aggr_cpu_id__is_empty(&id))
+		return id;
+
+	id.cluster = cluster;
+	return id;
+}
+
 int cpu__get_core_id(struct perf_cpu cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu.cpu, "core_id", &value);
@@ -320,8 +346,8 @@ struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data)
 	struct aggr_cpu_id id;
 	int core = cpu__get_core_id(cpu);
 
-	/* aggr_cpu_id__die returns a struct with socket and die set. */
-	id = aggr_cpu_id__die(cpu, data);
+	/* aggr_cpu_id__die returns a struct with socket die, and cluster set. */
+	id = aggr_cpu_id__cluster(cpu, data);
 	if (aggr_cpu_id__is_empty(&id))
 		return id;
 
@@ -683,6 +709,7 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b
 		a->node == b->node &&
 		a->socket == b->socket &&
 		a->die == b->die &&
+		a->cluster == b->cluster &&
 		a->cache_lvl == b->cache_lvl &&
 		a->cache == b->cache &&
 		a->core == b->core &&
@@ -695,6 +722,7 @@ bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
 		a->node == -1 &&
 		a->socket == -1 &&
 		a->die == -1 &&
+		a->cluster == -1 &&
 		a->cache_lvl == -1 &&
 		a->cache == -1 &&
 		a->core == -1 &&
@@ -708,6 +736,7 @@ struct aggr_cpu_id aggr_cpu_id__empty(void)
 		.node = -1,
 		.socket = -1,
 		.die = -1,
+		.cluster = -1,
 		.cache_lvl = -1,
 		.cache = -1,
 		.core = -1,
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 9df2aeb34d3d..26cf76c693f5 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -20,6 +20,8 @@ struct aggr_cpu_id {
 	int socket;
 	/** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */
 	int die;
+	/** The cluster id as read from /sys/devices/system/cpu/cpuX/topology/cluster_id */
+	int cluster;
 	/** The cache level as read from /sys/devices/system/cpu/cpuX/cache/indexY/level */
 	int cache_lvl;
 	/**
@@ -87,6 +89,11 @@ int cpu__get_socket_id(struct perf_cpu cpu);
  */
 int cpu__get_die_id(struct perf_cpu cpu);
 /**
+ * cpu__get_cluster_id - Returns the cluster id as read from
+ * /sys/devices/system/cpu/cpuX/topology/cluster_id for the given CPU
+ */
+int cpu__get_cluster_id(struct perf_cpu cpu);
+/**
  * cpu__get_core_id - Returns the core id as read from
  * /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU.
  */
@@ -127,9 +134,15 @@ struct aggr_cpu_id aggr_cpu_id__socket(struct perf_cpu cpu, void *data);
  */
 struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data);
 /**
- * aggr_cpu_id__core - Create an aggr_cpu_id with the core, die and socket
- * populated with the core, die and socket for cpu. The function signature is
- * compatible with aggr_cpu_id_get_t.
+ * aggr_cpu_id__cluster - Create an aggr_cpu_id with cluster, die and socket
+ * populated with the cluster, die and socket for cpu. The function signature
+ * is compatible with aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data);
+/**
+ * aggr_cpu_id__core - Create an aggr_cpu_id with the core, cluster, die and
+ * socket populated with the core, die and socket for cpu. The function
+ * signature is compatible with aggr_cpu_id_get_t.
  */
 struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data);
 /**
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 5bb3c2ba95ca..09d57efd2d9d 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -284,7 +284,9 @@ static void output_headers(struct perf_session *session, struct convert_json *c)
 	output_json_key_string(out, true, 2, "os-release", header->env.os_release);
 	output_json_key_string(out, true, 2, "arch", header->env.arch);
 
-	output_json_key_string(out, true, 2, "cpu-desc", header->env.cpu_desc);
+	if (header->env.cpu_desc)
+		output_json_key_string(out, true, 2, "cpu-desc", header->env.cpu_desc);
+
 	output_json_key_string(out, true, 2, "cpuid", header->env.cpuid);
 	output_json_key_format(out, true, 2, "nrcpus-online", "%u", header->env.nr_cpus_online);
 	output_json_key_format(out, true, 2, "nrcpus-avail", "%u", header->env.nr_cpus_avail);
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index c29d8a382b19..08c4bfbd817f 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -413,7 +413,7 @@ ssize_t perf_data_file__write(struct perf_data_file *file,
 }
 
 ssize_t perf_data__write(struct perf_data *data,
-			      void *buf, size_t size)
+			 void *buf, size_t size)
 {
 	if (data->use_stdio) {
 		if (fwrite(buf, size, 1, data->file.fptr) == 1)
@@ -424,14 +424,12 @@ ssize_t perf_data__write(struct perf_data *data,
 }
 
 int perf_data__switch(struct perf_data *data,
-			   const char *postfix,
-			   size_t pos, bool at_exit,
-			   char **new_filepath)
+		      const char *postfix,
+		      size_t pos, bool at_exit,
+		      char **new_filepath)
 {
 	int ret;
 
-	if (check_pipe(data))
-		return -EINVAL;
 	if (perf_data__is_read(data))
 		return -EINVAL;
 
diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
index effcc195d7e9..110f3ebde30f 100644
--- a/tools/perf/util/data.h
+++ b/tools/perf/util/data.h
@@ -80,7 +80,7 @@ int perf_data__open(struct perf_data *data);
 void perf_data__close(struct perf_data *data);
 ssize_t perf_data__read(struct perf_data *data, void *buf, size_t size);
 ssize_t perf_data__write(struct perf_data *data,
-			      void *buf, size_t size);
+			 void *buf, size_t size);
 ssize_t perf_data_file__write(struct perf_data_file *file,
 			      void *buf, size_t size);
 /*
@@ -91,8 +91,8 @@ ssize_t perf_data_file__write(struct perf_data_file *file,
  * Return value is fd of new output.
  */
 int perf_data__switch(struct perf_data *data,
-			   const char *postfix,
-			   size_t pos, bool at_exit, char **new_filepath);
+		      const char *postfix,
+		      size_t pos, bool at_exit, char **new_filepath);
 
 int perf_data__create_dir(struct perf_data *data, int nr);
 int perf_data__open_dir(struct perf_data *data);
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index e282b4ceb4d2..c39ee0fcb8cf 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -33,6 +33,7 @@
 #endif
 
 int verbose;
+int debug_kmaps;
 int debug_peo_args;
 bool dump_trace = false, quiet = false;
 int debug_ordered_events;
@@ -229,6 +230,7 @@ static struct sublevel_option debug_opts[] = {
 	{ .name = "stderr",		.value_ptr = &redirect_to_stderr},
 	{ .name = "data-convert",	.value_ptr = &debug_data_convert },
 	{ .name = "perf-event-open",	.value_ptr = &debug_peo_args },
+	{ .name = "kmaps",		.value_ptr = &debug_kmaps },
 	{ .name = NULL, }
 };
 
@@ -267,6 +269,7 @@ int perf_quiet_option(void)
 	/* For debug variables that are used as bool types, set to 0. */
 	redirect_to_stderr = 0;
 	debug_peo_args = 0;
+	debug_kmaps = 0;
 
 	return 0;
 }
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index de8870980d44..35a7a5ae762e 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 
 extern int verbose;
+extern int debug_kmaps;
 extern int debug_peo_args;
 extern bool quiet, dump_trace;
 extern int debug_ordered_events;
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 7aa5fee0da19..2791126069b4 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -1272,11 +1272,67 @@ struct find_var_data {
 	unsigned reg;
 	/* Access offset, set for global data */
 	int offset;
+	/* True if the current register is the frame base */
+	bool is_fbreg;
 };
 
 /* Max number of registers DW_OP_regN supports */
 #define DWARF_OP_DIRECT_REGS  32
 
+static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data,
+			     u64 addr_offset, u64 addr_type)
+{
+	Dwarf_Die type_die;
+	Dwarf_Word size;
+
+	if (addr_offset == addr_type) {
+		/* Update offset relative to the start of the variable */
+		data->offset = 0;
+		return true;
+	}
+
+	if (die_get_real_type(die_mem, &type_die) == NULL)
+		return false;
+
+	if (dwarf_aggregate_size(&type_die, &size) < 0)
+		return false;
+
+	if (addr_offset >= addr_type + size)
+		return false;
+
+	/* Update offset relative to the start of the variable */
+	data->offset = addr_offset - addr_type;
+	return true;
+}
+
+static bool check_allowed_ops(Dwarf_Op *ops, size_t nops)
+{
+	/* The first op is checked separately */
+	ops++;
+	nops--;
+
+	/*
+	 * It needs to make sure if the location expression matches to the given
+	 * register and offset exactly.  Thus it rejects any complex expressions
+	 * and only allows a few of selected operators that doesn't change the
+	 * location.
+	 */
+	while (nops) {
+		switch (ops->atom) {
+		case DW_OP_stack_value:
+		case DW_OP_deref_size:
+		case DW_OP_deref:
+		case DW_OP_piece:
+			break;
+		default:
+			return false;
+		}
+		ops++;
+		nops--;
+	}
+	return true;
+}
+
 /* Only checks direct child DIEs in the given scope. */
 static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg)
 {
@@ -1301,13 +1357,35 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg)
 		if (start > data->pc)
 			break;
 
+		/* Local variables accessed using frame base register */
+		if (data->is_fbreg && ops->atom == DW_OP_fbreg &&
+		    data->offset >= (int)ops->number &&
+		    check_allowed_ops(ops, nops) &&
+		    match_var_offset(die_mem, data, data->offset, ops->number))
+			return DIE_FIND_CB_END;
+
 		/* Only match with a simple case */
 		if (data->reg < DWARF_OP_DIRECT_REGS) {
-			if (ops->atom == (DW_OP_reg0 + data->reg) && nops == 1)
+			/* pointer variables saved in a register 0 to 31 */
+			if (ops->atom == (DW_OP_reg0 + data->reg) &&
+			    check_allowed_ops(ops, nops))
+				return DIE_FIND_CB_END;
+
+			/* Local variables accessed by a register + offset */
+			if (ops->atom == (DW_OP_breg0 + data->reg) &&
+			    check_allowed_ops(ops, nops) &&
+			    match_var_offset(die_mem, data, data->offset, ops->number))
 				return DIE_FIND_CB_END;
 		} else {
+			/* pointer variables saved in a register 32 or above */
 			if (ops->atom == DW_OP_regx && ops->number == data->reg &&
-			    nops == 1)
+			    check_allowed_ops(ops, nops))
+				return DIE_FIND_CB_END;
+
+			/* Local variables accessed by a register + offset */
+			if (ops->atom == DW_OP_bregx && data->reg == ops->number &&
+			    check_allowed_ops(ops, nops) &&
+			    match_var_offset(die_mem, data, data->offset, ops->number2))
 				return DIE_FIND_CB_END;
 		}
 	}
@@ -1319,18 +1397,29 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg)
  * @sc_die: a scope DIE
  * @pc: the program address to find
  * @reg: the register number to find
+ * @poffset: pointer to offset, will be updated for fbreg case
+ * @is_fbreg: boolean value if the current register is the frame base
  * @die_mem: a buffer to save the resulting DIE
  *
- * Find the variable DIE accessed by the given register.
+ * Find the variable DIE accessed by the given register.  It'll update the @offset
+ * when the variable is in the stack.
  */
 Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg,
+				    int *poffset, bool is_fbreg,
 				    Dwarf_Die *die_mem)
 {
 	struct find_var_data data = {
 		.pc = pc,
 		.reg = reg,
+		.offset = *poffset,
+		.is_fbreg = is_fbreg,
 	};
-	return die_find_child(sc_die, __die_find_var_reg_cb, &data, die_mem);
+	Dwarf_Die *result;
+
+	result = die_find_child(sc_die, __die_find_var_reg_cb, &data, die_mem);
+	if (result)
+		*poffset = data.offset;
+	return result;
 }
 
 /* Only checks direct child DIEs in the given scope */
@@ -1341,8 +1430,6 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg)
 	ptrdiff_t off = 0;
 	Dwarf_Attribute attr;
 	Dwarf_Addr base, start, end;
-	Dwarf_Word size;
-	Dwarf_Die type_die;
 	Dwarf_Op *ops;
 	size_t nops;
 
@@ -1359,24 +1446,9 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg)
 		if (data->addr < ops->number)
 			continue;
 
-		if (data->addr == ops->number) {
-			/* Update offset relative to the start of the variable */
-			data->offset = 0;
+		if (check_allowed_ops(ops, nops) &&
+		    match_var_offset(die_mem, data, data->addr, ops->number))
 			return DIE_FIND_CB_END;
-		}
-
-		if (die_get_real_type(die_mem, &type_die) == NULL)
-			continue;
-
-		if (dwarf_aggregate_size(&type_die, &size) < 0)
-			continue;
-
-		if (data->addr >= ops->number + size)
-			continue;
-
-		/* Update offset relative to the start of the variable */
-		data->offset = data->addr - ops->number;
-		return DIE_FIND_CB_END;
 	}
 	return DIE_FIND_CB_SIBLING;
 }
@@ -1407,7 +1479,74 @@ Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc,
 		*offset = data.offset;
 	return result;
 }
-#endif
+#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */
+
+#ifdef HAVE_DWARF_CFI_SUPPORT
+static int reg_from_dwarf_op(Dwarf_Op *op)
+{
+	switch (op->atom) {
+	case DW_OP_reg0 ... DW_OP_reg31:
+		return op->atom - DW_OP_reg0;
+	case DW_OP_breg0 ... DW_OP_breg31:
+		return op->atom - DW_OP_breg0;
+	case DW_OP_regx:
+	case DW_OP_bregx:
+		return op->number;
+	default:
+		break;
+	}
+	return -1;
+}
+
+static int offset_from_dwarf_op(Dwarf_Op *op)
+{
+	switch (op->atom) {
+	case DW_OP_reg0 ... DW_OP_reg31:
+	case DW_OP_regx:
+		return 0;
+	case DW_OP_breg0 ... DW_OP_breg31:
+		return op->number;
+	case DW_OP_bregx:
+		return op->number2;
+	default:
+		break;
+	}
+	return -1;
+}
+
+/**
+ * die_get_cfa - Get frame base information
+ * @dwarf: a Dwarf info
+ * @pc: program address
+ * @preg: pointer for saved register
+ * @poffset: pointer for saved offset
+ *
+ * This function gets register and offset for CFA (Canonical Frame Address)
+ * by searching the CIE/FDE info.  The CFA usually points to the start address
+ * of the current stack frame and local variables can be located using an offset
+ * from the CFA.  The @preg and @poffset will be updated if it returns 0.
+ */
+int die_get_cfa(Dwarf *dwarf, u64 pc, int *preg, int *poffset)
+{
+	Dwarf_CFI *cfi;
+	Dwarf_Frame *frame = NULL;
+	Dwarf_Op *ops = NULL;
+	size_t nops;
+
+	cfi = dwarf_getcfi(dwarf);
+	if (cfi == NULL)
+		return -1;
+
+	if (!dwarf_cfi_addrframe(cfi, pc, &frame) &&
+	    !dwarf_frame_cfa(frame, &ops, &nops) &&
+	    check_allowed_ops(ops, nops)) {
+		*preg = reg_from_dwarf_op(ops);
+		*poffset = offset_from_dwarf_op(ops);
+		return 0;
+	}
+	return -1;
+}
+#endif /* HAVE_DWARF_CFI_SUPPORT */
 
 /*
  * die_has_loclist - Check if DW_AT_location of @vr_die is a location list
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 4e64caac6df8..85dd527ae1f7 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -142,6 +142,7 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf);
 
 /* Find a variable saved in the 'reg' at given address */
 Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg,
+				    int *poffset, bool is_fbreg,
 				    Dwarf_Die *die_mem);
 
 /* Find a (global) variable located in the 'addr' */
@@ -161,6 +162,8 @@ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused,
 static inline Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die __maybe_unused,
 						  Dwarf_Addr pc __maybe_unused,
 						  int reg __maybe_unused,
+						  int *poffset __maybe_unused,
+						  bool is_fbreg __maybe_unused,
 						  Dwarf_Die *die_mem __maybe_unused)
 {
 	return NULL;
@@ -177,4 +180,19 @@ static inline Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die __maybe_unu
 
 #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */
 
+#ifdef HAVE_DWARF_CFI_SUPPORT
+
+/* Get the frame base information from CFA */
+int die_get_cfa(Dwarf *dwarf, u64 pc, int *preg, int *poffset);
+
+#else /* HAVE_DWARF_CFI_SUPPORT */
+
+static inline int die_get_cfa(Dwarf *dwarf __maybe_unused, u64 pc __maybe_unused,
+			      int *preg __maybe_unused, int *poffset __maybe_unused)
+{
+	return -1;
+}
+
+#endif /* HAVE_DWARF_CFI_SUPPORT */
+
 #endif /* _DWARF_AUX_H */
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 7c527e65c186..2a2c37cc40b7 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -12,6 +12,7 @@ struct perf_cpu_map;
 struct cpu_topology_map {
 	int	socket_id;
 	int	die_id;
+	int	cluster_id;
 	int	core_id;
 };
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 68f45e9e63b6..198903157f9e 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -511,7 +511,7 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma
 		struct addr_location al;
 
 		addr_location__init(&al);
-		al.map = map__get(maps__find(machine__kernel_maps(machine), tp->addr));
+		al.map = maps__find(machine__kernel_maps(machine), tp->addr);
 		if (al.map && map__load(al.map) >= 0) {
 			al.addr = map__map_ip(al.map, tp->addr);
 			al.sym = map__find_symbol(al.map, al.addr);
@@ -641,7 +641,7 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
 		return NULL;
 	}
 	al->maps = maps__get(maps);
-	al->map = map__get(maps__find(maps, al->addr));
+	al->map = maps__find(maps, al->addr);
 	if (al->map != NULL) {
 		/*
 		 * Kernel maps might be changed when loading symbols so loading
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 6d7c9c58a9bc..3536404e9447 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2363,7 +2363,6 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 	data->period = evsel->core.attr.sample_period;
 	data->cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 	data->misc    = event->header.misc;
-	data->id = -1ULL;
 	data->data_src = PERF_MEM_DATA_SRC_NONE;
 	data->vcpu = -1;
 
@@ -2851,6 +2850,39 @@ u64 evsel__intval_common(struct evsel *evsel, struct perf_sample *sample, const
 	return field ? format_field__intval(field, sample, evsel->needs_swap) : 0;
 }
 
+char evsel__taskstate(struct evsel *evsel, struct perf_sample *sample, const char *name)
+{
+	static struct tep_format_field *prev_state_field;
+	static const char *states;
+	struct tep_format_field *field;
+	unsigned long long val;
+	unsigned int bit;
+	char state = '?'; /* '?' denotes unknown task state */
+
+	field = evsel__field(evsel, name);
+
+	if (!field)
+		return state;
+
+	if (!states || field != prev_state_field) {
+		states = parse_task_states(field);
+		if (!states)
+			return state;
+		prev_state_field = field;
+	}
+
+	/*
+	 * Note since the kernel exposes TASK_REPORT_MAX to userspace
+	 * to denote the 'preempted' state, we might as welll report
+	 * 'R' for this case, which make senses to users as well.
+	 *
+	 * We can change this if we have a good reason in the future.
+	 */
+	val = evsel__intval(evsel, sample, name);
+	bit = val ? ffs(val) : 0;
+	state = (!bit || bit > strlen(states)) ? 'R' : states[bit-1];
+	return state;
+}
 #endif
 
 bool evsel__fallback(struct evsel *evsel, struct target *target, int err,
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index efbb6e848287..517cff431de2 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -339,6 +339,7 @@ struct perf_sample;
 void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char *name);
 u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *name);
 u64 evsel__intval_common(struct evsel *evsel, struct perf_sample *sample, const char *name);
+char evsel__taskstate(struct evsel *evsel, struct perf_sample *sample, const char *name);
 
 static inline char *evsel__strval(struct evsel *evsel, struct perf_sample *sample, const char *name)
 {
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 7be23b3ac082..b8875aac8f87 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -500,7 +500,25 @@ double expr__has_event(const struct expr_parse_ctx *ctx, bool compute_ids, const
 	tmp = evlist__new();
 	if (!tmp)
 		return NAN;
-	ret = parse_event(tmp, id) ? 0 : 1;
+
+	if (strchr(id, '@')) {
+		char *tmp_id, *p;
+
+		tmp_id = strdup(id);
+		if (!tmp_id) {
+			ret = NAN;
+			goto out;
+		}
+		p = strchr(tmp_id, '@');
+		*p = '/';
+		p = strrchr(tmp_id, '@');
+		*p = '/';
+		ret = parse_event(tmp, tmp_id) ? 0 : 1;
+		free(tmp_id);
+	} else {
+		ret = parse_event(tmp, id) ? 0 : 1;
+	}
+out:
 	evlist__delete(tmp);
 	return ret;
 }
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
index 0feef0726c48..a2fc43159ee9 100644
--- a/tools/perf/util/expr.l
+++ b/tools/perf/util/expr.l
@@ -94,6 +94,14 @@ static int literal(yyscan_t scanner, const struct expr_scanner_ctx *sctx)
 	}
 	return LITERAL;
 }
+
+static int nan_value(yyscan_t scanner)
+{
+	YYSTYPE *yylval = expr_get_lval(scanner);
+
+	yylval->num = NAN;
+	return NUMBER;
+}
 %}
 
 number		([0-9]+\.?[0-9]*|[0-9]*\.?[0-9]+)(e-?[0-9]+)?
@@ -115,6 +123,7 @@ else		{ return ELSE; }
 source_count	{ return SOURCE_COUNT; }
 has_event	{ return HAS_EVENT; }
 strcmp_cpuid_str	{ return STRCMP_CPUID_STR; }
+NaN		{ return nan_value(yyscanner); }
 {literal}	{ return literal(yyscanner, sctx); }
 {number}	{ return value(yyscanner); }
 {symbol}	{ return str(yyscanner, ID, sctx->runtime); }
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b397a769006f..527517db3182 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -43,9 +43,6 @@
 #include <linux/string.h>
 #include <linux/zalloc.h>
 
-static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd,
-				     struct thread *th, bool lock);
-
 static struct dso *machine__kernel_dso(struct machine *machine)
 {
 	return map__dso(machine->vmlinux_map);
@@ -58,35 +55,6 @@ static void dsos__init(struct dsos *dsos)
 	init_rwsem(&dsos->lock);
 }
 
-static void machine__threads_init(struct machine *machine)
-{
-	int i;
-
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		struct threads *threads = &machine->threads[i];
-		threads->entries = RB_ROOT_CACHED;
-		init_rwsem(&threads->lock);
-		threads->nr = 0;
-		threads->last_match = NULL;
-	}
-}
-
-static int thread_rb_node__cmp_tid(const void *key, const struct rb_node *nd)
-{
-	int to_find = (int) *((pid_t *)key);
-
-	return to_find - (int)thread__tid(rb_entry(nd, struct thread_rb_node, rb_node)->thread);
-}
-
-static struct thread_rb_node *thread_rb_node__find(const struct thread *th,
-						   struct rb_root *tree)
-{
-	pid_t to_find = thread__tid(th);
-	struct rb_node *nd = rb_find(&to_find, tree, thread_rb_node__cmp_tid);
-
-	return rb_entry(nd, struct thread_rb_node, rb_node);
-}
-
 static int machine__set_mmap_name(struct machine *machine)
 {
 	if (machine__is_host(machine))
@@ -120,7 +88,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
 	RB_CLEAR_NODE(&machine->rb_node);
 	dsos__init(&machine->dsos);
 
-	machine__threads_init(machine);
+	threads__init(&machine->threads);
 
 	machine->vdso_info = NULL;
 	machine->env = NULL;
@@ -221,27 +189,11 @@ static void dsos__exit(struct dsos *dsos)
 
 void machine__delete_threads(struct machine *machine)
 {
-	struct rb_node *nd;
-	int i;
-
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		struct threads *threads = &machine->threads[i];
-		down_write(&threads->lock);
-		nd = rb_first_cached(&threads->entries);
-		while (nd) {
-			struct thread_rb_node *trb = rb_entry(nd, struct thread_rb_node, rb_node);
-
-			nd = rb_next(nd);
-			__machine__remove_thread(machine, trb, trb->thread, false);
-		}
-		up_write(&threads->lock);
-	}
+	threads__remove_all_threads(&machine->threads);
 }
 
 void machine__exit(struct machine *machine)
 {
-	int i;
-
 	if (machine == NULL)
 		return;
 
@@ -254,12 +206,7 @@ void machine__exit(struct machine *machine)
 	zfree(&machine->current_tid);
 	zfree(&machine->kallsyms_filename);
 
-	machine__delete_threads(machine);
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		struct threads *threads = &machine->threads[i];
-
-		exit_rwsem(&threads->lock);
-	}
+	threads__exit(&machine->threads);
 }
 
 void machine__delete(struct machine *machine)
@@ -440,7 +387,7 @@ static struct thread *findnew_guest_code(struct machine *machine,
 		return NULL;
 
 	/* Assume maps are set up if there are any */
-	if (maps__nr_maps(thread__maps(thread)))
+	if (!maps__empty(thread__maps(thread)))
 		return thread;
 
 	host_thread = machine__find_thread(host_machine, -1, pid);
@@ -526,7 +473,7 @@ static void machine__update_thread_pid(struct machine *machine,
 	if (thread__pid(th) == thread__tid(th))
 		return;
 
-	leader = __machine__findnew_thread(machine, thread__pid(th), thread__pid(th));
+	leader = machine__findnew_thread(machine, thread__pid(th), thread__pid(th));
 	if (!leader)
 		goto out_err;
 
@@ -561,159 +508,55 @@ out_err:
 }
 
 /*
- * Front-end cache - TID lookups come in blocks,
- * so most of the time we dont have to look up
- * the full rbtree:
- */
-static struct thread*
-__threads__get_last_match(struct threads *threads, struct machine *machine,
-			  int pid, int tid)
-{
-	struct thread *th;
-
-	th = threads->last_match;
-	if (th != NULL) {
-		if (thread__tid(th) == tid) {
-			machine__update_thread_pid(machine, th, pid);
-			return thread__get(th);
-		}
-		thread__put(threads->last_match);
-		threads->last_match = NULL;
-	}
-
-	return NULL;
-}
-
-static struct thread*
-threads__get_last_match(struct threads *threads, struct machine *machine,
-			int pid, int tid)
-{
-	struct thread *th = NULL;
-
-	if (perf_singlethreaded)
-		th = __threads__get_last_match(threads, machine, pid, tid);
-
-	return th;
-}
-
-static void
-__threads__set_last_match(struct threads *threads, struct thread *th)
-{
-	thread__put(threads->last_match);
-	threads->last_match = thread__get(th);
-}
-
-static void
-threads__set_last_match(struct threads *threads, struct thread *th)
-{
-	if (perf_singlethreaded)
-		__threads__set_last_match(threads, th);
-}
-
-/*
  * Caller must eventually drop thread->refcnt returned with a successful
  * lookup/new thread inserted.
  */
-static struct thread *____machine__findnew_thread(struct machine *machine,
-						  struct threads *threads,
-						  pid_t pid, pid_t tid,
-						  bool create)
+static struct thread *__machine__findnew_thread(struct machine *machine,
+						pid_t pid,
+						pid_t tid,
+						bool create)
 {
-	struct rb_node **p = &threads->entries.rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct thread *th;
-	struct thread_rb_node *nd;
-	bool leftmost = true;
+	struct thread *th = threads__find(&machine->threads, tid);
+	bool created;
 
-	th = threads__get_last_match(threads, machine, pid, tid);
-	if (th)
+	if (th) {
+		machine__update_thread_pid(machine, th, pid);
 		return th;
-
-	while (*p != NULL) {
-		parent = *p;
-		th = rb_entry(parent, struct thread_rb_node, rb_node)->thread;
-
-		if (thread__tid(th) == tid) {
-			threads__set_last_match(threads, th);
-			machine__update_thread_pid(machine, th, pid);
-			return thread__get(th);
-		}
-
-		if (tid < thread__tid(th))
-			p = &(*p)->rb_left;
-		else {
-			p = &(*p)->rb_right;
-			leftmost = false;
-		}
 	}
-
 	if (!create)
 		return NULL;
 
-	th = thread__new(pid, tid);
-	if (th == NULL)
-		return NULL;
-
-	nd = malloc(sizeof(*nd));
-	if (nd == NULL) {
-		thread__put(th);
-		return NULL;
-	}
-	nd->thread = th;
-
-	rb_link_node(&nd->rb_node, parent, p);
-	rb_insert_color_cached(&nd->rb_node, &threads->entries, leftmost);
-	/*
-	 * We have to initialize maps separately after rb tree is updated.
-	 *
-	 * The reason is that we call machine__findnew_thread within
-	 * thread__init_maps to find the thread leader and that would screwed
-	 * the rb tree.
-	 */
-	if (thread__init_maps(th, machine)) {
-		pr_err("Thread init failed thread %d\n", pid);
-		rb_erase_cached(&nd->rb_node, &threads->entries);
-		RB_CLEAR_NODE(&nd->rb_node);
-		free(nd);
-		thread__put(th);
-		return NULL;
-	}
-	/*
-	 * It is now in the rbtree, get a ref
-	 */
-	threads__set_last_match(threads, th);
-	++threads->nr;
-
-	return thread__get(th);
-}
+	th = threads__findnew(&machine->threads, pid, tid, &created);
+	if (created) {
+		/*
+		 * We have to initialize maps separately after rb tree is
+		 * updated.
+		 *
+		 * The reason is that we call machine__findnew_thread within
+		 * thread__init_maps to find the thread leader and that would
+		 * screwed the rb tree.
+		 */
+		if (thread__init_maps(th, machine)) {
+			pr_err("Thread init failed thread %d\n", pid);
+			threads__remove(&machine->threads, th);
+			thread__put(th);
+			return NULL;
+		}
+	} else
+		machine__update_thread_pid(machine, th, pid);
 
-struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid)
-{
-	return ____machine__findnew_thread(machine, machine__threads(machine, tid), pid, tid, true);
+	return th;
 }
 
-struct thread *machine__findnew_thread(struct machine *machine, pid_t pid,
-				       pid_t tid)
+struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid)
 {
-	struct threads *threads = machine__threads(machine, tid);
-	struct thread *th;
-
-	down_write(&threads->lock);
-	th = __machine__findnew_thread(machine, pid, tid);
-	up_write(&threads->lock);
-	return th;
+	return __machine__findnew_thread(machine, pid, tid, /*create=*/true);
 }
 
 struct thread *machine__find_thread(struct machine *machine, pid_t pid,
 				    pid_t tid)
 {
-	struct threads *threads = machine__threads(machine, tid);
-	struct thread *th;
-
-	down_read(&threads->lock);
-	th =  ____machine__findnew_thread(machine, threads, pid, tid, false);
-	up_read(&threads->lock);
-	return th;
+	return __machine__findnew_thread(machine, pid, tid, /*create=*/false);
 }
 
 /*
@@ -896,7 +739,6 @@ static int machine__process_ksymbol_register(struct machine *machine,
 	struct symbol *sym;
 	struct dso *dso;
 	struct map *map = maps__find(machine__kernel_maps(machine), event->ksymbol.addr);
-	bool put_map = false;
 	int err = 0;
 
 	if (!map) {
@@ -913,12 +755,6 @@ static int machine__process_ksymbol_register(struct machine *machine,
 			err = -ENOMEM;
 			goto out;
 		}
-		/*
-		 * The inserted map has a get on it, we need to put to release
-		 * the reference count here, but do it after all accesses are
-		 * done.
-		 */
-		put_map = true;
 		if (event->ksymbol.ksym_type == PERF_RECORD_KSYMBOL_TYPE_OOL) {
 			dso->binary_type = DSO_BINARY_TYPE__OOL;
 			dso->data.file_size = event->ksymbol.len;
@@ -952,8 +788,7 @@ static int machine__process_ksymbol_register(struct machine *machine,
 	}
 	dso__insert_symbol(dso, sym);
 out:
-	if (put_map)
-		map__put(map);
+	map__put(map);
 	return err;
 }
 
@@ -977,7 +812,7 @@ static int machine__process_ksymbol_unregister(struct machine *machine,
 		if (sym)
 			dso__delete_symbol(dso, sym);
 	}
-
+	map__put(map);
 	return 0;
 }
 
@@ -1005,11 +840,11 @@ int machine__process_text_poke(struct machine *machine, union perf_event *event,
 		perf_event__fprintf_text_poke(event, machine, stdout);
 
 	if (!event->text_poke.new_len)
-		return 0;
+		goto out;
 
 	if (cpumode != PERF_RECORD_MISC_KERNEL) {
 		pr_debug("%s: unsupported cpumode - ignoring\n", __func__);
-		return 0;
+		goto out;
 	}
 
 	if (dso) {
@@ -1032,7 +867,8 @@ int machine__process_text_poke(struct machine *machine, union perf_event *event,
 		pr_debug("Failed to find kernel text poke address map for %#" PRI_lx64 "\n",
 			 event->text_poke.addr);
 	}
-
+out:
+	map__put(map);
 	return 0;
 }
 
@@ -1120,29 +956,30 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp)
 	return printed;
 }
 
-size_t machine__fprintf(struct machine *machine, FILE *fp)
-{
-	struct rb_node *nd;
-	size_t ret;
-	int i;
-
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		struct threads *threads = &machine->threads[i];
-
-		down_read(&threads->lock);
+struct machine_fprintf_cb_args {
+	FILE *fp;
+	size_t printed;
+};
 
-		ret = fprintf(fp, "Threads: %u\n", threads->nr);
+static int machine_fprintf_cb(struct thread *thread, void *data)
+{
+	struct machine_fprintf_cb_args *args = data;
 
-		for (nd = rb_first_cached(&threads->entries); nd;
-		     nd = rb_next(nd)) {
-			struct thread *pos = rb_entry(nd, struct thread_rb_node, rb_node)->thread;
+	/* TODO: handle fprintf errors. */
+	args->printed += thread__fprintf(thread, args->fp);
+	return 0;
+}
 
-			ret += thread__fprintf(pos, fp);
-		}
+size_t machine__fprintf(struct machine *machine, FILE *fp)
+{
+	struct machine_fprintf_cb_args args = {
+		.fp = fp,
+		.printed = 0,
+	};
+	size_t ret = fprintf(fp, "Threads: %zu\n", threads__nr(&machine->threads));
 
-		up_read(&threads->lock);
-	}
-	return ret;
+	machine__for_each_thread(machine, machine_fprintf_cb, &args);
+	return ret + args.printed;
 }
 
 static struct dso *machine__get_kernel(struct machine *machine)
@@ -1300,9 +1137,10 @@ static int machine__map_x86_64_entry_trampolines_cb(struct map *map, void *data)
 		return 0;
 
 	dest_map = maps__find(args->kmaps, map__pgoff(map));
-	if (dest_map != map)
+	if (RC_CHK_ACCESS(dest_map) != RC_CHK_ACCESS(map))
 		map__set_pgoff(map, map__map_ip(dest_map, map__pgoff(map)));
 
+	map__put(dest_map);
 	args->found = true;
 	return 0;
 }
@@ -1543,8 +1381,10 @@ static int maps__set_module_path(struct maps *maps, const char *path, struct kmo
 		return 0;
 
 	long_name = strdup(path);
-	if (long_name == NULL)
+	if (long_name == NULL) {
+		map__put(map);
 		return -ENOMEM;
+	}
 
 	dso = map__dso(map);
 	dso__set_long_name(dso, long_name, true);
@@ -1558,7 +1398,7 @@ static int maps__set_module_path(struct maps *maps, const char *path, struct kmo
 		dso->symtab_type++;
 		dso->comp = m->comp;
 	}
-
+	map__put(map);
 	return 0;
 }
 
@@ -1765,8 +1605,10 @@ int machine__create_kernel_maps(struct machine *machine)
 		struct map *next = maps__find_next_entry(machine__kernel_maps(machine),
 							 machine__kernel_map(machine));
 
-		if (next)
+		if (next) {
 			machine__set_kernel_mmap(machine, start, map__start(next));
+			map__put(next);
+		}
 	}
 
 out_put:
@@ -2060,36 +1902,9 @@ out_problem:
 	return 0;
 }
 
-static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd,
-				     struct thread *th, bool lock)
-{
-	struct threads *threads = machine__threads(machine, thread__tid(th));
-
-	if (!nd)
-		nd = thread_rb_node__find(th, &threads->entries.rb_root);
-
-	if (threads->last_match && RC_CHK_EQUAL(threads->last_match, th))
-		threads__set_last_match(threads, NULL);
-
-	if (lock)
-		down_write(&threads->lock);
-
-	BUG_ON(refcount_read(thread__refcnt(th)) == 0);
-
-	thread__put(nd->thread);
-	rb_erase_cached(&nd->rb_node, &threads->entries);
-	RB_CLEAR_NODE(&nd->rb_node);
-	--threads->nr;
-
-	free(nd);
-
-	if (lock)
-		up_write(&threads->lock);
-}
-
 void machine__remove_thread(struct machine *machine, struct thread *th)
 {
-	return __machine__remove_thread(machine, NULL, th, true);
+	return threads__remove(&machine->threads, th);
 }
 
 int machine__process_fork_event(struct machine *machine, union perf_event *event,
@@ -3223,23 +3038,7 @@ int machine__for_each_thread(struct machine *machine,
 			     int (*fn)(struct thread *thread, void *p),
 			     void *priv)
 {
-	struct threads *threads;
-	struct rb_node *nd;
-	int rc = 0;
-	int i;
-
-	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
-		threads = &machine->threads[i];
-		for (nd = rb_first_cached(&threads->entries); nd;
-		     nd = rb_next(nd)) {
-			struct thread_rb_node *trb = rb_entry(nd, struct thread_rb_node, rb_node);
-
-			rc = fn(trb->thread, priv);
-			if (rc != 0)
-				return rc;
-		}
-	}
-	return rc;
+	return threads__for_each_thread(&machine->threads, fn, priv);
 }
 
 int machines__for_each_thread(struct machines *machines,
@@ -3263,6 +3062,36 @@ int machines__for_each_thread(struct machines *machines,
 	return rc;
 }
 
+
+static int thread_list_cb(struct thread *thread, void *data)
+{
+	struct list_head *list = data;
+	struct thread_list *entry = malloc(sizeof(*entry));
+
+	if (!entry)
+		return -ENOMEM;
+
+	entry->thread = thread__get(thread);
+	list_add_tail(&entry->list, list);
+	return 0;
+}
+
+int machine__thread_list(struct machine *machine, struct list_head *list)
+{
+	return machine__for_each_thread(machine, thread_list_cb, list);
+}
+
+void thread_list__delete(struct list_head *list)
+{
+	struct thread_list *pos, *next;
+
+	list_for_each_entry_safe(pos, next, list, list) {
+		thread__zput(pos->thread);
+		list_del(&pos->list);
+		free(pos);
+	}
+}
+
 pid_t machine__get_current_tid(struct machine *machine, int cpu)
 {
 	if (cpu < 0 || (size_t)cpu >= machine->current_tid_sz)
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 1279acda6a8a..e28c787616fe 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -7,6 +7,7 @@
 #include "maps.h"
 #include "dsos.h"
 #include "rwsem.h"
+#include "threads.h"
 
 struct addr_location;
 struct branch_stack;
@@ -28,16 +29,6 @@ extern const char *ref_reloc_sym_names[];
 
 struct vdso_info;
 
-#define THREADS__TABLE_BITS	8
-#define THREADS__TABLE_SIZE	(1 << THREADS__TABLE_BITS)
-
-struct threads {
-	struct rb_root_cached  entries;
-	struct rw_semaphore    lock;
-	unsigned int	       nr;
-	struct thread	       *last_match;
-};
-
 struct machine {
 	struct rb_node	  rb_node;
 	pid_t		  pid;
@@ -48,7 +39,7 @@ struct machine {
 	char		  *root_dir;
 	char		  *mmap_name;
 	char		  *kallsyms_filename;
-	struct threads    threads[THREADS__TABLE_SIZE];
+	struct threads    threads;
 	struct vdso_info  *vdso_info;
 	struct perf_env   *env;
 	struct dsos	  dsos;
@@ -69,12 +60,6 @@ struct machine {
 	bool		  trampolines_mapped;
 };
 
-static inline struct threads *machine__threads(struct machine *machine, pid_t tid)
-{
-	/* Cast it to handle tid == -1 */
-	return &machine->threads[(unsigned int)tid % THREADS__TABLE_SIZE];
-}
-
 /*
  * The main kernel (vmlinux) map
  */
@@ -220,7 +205,6 @@ bool machine__is(struct machine *machine, const char *arch);
 bool machine__normalized_is(struct machine *machine, const char *arch);
 int machine__nr_cpus_avail(struct machine *machine);
 
-struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
 struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
 
 struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id);
@@ -280,6 +264,16 @@ int machines__for_each_thread(struct machines *machines,
 			      int (*fn)(struct thread *thread, void *p),
 			      void *priv);
 
+struct thread_list {
+	struct list_head	 list;
+	struct thread		*thread;
+};
+
+/* Make a list of struct thread_list based on threads in the machine. */
+int machine__thread_list(struct machine *machine, struct list_head *list);
+/* Free up the nodes within the thread_list list. */
+void thread_list__delete(struct list_head *list);
+
 pid_t machine__get_current_tid(struct machine *machine, int cpu);
 int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
 			     pid_t tid);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 54c67cb7ecef..14a5ea70d81e 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -168,6 +168,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
 		if (dso == NULL)
 			goto out_delete;
 
+		assert(!dso->kernel);
 		map__init(result, start, start + len, pgoff, dso);
 
 		if (anon || no_dso) {
@@ -552,10 +553,6 @@ u64 map__rip_2objdump(struct map *map, u64 rip)
 	if (dso->rel)
 		return rip - map__pgoff(map);
 
-	/*
-	 * kernel modules also have DSO_TYPE_USER in dso->kernel,
-	 * but all kernel modules are ET_REL, so won't get here.
-	 */
 	if (dso->kernel == DSO_SPACE__USER)
 		return rip + dso->text_offset;
 
@@ -584,10 +581,6 @@ u64 map__objdump_2mem(struct map *map, u64 ip)
 	if (dso->rel)
 		return map__unmap_ip(map, ip + map__pgoff(map));
 
-	/*
-	 * kernel modules also have DSO_TYPE_USER in dso->kernel,
-	 * but all kernel modules are ET_REL, so won't get here.
-	 */
 	if (dso->kernel == DSO_SPACE__USER)
 		return map__unmap_ip(map, ip - dso->text_offset);
 
diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index 0334fc18d9c6..ce13145a9f8e 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -6,81 +6,285 @@
 #include "dso.h"
 #include "map.h"
 #include "maps.h"
+#include "rwsem.h"
 #include "thread.h"
 #include "ui/ui.h"
 #include "unwind.h"
+#include <internal/rc_check.h>
 
-struct map_rb_node {
-	struct rb_node rb_node;
-	struct map *map;
+/*
+ * Locking/sorting note:
+ *
+ * Sorting is done with the write lock, iteration and binary searching happens
+ * under the read lock requiring being sorted. There is a race between sorting
+ * releasing the write lock and acquiring the read lock for iteration/searching
+ * where another thread could insert and break the sorting of the maps. In
+ * practice inserting maps should be rare meaning that the race shouldn't lead
+ * to live lock. Removal of maps doesn't break being sorted.
+ */
+
+DECLARE_RC_STRUCT(maps) {
+	struct rw_semaphore lock;
+	/**
+	 * @maps_by_address: array of maps sorted by their starting address if
+	 * maps_by_address_sorted is true.
+	 */
+	struct map	 **maps_by_address;
+	/**
+	 * @maps_by_name: optional array of maps sorted by their dso name if
+	 * maps_by_name_sorted is true.
+	 */
+	struct map	 **maps_by_name;
+	struct machine	 *machine;
+#ifdef HAVE_LIBUNWIND_SUPPORT
+	void		*addr_space;
+	const struct unwind_libunwind_ops *unwind_libunwind_ops;
+#endif
+	refcount_t	 refcnt;
+	/**
+	 * @nr_maps: number of maps_by_address, and possibly maps_by_name,
+	 * entries that contain maps.
+	 */
+	unsigned int	 nr_maps;
+	/**
+	 * @nr_maps_allocated: number of entries in maps_by_address and possibly
+	 * maps_by_name.
+	 */
+	unsigned int	 nr_maps_allocated;
+	/**
+	 * @last_search_by_name_idx: cache of last found by name entry's index
+	 * as frequent searches for the same dso name are common.
+	 */
+	unsigned int	 last_search_by_name_idx;
+	/** @maps_by_address_sorted: is maps_by_address sorted. */
+	bool		 maps_by_address_sorted;
+	/** @maps_by_name_sorted: is maps_by_name sorted. */
+	bool		 maps_by_name_sorted;
+	/** @ends_broken: does the map contain a map where end values are unset/unsorted? */
+	bool		 ends_broken;
 };
 
-#define maps__for_each_entry(maps, map) \
-	for (map = maps__first(maps); map; map = map_rb_node__next(map))
+static void check_invariants(const struct maps *maps __maybe_unused)
+{
+#ifndef NDEBUG
+	assert(RC_CHK_ACCESS(maps)->nr_maps <= RC_CHK_ACCESS(maps)->nr_maps_allocated);
+	for (unsigned int i = 0; i < RC_CHK_ACCESS(maps)->nr_maps; i++) {
+		struct map *map = RC_CHK_ACCESS(maps)->maps_by_address[i];
+
+		/* Check map is well-formed. */
+		assert(map__end(map) == 0 || map__start(map) <= map__end(map));
+		/* Expect at least 1 reference count. */
+		assert(refcount_read(map__refcnt(map)) > 0);
+
+		if (map__dso(map) && map__dso(map)->kernel)
+			assert(RC_CHK_EQUAL(map__kmap(map)->kmaps, maps));
+
+		if (i > 0) {
+			struct map *prev = RC_CHK_ACCESS(maps)->maps_by_address[i - 1];
+
+			/* If addresses are sorted... */
+			if (RC_CHK_ACCESS(maps)->maps_by_address_sorted) {
+				/* Maps should be in start address order. */
+				assert(map__start(prev) <= map__start(map));
+				/*
+				 * If the ends of maps aren't broken (during
+				 * construction) then they should be ordered
+				 * too.
+				 */
+				if (!RC_CHK_ACCESS(maps)->ends_broken) {
+					assert(map__end(prev) <= map__end(map));
+					assert(map__end(prev) <= map__start(map) ||
+					       map__start(prev) == map__start(map));
+				}
+			}
+		}
+	}
+	if (RC_CHK_ACCESS(maps)->maps_by_name) {
+		for (unsigned int i = 0; i < RC_CHK_ACCESS(maps)->nr_maps; i++) {
+			struct map *map = RC_CHK_ACCESS(maps)->maps_by_name[i];
 
-#define maps__for_each_entry_safe(maps, map, next) \
-	for (map = maps__first(maps), next = map_rb_node__next(map); map; \
-	     map = next, next = map_rb_node__next(map))
+			/*
+			 * Maps by name maps should be in maps_by_address, so
+			 * the reference count should be higher.
+			 */
+			assert(refcount_read(map__refcnt(map)) > 1);
+		}
+	}
+#endif
+}
 
-static struct rb_root *maps__entries(struct maps *maps)
+static struct map **maps__maps_by_address(const struct maps *maps)
 {
-	return &RC_CHK_ACCESS(maps)->entries;
+	return RC_CHK_ACCESS(maps)->maps_by_address;
 }
 
-static struct rw_semaphore *maps__lock(struct maps *maps)
+static void maps__set_maps_by_address(struct maps *maps, struct map **new)
 {
-	return &RC_CHK_ACCESS(maps)->lock;
+	RC_CHK_ACCESS(maps)->maps_by_address = new;
+
+}
+
+static struct map ***maps__maps_by_name_addr(struct maps *maps)
+{
+	return &RC_CHK_ACCESS(maps)->maps_by_name;
+}
+
+static void maps__set_nr_maps_allocated(struct maps *maps, unsigned int nr_maps_allocated)
+{
+	RC_CHK_ACCESS(maps)->nr_maps_allocated = nr_maps_allocated;
+}
+
+static void maps__set_nr_maps(struct maps *maps, unsigned int nr_maps)
+{
+	RC_CHK_ACCESS(maps)->nr_maps = nr_maps;
 }
 
-static struct map **maps__maps_by_name(struct maps *maps)
+/* Not in the header, to aid reference counting. */
+static struct map **maps__maps_by_name(const struct maps *maps)
 {
 	return RC_CHK_ACCESS(maps)->maps_by_name;
+
 }
 
-static struct map_rb_node *maps__first(struct maps *maps)
+static void maps__set_maps_by_name(struct maps *maps, struct map **new)
 {
-	struct rb_node *first = rb_first(maps__entries(maps));
+	RC_CHK_ACCESS(maps)->maps_by_name = new;
 
-	if (first)
-		return rb_entry(first, struct map_rb_node, rb_node);
-	return NULL;
 }
 
-static struct map_rb_node *map_rb_node__next(struct map_rb_node *node)
+static bool maps__maps_by_address_sorted(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->maps_by_address_sorted;
+}
+
+static void maps__set_maps_by_address_sorted(struct maps *maps, bool value)
 {
-	struct rb_node *next;
+	RC_CHK_ACCESS(maps)->maps_by_address_sorted = value;
+}
 
-	if (!node)
-		return NULL;
+static bool maps__maps_by_name_sorted(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->maps_by_name_sorted;
+}
 
-	next = rb_next(&node->rb_node);
+static void maps__set_maps_by_name_sorted(struct maps *maps, bool value)
+{
+	RC_CHK_ACCESS(maps)->maps_by_name_sorted = value;
+}
 
-	if (!next)
-		return NULL;
+struct machine *maps__machine(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->machine;
+}
 
-	return rb_entry(next, struct map_rb_node, rb_node);
+unsigned int maps__nr_maps(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->nr_maps;
 }
 
-static struct map_rb_node *maps__find_node(struct maps *maps, struct map *map)
+refcount_t *maps__refcnt(struct maps *maps)
 {
-	struct map_rb_node *rb_node;
+	return &RC_CHK_ACCESS(maps)->refcnt;
+}
 
-	maps__for_each_entry(maps, rb_node) {
-		if (rb_node->RC_CHK_ACCESS(map) == RC_CHK_ACCESS(map))
-			return rb_node;
-	}
-	return NULL;
+#ifdef HAVE_LIBUNWIND_SUPPORT
+void *maps__addr_space(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->addr_space;
+}
+
+void maps__set_addr_space(struct maps *maps, void *addr_space)
+{
+	RC_CHK_ACCESS(maps)->addr_space = addr_space;
+}
+
+const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->unwind_libunwind_ops;
+}
+
+void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libunwind_ops *ops)
+{
+	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = ops;
+}
+#endif
+
+static struct rw_semaphore *maps__lock(struct maps *maps)
+{
+	/*
+	 * When the lock is acquired or released the maps invariants should
+	 * hold.
+	 */
+	check_invariants(maps);
+	return &RC_CHK_ACCESS(maps)->lock;
 }
 
 static void maps__init(struct maps *maps, struct machine *machine)
 {
-	refcount_set(maps__refcnt(maps), 1);
 	init_rwsem(maps__lock(maps));
-	RC_CHK_ACCESS(maps)->entries = RB_ROOT;
+	RC_CHK_ACCESS(maps)->maps_by_address = NULL;
+	RC_CHK_ACCESS(maps)->maps_by_name = NULL;
 	RC_CHK_ACCESS(maps)->machine = machine;
-	RC_CHK_ACCESS(maps)->last_search_by_name = NULL;
+#ifdef HAVE_LIBUNWIND_SUPPORT
+	RC_CHK_ACCESS(maps)->addr_space = NULL;
+	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = NULL;
+#endif
+	refcount_set(maps__refcnt(maps), 1);
 	RC_CHK_ACCESS(maps)->nr_maps = 0;
-	RC_CHK_ACCESS(maps)->maps_by_name = NULL;
+	RC_CHK_ACCESS(maps)->nr_maps_allocated = 0;
+	RC_CHK_ACCESS(maps)->last_search_by_name_idx = 0;
+	RC_CHK_ACCESS(maps)->maps_by_address_sorted = true;
+	RC_CHK_ACCESS(maps)->maps_by_name_sorted = false;
+}
+
+static void maps__exit(struct maps *maps)
+{
+	struct map **maps_by_address = maps__maps_by_address(maps);
+	struct map **maps_by_name = maps__maps_by_name(maps);
+
+	for (unsigned int i = 0; i < maps__nr_maps(maps); i++) {
+		map__zput(maps_by_address[i]);
+		if (maps_by_name)
+			map__zput(maps_by_name[i]);
+	}
+	zfree(&maps_by_address);
+	zfree(&maps_by_name);
+	unwind__finish_access(maps);
+}
+
+struct maps *maps__new(struct machine *machine)
+{
+	struct maps *result;
+	RC_STRUCT(maps) *maps = zalloc(sizeof(*maps));
+
+	if (ADD_RC_CHK(result, maps))
+		maps__init(result, machine);
+
+	return result;
+}
+
+static void maps__delete(struct maps *maps)
+{
+	maps__exit(maps);
+	RC_CHK_FREE(maps);
+}
+
+struct maps *maps__get(struct maps *maps)
+{
+	struct maps *result;
+
+	if (RC_CHK_GET(result, maps))
+		refcount_inc(maps__refcnt(maps));
+
+	return result;
+}
+
+void maps__put(struct maps *maps)
+{
+	if (maps && refcount_dec_and_test(maps__refcnt(maps)))
+		maps__delete(maps);
+	else
+		RC_CHK_PUT(maps);
 }
 
 static void __maps__free_maps_by_name(struct maps *maps)
@@ -92,219 +296,330 @@ static void __maps__free_maps_by_name(struct maps *maps)
 		map__put(maps__maps_by_name(maps)[i]);
 
 	zfree(&RC_CHK_ACCESS(maps)->maps_by_name);
-	RC_CHK_ACCESS(maps)->nr_maps_allocated = 0;
 }
 
-static int __maps__insert(struct maps *maps, struct map *map)
+static int map__start_cmp(const void *a, const void *b)
 {
-	struct rb_node **p = &maps__entries(maps)->rb_node;
-	struct rb_node *parent = NULL;
-	const u64 ip = map__start(map);
-	struct map_rb_node *m, *new_rb_node;
-
-	new_rb_node = malloc(sizeof(*new_rb_node));
-	if (!new_rb_node)
-		return -ENOMEM;
-
-	RB_CLEAR_NODE(&new_rb_node->rb_node);
-	new_rb_node->map = map__get(map);
-
-	while (*p != NULL) {
-		parent = *p;
-		m = rb_entry(parent, struct map_rb_node, rb_node);
-		if (ip < map__start(m->map))
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
+	const struct map *map_a = *(const struct map * const *)a;
+	const struct map *map_b = *(const struct map * const *)b;
+	u64 map_a_start = map__start(map_a);
+	u64 map_b_start = map__start(map_b);
+
+	if (map_a_start == map_b_start) {
+		u64 map_a_end = map__end(map_a);
+		u64 map_b_end = map__end(map_b);
+
+		if  (map_a_end == map_b_end) {
+			/* Ensure maps with the same addresses have a fixed order. */
+			if (RC_CHK_ACCESS(map_a) == RC_CHK_ACCESS(map_b))
+				return 0;
+			return (intptr_t)RC_CHK_ACCESS(map_a) > (intptr_t)RC_CHK_ACCESS(map_b)
+				? 1 : -1;
+		}
+		return map_a_end > map_b_end ? 1 : -1;
 	}
-
-	rb_link_node(&new_rb_node->rb_node, parent, p);
-	rb_insert_color(&new_rb_node->rb_node, maps__entries(maps));
-	return 0;
+	return map_a_start > map_b_start ? 1 : -1;
 }
 
-int maps__insert(struct maps *maps, struct map *map)
+static void __maps__sort_by_address(struct maps *maps)
 {
-	int err;
-	const struct dso *dso = map__dso(map);
+	if (maps__maps_by_address_sorted(maps))
+		return;
+
+	qsort(maps__maps_by_address(maps),
+		maps__nr_maps(maps),
+		sizeof(struct map *),
+		map__start_cmp);
+	maps__set_maps_by_address_sorted(maps, true);
+}
 
+static void maps__sort_by_address(struct maps *maps)
+{
 	down_write(maps__lock(maps));
-	err = __maps__insert(maps, map);
-	if (err)
-		goto out;
-
-	++RC_CHK_ACCESS(maps)->nr_maps;
+	__maps__sort_by_address(maps);
+	up_write(maps__lock(maps));
+}
 
-	if (dso && dso->kernel) {
-		struct kmap *kmap = map__kmap(map);
+static int map__strcmp(const void *a, const void *b)
+{
+	const struct map *map_a = *(const struct map * const *)a;
+	const struct map *map_b = *(const struct map * const *)b;
+	const struct dso *dso_a = map__dso(map_a);
+	const struct dso *dso_b = map__dso(map_b);
+	int ret = strcmp(dso_a->short_name, dso_b->short_name);
 
-		if (kmap)
-			kmap->kmaps = maps;
-		else
-			pr_err("Internal error: kernel dso with non kernel map\n");
+	if (ret == 0 && RC_CHK_ACCESS(map_a) != RC_CHK_ACCESS(map_b)) {
+		/* Ensure distinct but name equal maps have an order. */
+		return map__start_cmp(a, b);
 	}
+	return ret;
+}
 
+static int maps__sort_by_name(struct maps *maps)
+{
+	int err = 0;
+	down_write(maps__lock(maps));
+	if (!maps__maps_by_name_sorted(maps)) {
+		struct map **maps_by_name = maps__maps_by_name(maps);
 
-	/*
-	 * If we already performed some search by name, then we need to add the just
-	 * inserted map and resort.
-	 */
-	if (maps__maps_by_name(maps)) {
-		if (maps__nr_maps(maps) > RC_CHK_ACCESS(maps)->nr_maps_allocated) {
-			int nr_allocate = maps__nr_maps(maps) * 2;
-			struct map **maps_by_name = realloc(maps__maps_by_name(maps),
-							    nr_allocate * sizeof(map));
-
-			if (maps_by_name == NULL) {
-				__maps__free_maps_by_name(maps);
+		if (!maps_by_name) {
+			maps_by_name = malloc(RC_CHK_ACCESS(maps)->nr_maps_allocated *
+					sizeof(*maps_by_name));
+			if (!maps_by_name)
 				err = -ENOMEM;
-				goto out;
-			}
+			else {
+				struct map **maps_by_address = maps__maps_by_address(maps);
+				unsigned int n = maps__nr_maps(maps);
 
-			RC_CHK_ACCESS(maps)->maps_by_name = maps_by_name;
-			RC_CHK_ACCESS(maps)->nr_maps_allocated = nr_allocate;
+				maps__set_maps_by_name(maps, maps_by_name);
+				for (unsigned int i = 0; i < n; i++)
+					maps_by_name[i] = map__get(maps_by_address[i]);
+			}
+		}
+		if (!err) {
+			qsort(maps_by_name,
+				maps__nr_maps(maps),
+				sizeof(struct map *),
+				map__strcmp);
+			maps__set_maps_by_name_sorted(maps, true);
 		}
-		maps__maps_by_name(maps)[maps__nr_maps(maps) - 1] = map__get(map);
-		__maps__sort_by_name(maps);
 	}
- out:
 	up_write(maps__lock(maps));
 	return err;
 }
 
-static void __maps__remove(struct maps *maps, struct map_rb_node *rb_node)
+static unsigned int maps__by_address_index(const struct maps *maps, const struct map *map)
 {
-	rb_erase_init(&rb_node->rb_node, maps__entries(maps));
-	map__put(rb_node->map);
-	free(rb_node);
+	struct map **maps_by_address = maps__maps_by_address(maps);
+
+	if (maps__maps_by_address_sorted(maps)) {
+		struct map **mapp =
+			bsearch(&map, maps__maps_by_address(maps), maps__nr_maps(maps),
+				sizeof(*mapp), map__start_cmp);
+
+		if (mapp)
+			return mapp - maps_by_address;
+	} else {
+		for (unsigned int i = 0; i < maps__nr_maps(maps); i++) {
+			if (RC_CHK_ACCESS(maps_by_address[i]) == RC_CHK_ACCESS(map))
+				return i;
+		}
+	}
+	pr_err("Map missing from maps");
+	return -1;
 }
 
-void maps__remove(struct maps *maps, struct map *map)
+static unsigned int maps__by_name_index(const struct maps *maps, const struct map *map)
 {
-	struct map_rb_node *rb_node;
-
-	down_write(maps__lock(maps));
-	if (RC_CHK_ACCESS(maps)->last_search_by_name == map)
-		RC_CHK_ACCESS(maps)->last_search_by_name = NULL;
-
-	rb_node = maps__find_node(maps, map);
-	assert(rb_node->RC_CHK_ACCESS(map) == RC_CHK_ACCESS(map));
-	__maps__remove(maps, rb_node);
-	if (maps__maps_by_name(maps))
-		__maps__free_maps_by_name(maps);
-	--RC_CHK_ACCESS(maps)->nr_maps;
-	up_write(maps__lock(maps));
+	struct map **maps_by_name = maps__maps_by_name(maps);
+
+	if (maps__maps_by_name_sorted(maps)) {
+		struct map **mapp =
+			bsearch(&map, maps_by_name, maps__nr_maps(maps),
+				sizeof(*mapp), map__strcmp);
+
+		if (mapp)
+			return mapp - maps_by_name;
+	} else {
+		for (unsigned int i = 0; i < maps__nr_maps(maps); i++) {
+			if (RC_CHK_ACCESS(maps_by_name[i]) == RC_CHK_ACCESS(map))
+				return i;
+		}
+	}
+	pr_err("Map missing from maps");
+	return -1;
 }
 
-static void __maps__purge(struct maps *maps)
+static int __maps__insert(struct maps *maps, struct map *new)
 {
-	struct map_rb_node *pos, *next;
+	struct map **maps_by_address = maps__maps_by_address(maps);
+	struct map **maps_by_name = maps__maps_by_name(maps);
+	const struct dso *dso = map__dso(new);
+	unsigned int nr_maps = maps__nr_maps(maps);
+	unsigned int nr_allocate = RC_CHK_ACCESS(maps)->nr_maps_allocated;
+
+	if (nr_maps + 1 > nr_allocate) {
+		nr_allocate = !nr_allocate ? 32 : nr_allocate * 2;
+
+		maps_by_address = realloc(maps_by_address, nr_allocate * sizeof(new));
+		if (!maps_by_address)
+			return -ENOMEM;
+
+		maps__set_maps_by_address(maps, maps_by_address);
+		if (maps_by_name) {
+			maps_by_name = realloc(maps_by_name, nr_allocate * sizeof(new));
+			if (!maps_by_name) {
+				/*
+				 * If by name fails, just disable by name and it will
+				 * recompute next time it is required.
+				 */
+				__maps__free_maps_by_name(maps);
+			}
+			maps__set_maps_by_name(maps, maps_by_name);
+		}
+		RC_CHK_ACCESS(maps)->nr_maps_allocated = nr_allocate;
+	}
+	/* Insert the value at the end. */
+	maps_by_address[nr_maps] = map__get(new);
+	if (maps_by_name)
+		maps_by_name[nr_maps] = map__get(new);
 
-	if (maps__maps_by_name(maps))
-		__maps__free_maps_by_name(maps);
+	nr_maps++;
+	RC_CHK_ACCESS(maps)->nr_maps = nr_maps;
 
-	maps__for_each_entry_safe(maps, pos, next) {
-		rb_erase_init(&pos->rb_node,  maps__entries(maps));
-		map__put(pos->map);
-		free(pos);
+	/*
+	 * Recompute if things are sorted. If things are inserted in a sorted
+	 * manner, for example by processing /proc/pid/maps, then no
+	 * sorting/resorting will be necessary.
+	 */
+	if (nr_maps == 1) {
+		/* If there's just 1 entry then maps are sorted. */
+		maps__set_maps_by_address_sorted(maps, true);
+		maps__set_maps_by_name_sorted(maps, maps_by_name != NULL);
+	} else {
+		/* Sorted if maps were already sorted and this map starts after the last one. */
+		maps__set_maps_by_address_sorted(maps,
+			maps__maps_by_address_sorted(maps) &&
+			map__end(maps_by_address[nr_maps - 2]) <= map__start(new));
+		maps__set_maps_by_name_sorted(maps, false);
 	}
+	if (map__end(new) < map__start(new))
+		RC_CHK_ACCESS(maps)->ends_broken = true;
+	if (dso && dso->kernel) {
+		struct kmap *kmap = map__kmap(new);
+
+		if (kmap)
+			kmap->kmaps = maps;
+		else
+			pr_err("Internal error: kernel dso with non kernel map\n");
+	}
+	return 0;
 }
 
-static void maps__exit(struct maps *maps)
+int maps__insert(struct maps *maps, struct map *map)
 {
+	int ret;
+
 	down_write(maps__lock(maps));
-	__maps__purge(maps);
+	ret = __maps__insert(maps, map);
 	up_write(maps__lock(maps));
+	return ret;
 }
 
-bool maps__empty(struct maps *maps)
-{
-	return !maps__first(maps);
-}
-
-struct maps *maps__new(struct machine *machine)
+static void __maps__remove(struct maps *maps, struct map *map)
 {
-	struct maps *result;
-	RC_STRUCT(maps) *maps = zalloc(sizeof(*maps));
-
-	if (ADD_RC_CHK(result, maps))
-		maps__init(result, machine);
+	struct map **maps_by_address = maps__maps_by_address(maps);
+	struct map **maps_by_name = maps__maps_by_name(maps);
+	unsigned int nr_maps = maps__nr_maps(maps);
+	unsigned int address_idx;
+
+	/* Slide later mappings over the one to remove */
+	address_idx = maps__by_address_index(maps, map);
+	map__put(maps_by_address[address_idx]);
+	memmove(&maps_by_address[address_idx],
+		&maps_by_address[address_idx + 1],
+		(nr_maps - address_idx - 1) * sizeof(*maps_by_address));
+
+	if (maps_by_name) {
+		unsigned int name_idx = maps__by_name_index(maps, map);
+
+		map__put(maps_by_name[name_idx]);
+		memmove(&maps_by_name[name_idx],
+			&maps_by_name[name_idx + 1],
+			(nr_maps - name_idx - 1) *  sizeof(*maps_by_name));
+	}
 
-	return result;
+	--RC_CHK_ACCESS(maps)->nr_maps;
 }
 
-static void maps__delete(struct maps *maps)
+void maps__remove(struct maps *maps, struct map *map)
 {
-	maps__exit(maps);
-	unwind__finish_access(maps);
-	RC_CHK_FREE(maps);
+	down_write(maps__lock(maps));
+	__maps__remove(maps, map);
+	up_write(maps__lock(maps));
 }
 
-struct maps *maps__get(struct maps *maps)
+bool maps__empty(struct maps *maps)
 {
-	struct maps *result;
+	bool res;
 
-	if (RC_CHK_GET(result, maps))
-		refcount_inc(maps__refcnt(maps));
+	down_read(maps__lock(maps));
+	res = maps__nr_maps(maps) == 0;
+	up_read(maps__lock(maps));
 
-	return result;
+	return res;
 }
 
-void maps__put(struct maps *maps)
+bool maps__equal(struct maps *a, struct maps *b)
 {
-	if (maps && refcount_dec_and_test(maps__refcnt(maps)))
-		maps__delete(maps);
-	else
-		RC_CHK_PUT(maps);
+	return RC_CHK_EQUAL(a, b);
 }
 
 int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data), void *data)
 {
-	struct map_rb_node *pos;
+	bool done = false;
 	int ret = 0;
 
-	down_read(maps__lock(maps));
-	maps__for_each_entry(maps, pos)	{
-		ret = cb(pos->map, data);
-		if (ret)
-			break;
+	/* See locking/sorting note. */
+	while (!done) {
+		down_read(maps__lock(maps));
+		if (maps__maps_by_address_sorted(maps)) {
+			/*
+			 * maps__for_each_map callbacks may buggily/unsafely
+			 * insert into maps_by_address. Deliberately reload
+			 * maps__nr_maps and maps_by_address on each iteration
+			 * to avoid using memory freed by maps__insert growing
+			 * the array - this may cause maps to be skipped or
+			 * repeated.
+			 */
+			for (unsigned int i = 0; i < maps__nr_maps(maps); i++) {
+				struct map **maps_by_address = maps__maps_by_address(maps);
+				struct map *map = maps_by_address[i];
+
+				ret = cb(map, data);
+				if (ret)
+					break;
+			}
+			done = true;
+		}
+		up_read(maps__lock(maps));
+		if (!done)
+			maps__sort_by_address(maps);
 	}
-	up_read(maps__lock(maps));
 	return ret;
 }
 
 void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data)
 {
-	struct map_rb_node *pos, *next;
-	unsigned int start_nr_maps;
+	struct map **maps_by_address;
 
 	down_write(maps__lock(maps));
 
-	start_nr_maps = maps__nr_maps(maps);
-	maps__for_each_entry_safe(maps, pos, next)	{
-		if (cb(pos->map, data)) {
-			__maps__remove(maps, pos);
-			--RC_CHK_ACCESS(maps)->nr_maps;
-		}
+	maps_by_address = maps__maps_by_address(maps);
+	for (unsigned int i = 0; i < maps__nr_maps(maps);) {
+		if (cb(maps_by_address[i], data))
+			__maps__remove(maps, maps_by_address[i]);
+		else
+			i++;
 	}
-	if (maps__maps_by_name(maps) && start_nr_maps != maps__nr_maps(maps))
-		__maps__free_maps_by_name(maps);
-
 	up_write(maps__lock(maps));
 }
 
 struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp)
 {
 	struct map *map = maps__find(maps, addr);
+	struct symbol *result = NULL;
 
 	/* Ensure map is loaded before using map->map_ip */
-	if (map != NULL && map__load(map) >= 0) {
-		if (mapp != NULL)
-			*mapp = map;
-		return map__find_symbol(map, map__map_ip(map, addr));
-	}
+	if (map != NULL && map__load(map) >= 0)
+		result = map__find_symbol(map, map__map_ip(map, addr));
+
+	if (mapp)
+		*mapp = map;
+	else
+		map__put(map);
 
-	return NULL;
+	return result;
 }
 
 struct maps__find_symbol_by_name_args {
@@ -393,24 +708,28 @@ size_t maps__fprintf(struct maps *maps, FILE *fp)
  * Find first map where end > map->start.
  * Same as find_vma() in kernel.
  */
-static struct rb_node *first_ending_after(struct maps *maps, const struct map *map)
+static unsigned int first_ending_after(struct maps *maps, const struct map *map)
 {
-	struct rb_root *root;
-	struct rb_node *next, *first;
+	struct map **maps_by_address = maps__maps_by_address(maps);
+	int low = 0, high = (int)maps__nr_maps(maps) - 1, first = high + 1;
+
+	assert(maps__maps_by_address_sorted(maps));
+	if (low <= high && map__end(maps_by_address[0]) > map__start(map))
+		return 0;
 
-	root = maps__entries(maps);
-	next = root->rb_node;
-	first = NULL;
-	while (next) {
-		struct map_rb_node *pos = rb_entry(next, struct map_rb_node, rb_node);
+	while (low <= high) {
+		int mid = (low + high) / 2;
+		struct map *pos = maps_by_address[mid];
 
-		if (map__end(pos->map) > map__start(map)) {
-			first = next;
-			if (map__start(pos->map) <= map__start(map))
+		if (map__end(pos) > map__start(map)) {
+			first = mid;
+			if (map__start(pos) <= map__start(map)) {
+				/* Entry overlaps map. */
 				break;
-			next = next->rb_left;
+			}
+			high = mid - 1;
 		} else
-			next = next->rb_right;
+			low = mid + 1;
 	}
 	return first;
 }
@@ -419,171 +738,249 @@ static struct rb_node *first_ending_after(struct maps *maps, const struct map *m
  * Adds new to maps, if new overlaps existing entries then the existing maps are
  * adjusted or removed so that new fits without overlapping any entries.
  */
-int maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
+static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 {
-
-	struct rb_node *next;
+	struct map **maps_by_address;
 	int err = 0;
 	FILE *fp = debug_file();
 
-	down_write(maps__lock(maps));
+sort_again:
+	if (!maps__maps_by_address_sorted(maps))
+		__maps__sort_by_address(maps);
 
-	next = first_ending_after(maps, new);
-	while (next && !err) {
-		struct map_rb_node *pos = rb_entry(next, struct map_rb_node, rb_node);
-		next = rb_next(&pos->rb_node);
+	maps_by_address = maps__maps_by_address(maps);
+	/*
+	 * Iterate through entries where the end of the existing entry is
+	 * greater-than the new map's start.
+	 */
+	for (unsigned int i = first_ending_after(maps, new); i < maps__nr_maps(maps); ) {
+		struct map *pos = maps_by_address[i];
+		struct map *before = NULL, *after = NULL;
 
 		/*
 		 * Stop if current map starts after map->end.
 		 * Maps are ordered by start: next will not overlap for sure.
 		 */
-		if (map__start(pos->map) >= map__end(new))
+		if (map__start(pos) >= map__end(new))
 			break;
 
-		if (verbose >= 2) {
-
-			if (use_browser) {
-				pr_debug("overlapping maps in %s (disable tui for more info)\n",
-					 map__dso(new)->name);
-			} else {
-				pr_debug("overlapping maps:\n");
-				map__fprintf(new, fp);
-				map__fprintf(pos->map, fp);
-			}
+		if (use_browser) {
+			pr_debug("overlapping maps in %s (disable tui for more info)\n",
+				map__dso(new)->name);
+		} else if (verbose >= 2) {
+			pr_debug("overlapping maps:\n");
+			map__fprintf(new, fp);
+			map__fprintf(pos, fp);
 		}
 
-		rb_erase_init(&pos->rb_node, maps__entries(maps));
 		/*
 		 * Now check if we need to create new maps for areas not
 		 * overlapped by the new map:
 		 */
-		if (map__start(new) > map__start(pos->map)) {
-			struct map *before = map__clone(pos->map);
+		if (map__start(new) > map__start(pos)) {
+			/* Map starts within existing map. Need to shorten the existing map. */
+			before = map__clone(pos);
 
 			if (before == NULL) {
 				err = -ENOMEM;
-				goto put_map;
+				goto out_err;
 			}
-
 			map__set_end(before, map__start(new));
-			err = __maps__insert(maps, before);
-			if (err) {
-				map__put(before);
-				goto put_map;
-			}
 
 			if (verbose >= 2 && !use_browser)
 				map__fprintf(before, fp);
-			map__put(before);
 		}
-
-		if (map__end(new) < map__end(pos->map)) {
-			struct map *after = map__clone(pos->map);
+		if (map__end(new) < map__end(pos)) {
+			/* The new map isn't as long as the existing map. */
+			after = map__clone(pos);
 
 			if (after == NULL) {
+				map__zput(before);
 				err = -ENOMEM;
-				goto put_map;
+				goto out_err;
 			}
 
 			map__set_start(after, map__end(new));
-			map__add_pgoff(after, map__end(new) - map__start(pos->map));
-			assert(map__map_ip(pos->map, map__end(new)) ==
-				map__map_ip(after, map__end(new)));
-			err = __maps__insert(maps, after);
-			if (err) {
-				map__put(after);
-				goto put_map;
-			}
+			map__add_pgoff(after, map__end(new) - map__start(pos));
+			assert(map__map_ip(pos, map__end(new)) ==
+			       map__map_ip(after, map__end(new)));
+
 			if (verbose >= 2 && !use_browser)
 				map__fprintf(after, fp);
-			map__put(after);
 		}
-put_map:
-		map__put(pos->map);
-		free(pos);
+		/*
+		 * If adding one entry, for `before` or `after`, we can replace
+		 * the existing entry. If both `before` and `after` are
+		 * necessary than an insert is needed. If the existing entry
+		 * entirely overlaps the existing entry it can just be removed.
+		 */
+		if (before) {
+			map__put(maps_by_address[i]);
+			maps_by_address[i] = before;
+			/* Maps are still ordered, go to next one. */
+			i++;
+			if (after) {
+				__maps__insert(maps, after);
+				map__put(after);
+				if (!maps__maps_by_address_sorted(maps)) {
+					/*
+					 * Sorting broken so invariants don't
+					 * hold, sort and go again.
+					 */
+					goto sort_again;
+				}
+				/*
+				 * Maps are still ordered, skip after and go to
+				 * next one (terminate loop).
+				 */
+				i++;
+			}
+		} else if (after) {
+			map__put(maps_by_address[i]);
+			maps_by_address[i] = after;
+			/* Maps are ordered, go to next one. */
+			i++;
+		} else {
+			__maps__remove(maps, pos);
+			/*
+			 * Maps are ordered but no need to increase `i` as the
+			 * later maps were moved down.
+			 */
+		}
+		check_invariants(maps);
 	}
 	/* Add the map. */
-	err = __maps__insert(maps, new);
-	up_write(maps__lock(maps));
+	__maps__insert(maps, new);
+out_err:
 	return err;
 }
 
-int maps__copy_from(struct maps *maps, struct maps *parent)
+int maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 {
 	int err;
-	struct map_rb_node *rb_node;
 
+	down_write(maps__lock(maps));
+	err =  __maps__fixup_overlap_and_insert(maps, new);
+	up_write(maps__lock(maps));
+	return err;
+}
+
+int maps__copy_from(struct maps *dest, struct maps *parent)
+{
+	/* Note, if struct map were immutable then cloning could use ref counts. */
+	struct map **parent_maps_by_address;
+	int err = 0;
+	unsigned int n;
+
+	down_write(maps__lock(dest));
 	down_read(maps__lock(parent));
 
-	maps__for_each_entry(parent, rb_node) {
-		struct map *new = map__clone(rb_node->map);
+	parent_maps_by_address = maps__maps_by_address(parent);
+	n = maps__nr_maps(parent);
+	if (maps__nr_maps(dest) == 0) {
+		/* No existing mappings so just copy from parent to avoid reallocs in insert. */
+		unsigned int nr_maps_allocated = RC_CHK_ACCESS(parent)->nr_maps_allocated;
+		struct map **dest_maps_by_address =
+			malloc(nr_maps_allocated * sizeof(struct map *));
+		struct map **dest_maps_by_name = NULL;
 
-		if (new == NULL) {
+		if (!dest_maps_by_address)
 			err = -ENOMEM;
-			goto out_unlock;
+		else {
+			if (maps__maps_by_name(parent)) {
+				dest_maps_by_name =
+					malloc(nr_maps_allocated * sizeof(struct map *));
+			}
+
+			RC_CHK_ACCESS(dest)->maps_by_address = dest_maps_by_address;
+			RC_CHK_ACCESS(dest)->maps_by_name = dest_maps_by_name;
+			RC_CHK_ACCESS(dest)->nr_maps_allocated = nr_maps_allocated;
 		}
 
-		err = unwind__prepare_access(maps, new, NULL);
-		if (err)
-			goto out_unlock;
+		for (unsigned int i = 0; !err && i < n; i++) {
+			struct map *pos = parent_maps_by_address[i];
+			struct map *new = map__clone(pos);
 
-		err = maps__insert(maps, new);
-		if (err)
-			goto out_unlock;
+			if (!new)
+				err = -ENOMEM;
+			else {
+				err = unwind__prepare_access(dest, new, NULL);
+				if (!err) {
+					dest_maps_by_address[i] = new;
+					if (dest_maps_by_name)
+						dest_maps_by_name[i] = map__get(new);
+					RC_CHK_ACCESS(dest)->nr_maps = i + 1;
+				}
+			}
+			if (err)
+				map__put(new);
+		}
+		maps__set_maps_by_address_sorted(dest, maps__maps_by_address_sorted(parent));
+		if (!err) {
+			RC_CHK_ACCESS(dest)->last_search_by_name_idx =
+				RC_CHK_ACCESS(parent)->last_search_by_name_idx;
+			maps__set_maps_by_name_sorted(dest,
+						dest_maps_by_name &&
+						maps__maps_by_name_sorted(parent));
+		} else {
+			RC_CHK_ACCESS(dest)->last_search_by_name_idx = 0;
+			maps__set_maps_by_name_sorted(dest, false);
+		}
+	} else {
+		/* Unexpected copying to a maps containing entries. */
+		for (unsigned int i = 0; !err && i < n; i++) {
+			struct map *pos = parent_maps_by_address[i];
+			struct map *new = map__clone(pos);
 
-		map__put(new);
+			if (!new)
+				err = -ENOMEM;
+			else {
+				err = unwind__prepare_access(dest, new, NULL);
+				if (!err)
+					err = __maps__insert(dest, new);
+			}
+			map__put(new);
+		}
 	}
-
-	err = 0;
-out_unlock:
 	up_read(maps__lock(parent));
+	up_write(maps__lock(dest));
 	return err;
 }
 
-struct map *maps__find(struct maps *maps, u64 ip)
+static int map__addr_cmp(const void *key, const void *entry)
 {
-	struct rb_node *p;
-	struct map_rb_node *m;
-
+	const u64 ip = *(const u64 *)key;
+	const struct map *map = *(const struct map * const *)entry;
 
-	down_read(maps__lock(maps));
-
-	p = maps__entries(maps)->rb_node;
-	while (p != NULL) {
-		m = rb_entry(p, struct map_rb_node, rb_node);
-		if (ip < map__start(m->map))
-			p = p->rb_left;
-		else if (ip >= map__end(m->map))
-			p = p->rb_right;
-		else
-			goto out;
-	}
-
-	m = NULL;
-out:
-	up_read(maps__lock(maps));
-	return m ? m->map : NULL;
+	if (ip < map__start(map))
+		return -1;
+	if (ip >= map__end(map))
+		return 1;
+	return 0;
 }
 
-static int map__strcmp(const void *a, const void *b)
+struct map *maps__find(struct maps *maps, u64 ip)
 {
-	const struct map *map_a = *(const struct map **)a;
-	const struct map *map_b = *(const struct map **)b;
-	const struct dso *dso_a = map__dso(map_a);
-	const struct dso *dso_b = map__dso(map_b);
-	int ret = strcmp(dso_a->short_name, dso_b->short_name);
-
-	if (ret == 0 && map_a != map_b) {
-		/*
-		 * Ensure distinct but name equal maps have an order in part to
-		 * aid reference counting.
-		 */
-		ret = (int)map__start(map_a) - (int)map__start(map_b);
-		if (ret == 0)
-			ret = (int)((intptr_t)map_a - (intptr_t)map_b);
+	struct map *result = NULL;
+	bool done = false;
+
+	/* See locking/sorting note. */
+	while (!done) {
+		down_read(maps__lock(maps));
+		if (maps__maps_by_address_sorted(maps)) {
+			struct map **mapp =
+				bsearch(&ip, maps__maps_by_address(maps), maps__nr_maps(maps),
+					sizeof(*mapp), map__addr_cmp);
+
+			if (mapp)
+				result = map__get(*mapp);
+			done = true;
+		}
+		up_read(maps__lock(maps));
+		if (!done)
+			maps__sort_by_address(maps);
 	}
-
-	return ret;
+	return result;
 }
 
 static int map__strcmp_name(const void *name, const void *b)
@@ -593,126 +990,113 @@ static int map__strcmp_name(const void *name, const void *b)
 	return strcmp(name, dso->short_name);
 }
 
-void __maps__sort_by_name(struct maps *maps)
-{
-	qsort(maps__maps_by_name(maps), maps__nr_maps(maps), sizeof(struct map *), map__strcmp);
-}
-
-static int map__groups__sort_by_name_from_rbtree(struct maps *maps)
-{
-	struct map_rb_node *rb_node;
-	struct map **maps_by_name = realloc(maps__maps_by_name(maps),
-					    maps__nr_maps(maps) * sizeof(struct map *));
-	int i = 0;
-
-	if (maps_by_name == NULL)
-		return -1;
-
-	up_read(maps__lock(maps));
-	down_write(maps__lock(maps));
-
-	RC_CHK_ACCESS(maps)->maps_by_name = maps_by_name;
-	RC_CHK_ACCESS(maps)->nr_maps_allocated = maps__nr_maps(maps);
-
-	maps__for_each_entry(maps, rb_node)
-		maps_by_name[i++] = map__get(rb_node->map);
-
-	__maps__sort_by_name(maps);
-
-	up_write(maps__lock(maps));
-	down_read(maps__lock(maps));
-
-	return 0;
-}
-
-static struct map *__maps__find_by_name(struct maps *maps, const char *name)
+struct map *maps__find_by_name(struct maps *maps, const char *name)
 {
-	struct map **mapp;
-
-	if (maps__maps_by_name(maps) == NULL &&
-	    map__groups__sort_by_name_from_rbtree(maps))
-		return NULL;
+	struct map *result = NULL;
+	bool done = false;
 
-	mapp = bsearch(name, maps__maps_by_name(maps), maps__nr_maps(maps),
-		       sizeof(*mapp), map__strcmp_name);
-	if (mapp)
-		return *mapp;
-	return NULL;
-}
+	/* See locking/sorting note. */
+	while (!done) {
+		unsigned int i;
 
-struct map *maps__find_by_name(struct maps *maps, const char *name)
-{
-	struct map_rb_node *rb_node;
-	struct map *map;
+		down_read(maps__lock(maps));
 
-	down_read(maps__lock(maps));
+		/* First check last found entry. */
+		i = RC_CHK_ACCESS(maps)->last_search_by_name_idx;
+		if (i < maps__nr_maps(maps) && maps__maps_by_name(maps)) {
+			struct dso *dso = map__dso(maps__maps_by_name(maps)[i]);
 
+			if (dso && strcmp(dso->short_name, name) == 0) {
+				result = map__get(maps__maps_by_name(maps)[i]);
+				done = true;
+			}
+		}
 
-	if (RC_CHK_ACCESS(maps)->last_search_by_name) {
-		const struct dso *dso = map__dso(RC_CHK_ACCESS(maps)->last_search_by_name);
+		/* Second search sorted array. */
+		if (!done && maps__maps_by_name_sorted(maps)) {
+			struct map **mapp =
+				bsearch(name, maps__maps_by_name(maps), maps__nr_maps(maps),
+					sizeof(*mapp), map__strcmp_name);
 
-		if (strcmp(dso->short_name, name) == 0) {
-			map = RC_CHK_ACCESS(maps)->last_search_by_name;
-			goto out_unlock;
+			if (mapp) {
+				result = map__get(*mapp);
+				i = mapp - maps__maps_by_name(maps);
+				RC_CHK_ACCESS(maps)->last_search_by_name_idx = i;
+			}
+			done = true;
 		}
-	}
-	/*
-	 * If we have maps->maps_by_name, then the name isn't in the rbtree,
-	 * as maps->maps_by_name mirrors the rbtree when lookups by name are
-	 * made.
-	 */
-	map = __maps__find_by_name(maps, name);
-	if (map || maps__maps_by_name(maps) != NULL)
-		goto out_unlock;
-
-	/* Fallback to traversing the rbtree... */
-	maps__for_each_entry(maps, rb_node) {
-		struct dso *dso;
-
-		map = rb_node->map;
-		dso = map__dso(map);
-		if (strcmp(dso->short_name, name) == 0) {
-			RC_CHK_ACCESS(maps)->last_search_by_name = map;
-			goto out_unlock;
+		up_read(maps__lock(maps));
+		if (!done) {
+			/* Sort and retry binary search. */
+			if (maps__sort_by_name(maps)) {
+				/*
+				 * Memory allocation failed do linear search
+				 * through address sorted maps.
+				 */
+				struct map **maps_by_address;
+				unsigned int n;
+
+				down_read(maps__lock(maps));
+				maps_by_address =  maps__maps_by_address(maps);
+				n = maps__nr_maps(maps);
+				for (i = 0; i < n; i++) {
+					struct map *pos = maps_by_address[i];
+					struct dso *dso = map__dso(pos);
+
+					if (dso && strcmp(dso->short_name, name) == 0) {
+						result = map__get(pos);
+						break;
+					}
+				}
+				up_read(maps__lock(maps));
+				done = true;
+			}
 		}
 	}
-	map = NULL;
-
-out_unlock:
-	up_read(maps__lock(maps));
-	return map;
+	return result;
 }
 
 struct map *maps__find_next_entry(struct maps *maps, struct map *map)
 {
-	struct map_rb_node *rb_node = maps__find_node(maps, map);
-	struct map_rb_node *next = map_rb_node__next(rb_node);
+	unsigned int i;
+	struct map *result = NULL;
 
-	if (next)
-		return next->map;
+	down_read(maps__lock(maps));
+	i = maps__by_address_index(maps, map);
+	if (i < maps__nr_maps(maps))
+		result = map__get(maps__maps_by_address(maps)[i]);
 
-	return NULL;
+	up_read(maps__lock(maps));
+	return result;
 }
 
 void maps__fixup_end(struct maps *maps)
 {
-	struct map_rb_node *prev = NULL, *curr;
+	struct map **maps_by_address;
+	unsigned int n;
 
 	down_write(maps__lock(maps));
+	if (!maps__maps_by_address_sorted(maps))
+		__maps__sort_by_address(maps);
 
-	maps__for_each_entry(maps, curr) {
-		if (prev && (!map__end(prev->map) || map__end(prev->map) > map__start(curr->map)))
-			map__set_end(prev->map, map__start(curr->map));
+	maps_by_address = maps__maps_by_address(maps);
+	n = maps__nr_maps(maps);
+	for (unsigned int i = 1; i < n; i++) {
+		struct map *prev = maps_by_address[i - 1];
+		struct map *curr = maps_by_address[i];
 
-		prev = curr;
+		if (!map__end(prev) || map__end(prev) > map__start(curr))
+			map__set_end(prev, map__start(curr));
 	}
 
 	/*
 	 * We still haven't the actual symbols, so guess the
 	 * last map final address.
 	 */
-	if (curr && !map__end(curr->map))
-		map__set_end(curr->map, ~0ULL);
+	if (n > 0 && !map__end(maps_by_address[n - 1]))
+		map__set_end(maps_by_address[n - 1], ~0ULL);
+
+	RC_CHK_ACCESS(maps)->ends_broken = false;
 
 	up_write(maps__lock(maps));
 }
@@ -723,117 +1107,93 @@ void maps__fixup_end(struct maps *maps)
  */
 int maps__merge_in(struct maps *kmaps, struct map *new_map)
 {
-	struct map_rb_node *rb_node;
-	struct rb_node *first;
-	bool overlaps;
-	LIST_HEAD(merged);
-	int err = 0;
-
-	down_read(maps__lock(kmaps));
-	first = first_ending_after(kmaps, new_map);
-	rb_node = first ? rb_entry(first, struct map_rb_node, rb_node) : NULL;
-	overlaps = rb_node && map__start(rb_node->map) < map__end(new_map);
-	up_read(maps__lock(kmaps));
+	unsigned int first_after_, kmaps__nr_maps;
+	struct map **kmaps_maps_by_address;
+	struct map **merged_maps_by_address;
+	unsigned int merged_nr_maps_allocated;
+
+	/* First try under a read lock. */
+	while (true) {
+		down_read(maps__lock(kmaps));
+		if (maps__maps_by_address_sorted(kmaps))
+			break;
 
-	if (!overlaps)
-		return maps__insert(kmaps, new_map);
+		up_read(maps__lock(kmaps));
 
-	maps__for_each_entry(kmaps, rb_node) {
-		struct map *old_map = rb_node->map;
+		/* First after binary search requires sorted maps. Sort and try again. */
+		maps__sort_by_address(kmaps);
+	}
+	first_after_ = first_ending_after(kmaps, new_map);
+	kmaps_maps_by_address = maps__maps_by_address(kmaps);
 
-		/* no overload with this one */
-		if (map__end(new_map) < map__start(old_map) ||
-		    map__start(new_map) >= map__end(old_map))
-			continue;
+	if (first_after_ >= maps__nr_maps(kmaps) ||
+	    map__start(kmaps_maps_by_address[first_after_]) >= map__end(new_map)) {
+		/* No overlap so regular insert suffices. */
+		up_read(maps__lock(kmaps));
+		return maps__insert(kmaps, new_map);
+	}
+	up_read(maps__lock(kmaps));
 
-		if (map__start(new_map) < map__start(old_map)) {
-			/*
-			 * |new......
-			 *       |old....
-			 */
-			if (map__end(new_map) < map__end(old_map)) {
-				/*
-				 * |new......|     -> |new..|
-				 *       |old....| ->       |old....|
-				 */
-				map__set_end(new_map, map__start(old_map));
-			} else {
-				/*
-				 * |new.............| -> |new..|       |new..|
-				 *       |old....|    ->       |old....|
-				 */
-				struct map_list_node *m = map_list_node__new();
+	/* Plain insert with a read-lock failed, try again now with the write lock. */
+	down_write(maps__lock(kmaps));
+	if (!maps__maps_by_address_sorted(kmaps))
+		__maps__sort_by_address(kmaps);
+
+	first_after_ = first_ending_after(kmaps, new_map);
+	kmaps_maps_by_address = maps__maps_by_address(kmaps);
+	kmaps__nr_maps = maps__nr_maps(kmaps);
+
+	if (first_after_ >= kmaps__nr_maps ||
+	    map__start(kmaps_maps_by_address[first_after_]) >= map__end(new_map)) {
+		/* No overlap so regular insert suffices. */
+		int ret = __maps__insert(kmaps, new_map);
+		up_write(maps__lock(kmaps));
+		return ret;
+	}
+	/* Array to merge into, possibly 1 more for the sake of new_map. */
+	merged_nr_maps_allocated = RC_CHK_ACCESS(kmaps)->nr_maps_allocated;
+	if (kmaps__nr_maps + 1 == merged_nr_maps_allocated)
+		merged_nr_maps_allocated++;
+
+	merged_maps_by_address = malloc(merged_nr_maps_allocated * sizeof(*merged_maps_by_address));
+	if (!merged_maps_by_address) {
+		up_write(maps__lock(kmaps));
+		return -ENOMEM;
+	}
+	maps__set_maps_by_address(kmaps, merged_maps_by_address);
+	maps__set_maps_by_address_sorted(kmaps, true);
+	zfree(maps__maps_by_name_addr(kmaps));
+	maps__set_maps_by_name_sorted(kmaps, true);
+	maps__set_nr_maps_allocated(kmaps, merged_nr_maps_allocated);
 
-				if (!m) {
-					err = -ENOMEM;
-					goto out;
-				}
+	/* Copy entries before the new_map that can't overlap. */
+	for (unsigned int i = 0; i < first_after_; i++)
+		merged_maps_by_address[i] = map__get(kmaps_maps_by_address[i]);
 
-				m->map = map__clone(new_map);
-				if (!m->map) {
-					free(m);
-					err = -ENOMEM;
-					goto out;
-				}
+	maps__set_nr_maps(kmaps, first_after_);
 
-				map__set_end(m->map, map__start(old_map));
-				list_add_tail(&m->node, &merged);
-				map__add_pgoff(new_map, map__end(old_map) - map__start(new_map));
-				map__set_start(new_map, map__end(old_map));
-			}
-		} else {
-			/*
-			 *      |new......
-			 * |old....
-			 */
-			if (map__end(new_map) < map__end(old_map)) {
-				/*
-				 *      |new..|   -> x
-				 * |old.........| -> |old.........|
-				 */
-				map__put(new_map);
-				new_map = NULL;
-				break;
-			} else {
-				/*
-				 *      |new......| ->         |new...|
-				 * |old....|        -> |old....|
-				 */
-				map__add_pgoff(new_map, map__end(old_map) - map__start(new_map));
-				map__set_start(new_map, map__end(old_map));
-			}
-		}
-	}
+	/* Add the new map, it will be split when the later overlapping mappings are added. */
+	__maps__insert(kmaps, new_map);
 
-out:
-	while (!list_empty(&merged)) {
-		struct map_list_node *old_node;
+	/* Insert mappings after new_map, splitting new_map in the process. */
+	for (unsigned int i = first_after_; i < kmaps__nr_maps; i++)
+		__maps__fixup_overlap_and_insert(kmaps, kmaps_maps_by_address[i]);
 
-		old_node = list_entry(merged.next, struct map_list_node, node);
-		list_del_init(&old_node->node);
-		if (!err)
-			err = maps__insert(kmaps, old_node->map);
-		map__put(old_node->map);
-		free(old_node);
-	}
+	/* Copy the maps from merged into kmaps. */
+	for (unsigned int i = 0; i < kmaps__nr_maps; i++)
+		map__zput(kmaps_maps_by_address[i]);
 
-	if (new_map) {
-		if (!err)
-			err = maps__insert(kmaps, new_map);
-		map__put(new_map);
-	}
-	return err;
+	free(kmaps_maps_by_address);
+	up_write(maps__lock(kmaps));
+	return 0;
 }
 
 void maps__load_first(struct maps *maps)
 {
-	struct map_rb_node *first;
-
 	down_read(maps__lock(maps));
 
-	first = maps__first(maps);
-	if (first)
-		map__load(first->map);
+	if (maps__nr_maps(maps) > 0)
+		map__load(maps__maps_by_address(maps)[0]);
 
 	up_read(maps__lock(maps));
 }
diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h
index d836d04c9402..d9aa62ed968a 100644
--- a/tools/perf/util/maps.h
+++ b/tools/perf/util/maps.h
@@ -3,45 +3,15 @@
 #define __PERF_MAPS_H
 
 #include <linux/refcount.h>
-#include <linux/rbtree.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <linux/types.h>
-#include "rwsem.h"
-#include <internal/rc_check.h>
 
 struct ref_reloc_sym;
 struct machine;
 struct map;
 struct maps;
 
-struct map_list_node {
-	struct list_head node;
-	struct map *map;
-};
-
-static inline struct map_list_node *map_list_node__new(void)
-{
-	return malloc(sizeof(struct map_list_node));
-}
-
-struct map *maps__find(struct maps *maps, u64 addr);
-
-DECLARE_RC_STRUCT(maps) {
-	struct rb_root      entries;
-	struct rw_semaphore lock;
-	struct machine	 *machine;
-	struct map	 *last_search_by_name;
-	struct map	 **maps_by_name;
-	refcount_t	 refcnt;
-	unsigned int	 nr_maps;
-	unsigned int	 nr_maps_allocated;
-#ifdef HAVE_LIBUNWIND_SUPPORT
-	void				*addr_space;
-	const struct unwind_libunwind_ops *unwind_libunwind_ops;
-#endif
-};
-
 #define KMAP_NAME_LEN 256
 
 struct kmap {
@@ -65,36 +35,22 @@ static inline void __maps__zput(struct maps **map)
 
 #define maps__zput(map) __maps__zput(&map)
 
+bool maps__equal(struct maps *a, struct maps *b);
+
 /* Iterate over map calling cb for each entry. */
 int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data), void *data);
 /* Iterate over map removing an entry if cb returns true. */
 void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data);
 
-static inline struct machine *maps__machine(struct maps *maps)
-{
-	return RC_CHK_ACCESS(maps)->machine;
-}
-
-static inline unsigned int maps__nr_maps(const struct maps *maps)
-{
-	return RC_CHK_ACCESS(maps)->nr_maps;
-}
-
-static inline refcount_t *maps__refcnt(struct maps *maps)
-{
-	return &RC_CHK_ACCESS(maps)->refcnt;
-}
+struct machine *maps__machine(const struct maps *maps);
+unsigned int maps__nr_maps(const struct maps *maps); /* Test only. */
+refcount_t *maps__refcnt(struct maps *maps); /* Test only. */
 
 #ifdef HAVE_LIBUNWIND_SUPPORT
-static inline void *maps__addr_space(struct maps *maps)
-{
-	return RC_CHK_ACCESS(maps)->addr_space;
-}
-
-static inline const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps)
-{
-	return RC_CHK_ACCESS(maps)->unwind_libunwind_ops;
-}
+void *maps__addr_space(const struct maps *maps);
+void maps__set_addr_space(struct maps *maps, void *addr_space);
+const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps);
+void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libunwind_ops *ops);
 #endif
 
 size_t maps__fprintf(struct maps *maps, FILE *fp);
@@ -102,6 +58,7 @@ size_t maps__fprintf(struct maps *maps, FILE *fp);
 int maps__insert(struct maps *maps, struct map *map);
 void maps__remove(struct maps *maps, struct map *map);
 
+struct map *maps__find(struct maps *maps, u64 addr);
 struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp);
 struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name, struct map **mapp);
 
@@ -117,8 +74,6 @@ struct map *maps__find_next_entry(struct maps *maps, struct map *map);
 
 int maps__merge_in(struct maps *kmaps, struct map *new_map);
 
-void __maps__sort_by_name(struct maps *maps);
-
 void maps__fixup_end(struct maps *maps);
 
 void maps__load_first(struct maps *maps);
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 3a2e3687878c..637cbd4a7bfb 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -17,51 +17,126 @@
 
 unsigned int perf_mem_events__loads_ldlat = 30;
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
-static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
-	E("ldlat-loads",	"cpu/mem-loads,ldlat=%u/P",	"cpu/events/mem-loads"),
-	E("ldlat-stores",	"cpu/mem-stores/P",		"cpu/events/mem-stores"),
-	E(NULL,			NULL,				NULL),
+struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"mem-loads",	true,	0),
+	E("ldlat-stores",	"%s/mem-stores/P",		"mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,		false,	0),
 };
 #undef E
 
 static char mem_loads_name[100];
-static bool mem_loads_name__init;
+static char mem_stores_name[100];
 
-struct perf_mem_event * __weak perf_mem_events__ptr(int i)
+struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i)
 {
-	if (i >= PERF_MEM_EVENTS__MAX)
+	if (i >= PERF_MEM_EVENTS__MAX || !pmu)
 		return NULL;
 
-	return &perf_mem_events[i];
+	return &pmu->mem_events[i];
 }
 
-const char * __weak perf_mem_events__name(int i, const char *pmu_name  __maybe_unused)
+static struct perf_pmu *perf_pmus__scan_mem(struct perf_pmu *pmu)
 {
-	struct perf_mem_event *e = perf_mem_events__ptr(i);
+	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
+		if (pmu->mem_events)
+			return pmu;
+	}
+	return NULL;
+}
+
+struct perf_pmu *perf_mem_events_find_pmu(void)
+{
+	/*
+	 * The current perf mem doesn't support per-PMU configuration.
+	 * The exact same configuration is applied to all the
+	 * mem_events supported PMUs.
+	 * Return the first mem_events supported PMU.
+	 *
+	 * Notes: The only case which may support multiple mem_events
+	 * supported PMUs is Intel hybrid. The exact same mem_events
+	 * is shared among the PMUs. Only configure the first PMU
+	 * is good enough as well.
+	 */
+	return perf_pmus__scan_mem(NULL);
+}
+
+/**
+ * perf_pmu__mem_events_num_mem_pmus - Get the number of mem PMUs since the given pmu
+ * @pmu: Start pmu. If it's NULL, search the entire PMU list.
+ */
+int perf_pmu__mem_events_num_mem_pmus(struct perf_pmu *pmu)
+{
+	int num = 0;
+
+	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL)
+		num++;
+
+	return num;
+}
 
+static const char *perf_pmu__mem_events_name(int i, struct perf_pmu *pmu)
+{
+	struct perf_mem_event *e;
+
+	if (i >= PERF_MEM_EVENTS__MAX || !pmu)
+		return NULL;
+
+	e = &pmu->mem_events[i];
 	if (!e)
 		return NULL;
 
-	if (i == PERF_MEM_EVENTS__LOAD) {
-		if (!mem_loads_name__init) {
-			mem_loads_name__init = true;
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  e->name, perf_mem_events__loads_ldlat);
+	if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE) {
+		if (e->ldlat) {
+			if (!e->aux_event) {
+				/* ARM and Most of Intel */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			} else {
+				/* Intel with mem-loads-aux event */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			}
+		} else {
+			if (!e->aux_event) {
+				/* AMD and POWER */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name);
+			} else
+				return NULL;
 		}
+
 		return mem_loads_name;
 	}
 
-	return e->name;
+	if (i == PERF_MEM_EVENTS__STORE) {
+		scnprintf(mem_stores_name, sizeof(mem_stores_name),
+			  e->name, pmu->name);
+		return mem_stores_name;
+	}
+
+	return NULL;
 }
 
-__weak bool is_mem_loads_aux_event(struct evsel *leader __maybe_unused)
+bool is_mem_loads_aux_event(struct evsel *leader)
 {
-	return false;
+	struct perf_pmu *pmu = leader->pmu;
+	struct perf_mem_event *e;
+
+	if (!pmu || !pmu->mem_events)
+		return false;
+
+	e = &pmu->mem_events[PERF_MEM_EVENTS__LOAD];
+	if (!e->aux_event)
+		return false;
+
+	return leader->core.attr.config == e->aux_event;
 }
 
-int perf_mem_events__parse(const char *str)
+int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str)
 {
 	char *tok, *saveptr = NULL;
 	bool found = false;
@@ -79,7 +154,7 @@ int perf_mem_events__parse(const char *str)
 
 	while (tok) {
 		for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-			struct perf_mem_event *e = perf_mem_events__ptr(j);
+			struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 			if (!e->tag)
 				continue;
@@ -100,19 +175,21 @@ int perf_mem_events__parse(const char *str)
 	return -1;
 }
 
-static bool perf_mem_event__supported(const char *mnt, struct perf_pmu *pmu,
+static bool perf_pmu__mem_events_supported(const char *mnt, struct perf_pmu *pmu,
 				      struct perf_mem_event *e)
 {
-	char sysfs_name[100];
 	char path[PATH_MAX];
 	struct stat st;
 
-	scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name);
-	scnprintf(path, PATH_MAX, "%s/devices/%s", mnt, sysfs_name);
+	if (!e->event_name)
+		return true;
+
+	scnprintf(path, PATH_MAX, "%s/devices/%s/events/%s", mnt, pmu->name, e->event_name);
+
 	return !stat(path, &st);
 }
 
-int perf_mem_events__init(void)
+int perf_pmu__mem_events_init(struct perf_pmu *pmu)
 {
 	const char *mnt = sysfs__mount();
 	bool found = false;
@@ -122,8 +199,7 @@ int perf_mem_events__init(void)
 		return -ENOENT;
 
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		struct perf_mem_event *e = perf_mem_events__ptr(j);
-		struct perf_pmu *pmu = NULL;
+		struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 		/*
 		 * If the event entry isn't valid, skip initialization
@@ -132,103 +208,66 @@ int perf_mem_events__init(void)
 		if (!e->tag)
 			continue;
 
-		/*
-		 * Scan all PMUs not just core ones, since perf mem/c2c on
-		 * platforms like AMD uses IBS OP PMU which is independent
-		 * of core PMU.
-		 */
-		while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-			e->supported |= perf_mem_event__supported(mnt, pmu, e);
-			if (e->supported) {
-				found = true;
-				break;
-			}
-		}
+		e->supported |= perf_pmu__mem_events_supported(mnt, pmu, e);
+		if (e->supported)
+			found = true;
 	}
 
 	return found ? 0 : -ENOENT;
 }
 
-void perf_mem_events__list(void)
+void perf_pmu__mem_events_list(struct perf_pmu *pmu)
 {
 	int j;
 
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		struct perf_mem_event *e = perf_mem_events__ptr(j);
+		struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 		fprintf(stderr, "%-*s%-*s%s",
 			e->tag ? 13 : 0,
 			e->tag ? : "",
 			e->tag && verbose > 0 ? 25 : 0,
-			e->tag && verbose > 0 ? perf_mem_events__name(j, NULL) : "",
+			e->tag && verbose > 0 ? perf_pmu__mem_events_name(j, pmu) : "",
 			e->supported ? ": available\n" : "");
 	}
 }
 
-static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e,
-						    int idx)
+int perf_mem_events__record_args(const char **rec_argv, int *argv_nr)
 {
 	const char *mnt = sysfs__mount();
 	struct perf_pmu *pmu = NULL;
-
-	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-		if (!perf_mem_event__supported(mnt, pmu, e)) {
-			pr_err("failed: event '%s' not supported\n",
-			       perf_mem_events__name(idx, pmu->name));
-		}
-	}
-}
-
-int perf_mem_events__record_args(const char **rec_argv, int *argv_nr,
-				 char **rec_tmp, int *tmp_nr)
-{
-	const char *mnt = sysfs__mount();
-	int i = *argv_nr, k = 0;
 	struct perf_mem_event *e;
+	int i = *argv_nr;
+	const char *s;
+	char *copy;
 
-	for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		e = perf_mem_events__ptr(j);
-		if (!e->record)
-			continue;
+	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) {
+		for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+			e = perf_pmu__mem_events_ptr(pmu, j);
+
+			if (!e->record)
+				continue;
 
-		if (perf_pmus__num_mem_pmus() == 1) {
 			if (!e->supported) {
 				pr_err("failed: event '%s' not supported\n",
-				       perf_mem_events__name(j, NULL));
+					perf_pmu__mem_events_name(j, pmu));
 				return -1;
 			}
 
-			rec_argv[i++] = "-e";
-			rec_argv[i++] = perf_mem_events__name(j, NULL);
-		} else {
-			struct perf_pmu *pmu = NULL;
+			s = perf_pmu__mem_events_name(j, pmu);
+			if (!s || !perf_pmu__mem_events_supported(mnt, pmu, e))
+				continue;
 
-			if (!e->supported) {
-				perf_mem_events__print_unsupport_hybrid(e, j);
+			copy = strdup(s);
+			if (!copy)
 				return -1;
-			}
-
-			while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-				const char *s = perf_mem_events__name(j, pmu->name);
-
-				if (!perf_mem_event__supported(mnt, pmu, e))
-					continue;
 
-				rec_argv[i++] = "-e";
-				if (s) {
-					char *copy = strdup(s);
-					if (!copy)
-						return -1;
-
-					rec_argv[i++] = copy;
-					rec_tmp[k++] = copy;
-				}
-			}
+			rec_argv[i++] = "-e";
+			rec_argv[i++] = copy;
 		}
 	}
 
 	*argv_nr = i;
-	*tmp_nr = k;
 	return 0;
 }
 
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
index b40ad6ea93fc..15d5f0320d27 100644
--- a/tools/perf/util/mem-events.h
+++ b/tools/perf/util/mem-events.h
@@ -14,9 +14,11 @@
 struct perf_mem_event {
 	bool		record;
 	bool		supported;
+	bool		ldlat;
+	u32		aux_event;
 	const char	*tag;
 	const char	*name;
-	const char	*sysfs_name;
+	const char	*event_name;
 };
 
 struct mem_info {
@@ -34,17 +36,18 @@ enum {
 };
 
 extern unsigned int perf_mem_events__loads_ldlat;
+extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX];
 
-int perf_mem_events__parse(const char *str);
-int perf_mem_events__init(void);
+int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str);
+int perf_pmu__mem_events_init(struct perf_pmu *pmu);
 
-const char *perf_mem_events__name(int i, const char *pmu_name);
-struct perf_mem_event *perf_mem_events__ptr(int i);
+struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i);
+struct perf_pmu *perf_mem_events_find_pmu(void);
+int perf_pmu__mem_events_num_mem_pmus(struct perf_pmu *pmu);
 bool is_mem_loads_aux_event(struct evsel *leader);
 
-void perf_mem_events__list(void);
-int perf_mem_events__record_args(const char **rec_argv, int *argv_nr,
-				 char **rec_tmp, int *tmp_nr);
+void perf_pmu__mem_events_list(struct perf_pmu *pmu);
+int perf_mem_events__record_args(const char **rec_argv, int *argv_nr);
 
 int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 966cca5a3e88..79ef6095ab28 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -44,6 +44,8 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events,
 	if (!metric_events)
 		return NULL;
 
+	if (evsel && evsel->metric_leader)
+		me.evsel = evsel->metric_leader;
 	nd = rblist__find(metric_events, &me);
 	if (nd)
 		return container_of(nd, struct metric_event, nd);
@@ -350,25 +352,23 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids,
 	return 0;
 }
 
-static bool match_metric(const char *n, const char *list)
+static bool match_metric(const char *metric_or_groups, const char *sought)
 {
 	int len;
 	char *m;
 
-	if (!list)
+	if (!sought)
 		return false;
-	if (!strcmp(list, "all"))
+	if (!strcmp(sought, "all"))
 		return true;
-	if (!n)
-		return !strcasecmp(list, "No_group");
-	len = strlen(list);
-	m = strcasestr(n, list);
-	if (!m)
-		return false;
-	if ((m == n || m[-1] == ';' || m[-1] == ' ') &&
-	    (m[len] == 0 || m[len] == ';'))
+	if (!metric_or_groups)
+		return !strcasecmp(sought, "No_group");
+	len = strlen(sought);
+	if (!strncasecmp(metric_or_groups, sought, len) &&
+	    (metric_or_groups[len] == 0 || metric_or_groups[len] == ';'))
 		return true;
-	return false;
+	m = strchr(metric_or_groups, ';');
+	return m && match_metric(m + 1, sought);
 }
 
 static bool match_pm_metric(const struct pmu_metric *pm, const char *pmu, const char *metric)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 66eabcea4242..6f8b0fa17689 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -2181,50 +2181,53 @@ int parse_event(struct evlist *evlist, const char *str)
 	return ret;
 }
 
+struct parse_events_error_entry {
+	/** @list: The list the error is part of. */
+	struct list_head list;
+	/** @idx: index in the parsed string */
+	int   idx;
+	/** @str: string to display at the index */
+	char *str;
+	/** @help: optional help string */
+	char *help;
+};
+
 void parse_events_error__init(struct parse_events_error *err)
 {
-	bzero(err, sizeof(*err));
+	INIT_LIST_HEAD(&err->list);
 }
 
 void parse_events_error__exit(struct parse_events_error *err)
 {
-	zfree(&err->str);
-	zfree(&err->help);
-	zfree(&err->first_str);
-	zfree(&err->first_help);
+	struct parse_events_error_entry *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &err->list, list) {
+		zfree(&pos->str);
+		zfree(&pos->help);
+		list_del_init(&pos->list);
+		free(pos);
+	}
 }
 
 void parse_events_error__handle(struct parse_events_error *err, int idx,
 				char *str, char *help)
 {
+	struct parse_events_error_entry *entry;
+
 	if (WARN(!str || !err, "WARNING: failed to provide error string or struct\n"))
 		goto out_free;
-	switch (err->num_errors) {
-	case 0:
-		err->idx = idx;
-		err->str = str;
-		err->help = help;
-		break;
-	case 1:
-		err->first_idx = err->idx;
-		err->idx = idx;
-		err->first_str = err->str;
-		err->str = str;
-		err->first_help = err->help;
-		err->help = help;
-		break;
-	default:
-		pr_debug("Multiple errors dropping message: %s (%s)\n",
-			err->str, err->help ?: "<no help>");
-		free(err->str);
-		err->str = str;
-		free(err->help);
-		err->help = help;
-		break;
+
+	entry = zalloc(sizeof(*entry));
+	if (!entry) {
+		pr_err("Failed to allocate memory for event parsing error: %s (%s)\n",
+			str, help ?: "<no help>");
+		goto out_free;
 	}
-	err->num_errors++;
+	entry->idx = idx;
+	entry->str = str;
+	entry->help = help;
+	list_add(&entry->list, &err->list);
 	return;
-
 out_free:
 	free(str);
 	free(help);
@@ -2294,19 +2297,34 @@ static void __parse_events_error__print(int err_idx, const char *err_str,
 	}
 }
 
-void parse_events_error__print(struct parse_events_error *err,
+void parse_events_error__print(const struct parse_events_error *err,
 			       const char *event)
 {
-	if (!err->num_errors)
-		return;
+	struct parse_events_error_entry *pos;
+	bool first = true;
+
+	list_for_each_entry(pos, &err->list, list) {
+		if (!first)
+			fputs("\n", stderr);
+		__parse_events_error__print(pos->idx, pos->str, pos->help, event);
+		first = false;
+	}
+}
 
-	__parse_events_error__print(err->idx, err->str, err->help, event);
+/*
+ * In the list of errors err, do any of the error strings (str) contain the
+ * given needle string?
+ */
+bool parse_events_error__contains(const struct parse_events_error *err,
+				  const char *needle)
+{
+	struct parse_events_error_entry *pos;
 
-	if (err->num_errors > 1) {
-		fputs("\nInitial error:\n", stderr);
-		__parse_events_error__print(err->first_idx, err->first_str,
-					err->first_help, event);
+	list_for_each_entry(pos, &err->list, list) {
+		if (strstr(pos->str, needle) != NULL)
+			return true;
 	}
+	return false;
 }
 
 #undef MAX_WIDTH
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 63c0a36a4bf1..809359e8544e 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -130,13 +130,8 @@ struct parse_events_term {
 };
 
 struct parse_events_error {
-	int   num_errors;       /* number of errors encountered */
-	int   idx;	/* index in the parsed string */
-	char *str;      /* string to display at the index */
-	char *help;	/* optional help string */
-	int   first_idx;/* as above, but for the first encountered error */
-	char *first_str;
-	char *first_help;
+	/** @list: The head of a list of errors. */
+	struct list_head list;
 };
 
 /* A wrapper around a list of terms for the sake of better type safety. */
@@ -247,9 +242,10 @@ void parse_events_error__init(struct parse_events_error *err);
 void parse_events_error__exit(struct parse_events_error *err);
 void parse_events_error__handle(struct parse_events_error *err, int idx,
 				char *str, char *help);
-void parse_events_error__print(struct parse_events_error *err,
+void parse_events_error__print(const struct parse_events_error *err,
 			       const char *event);
-
+bool parse_events_error__contains(const struct parse_events_error *err,
+				  const char *needle);
 #ifdef HAVE_LIBELF_SUPPORT
 /*
  * If the probe point starts with '%',
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index de098caf0c1c..d70f5d84af92 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -536,8 +536,6 @@ tracepoint_name opt_event_config
 	list = alloc_list();
 	if (!list)
 		YYNOMEM;
-	if (error)
-		error->idx = @1.first_column;
 
 	err = parse_events_add_tracepoint(list, &parse_state->idx, $1.sys, $1.event,
 					error, $2, &@1);
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index a4a100425b3a..cda1c620968e 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -46,22 +46,18 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 
 			if (!strcmp(s, "?")) {
 				fprintf(stderr, "available registers: ");
-#ifdef HAVE_PERF_REGS_SUPPORT
-				for (r = sample_reg_masks; r->name; r++) {
+				for (r = arch__sample_reg_masks(); r->name; r++) {
 					if (r->mask & mask)
 						fprintf(stderr, "%s ", r->name);
 				}
-#endif
 				fputc('\n', stderr);
 				/* just printing available regs */
 				goto error;
 			}
-#ifdef HAVE_PERF_REGS_SUPPORT
-			for (r = sample_reg_masks; r->name; r++) {
+			for (r = arch__sample_reg_masks(); r->name; r++) {
 				if ((r->mask & mask) && !strcasecmp(s, r->name))
 					break;
 			}
-#endif
 			if (!r || !r->name) {
 				ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
 					    s, intr ? "-I" : "--user-regs=");
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
index 696566c54768..9dcda80d310f 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/arm64/include/uapi/asm/perf_regs.h"
 
@@ -92,5 +90,3 @@ uint64_t __perf_reg_sp_arm64(void)
 {
 	return PERF_REG_ARM64_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_arm.c b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
index 700fd07cd2aa..e29d130a587a 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_arm.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/arm/include/uapi/asm/perf_regs.h"
 
@@ -56,5 +54,3 @@ uint64_t __perf_reg_sp_arm(void)
 {
 	return PERF_REG_ARM_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_csky.c b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
index a2841094e096..75b461ef2eba 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_csky.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../arch/csky/include/uapi/asm/perf_regs.h"
 
@@ -96,5 +94,3 @@ uint64_t __perf_reg_sp_csky(void)
 {
 	return PERF_REG_CSKY_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
index a9ba0f934123..043f97f4e3ac 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/loongarch/include/uapi/asm/perf_regs.h"
 
@@ -87,5 +85,3 @@ uint64_t __perf_reg_sp_loongarch(void)
 {
 	return PERF_REG_LOONGARCH_R3;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_mips.c b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
index 5a45830cfbf5..793178fc3c78 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_mips.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/mips/include/uapi/asm/perf_regs.h"
 
@@ -83,5 +81,3 @@ uint64_t __perf_reg_sp_mips(void)
 {
 	return PERF_REG_MIPS_R29;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
index 1f0d682db74a..08636bb09a3a 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/powerpc/include/uapi/asm/perf_regs.h"
 
@@ -141,5 +139,3 @@ uint64_t __perf_reg_sp_powerpc(void)
 {
 	return PERF_REG_POWERPC_R1;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
index e432630be4c5..337b687c655d 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/riscv/include/uapi/asm/perf_regs.h"
 
@@ -88,5 +86,3 @@ uint64_t __perf_reg_sp_riscv(void)
 {
 	return PERF_REG_RISCV_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_s390.c b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
index 1c7a46db778c..d69bba881080 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_s390.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/s390/include/uapi/asm/perf_regs.h"
 
@@ -92,5 +90,3 @@ uint64_t __perf_reg_sp_s390(void)
 {
 	return PERF_REG_S390_R15;
 }
-
-#endif
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index 873c620f0634..708954a9d35d 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#ifdef HAVE_PERF_REGS_SUPPORT
-
 #include "../perf_regs.h"
 #include "../../../arch/x86/include/uapi/asm/perf_regs.h"
 
@@ -94,5 +92,3 @@ uint64_t __perf_reg_sp_x86(void)
 {
 	return PERF_REG_X86_SP;
 }
-
-#endif
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index e2275856b570..44b90bbf2d07 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -21,7 +21,14 @@ uint64_t __weak arch__user_reg_mask(void)
 	return 0;
 }
 
-#ifdef HAVE_PERF_REGS_SUPPORT
+static const struct sample_reg sample_reg_masks[] = {
+	SMPL_REG_END
+};
+
+const struct sample_reg * __weak arch__sample_reg_masks(void)
+{
+	return sample_reg_masks;
+}
 
 const char *perf_reg_name(int id, const char *arch)
 {
@@ -125,5 +132,3 @@ uint64_t perf_arch_reg_sp(const char *arch)
 	pr_err("Fail to find SP register for arch %s, returns 0\n", arch);
 	return 0;
 }
-
-#endif
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index ecd2a5362042..f2d0736d65cc 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -26,9 +26,7 @@ enum {
 int arch_sdt_arg_parse_op(char *old_op, char **new_op);
 uint64_t arch__intr_reg_mask(void);
 uint64_t arch__user_reg_mask(void);
-
-#ifdef HAVE_PERF_REGS_SUPPORT
-extern const struct sample_reg sample_reg_masks[];
+const struct sample_reg *arch__sample_reg_masks(void);
 
 const char *perf_reg_name(int id, const char *arch);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
@@ -67,34 +65,4 @@ static inline uint64_t DWARF_MINIMAL_REGS(const char *arch)
 	return (1ULL << perf_arch_reg_ip(arch)) | (1ULL << perf_arch_reg_sp(arch));
 }
 
-#else
-
-static inline uint64_t DWARF_MINIMAL_REGS(const char *arch __maybe_unused)
-{
-	return 0;
-}
-
-static inline const char *perf_reg_name(int id __maybe_unused, const char *arch __maybe_unused)
-{
-	return "unknown";
-}
-
-static inline int perf_reg_value(u64 *valp __maybe_unused,
-				 struct regs_dump *regs __maybe_unused,
-				 int id __maybe_unused)
-{
-	return 0;
-}
-
-static inline uint64_t perf_arch_reg_ip(const char *arch __maybe_unused)
-{
-	return 0;
-}
-
-static inline uint64_t perf_arch_reg_sp(const char *arch __maybe_unused)
-{
-	return 0;
-}
-
-#endif /* HAVE_PERF_REGS_SUPPORT */
 #endif /* __PERF_REGS_H */
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 3c9609944a2f..f39cbbc1a7ec 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -657,7 +657,7 @@ static int pmu_aliases_parse(struct perf_pmu *pmu)
 	return 0;
 }
 
-static int pmu_alias_terms(struct perf_pmu_alias *alias, struct list_head *terms)
+static int pmu_alias_terms(struct perf_pmu_alias *alias, int err_loc, struct list_head *terms)
 {
 	struct parse_events_term *term, *cloned;
 	struct parse_events_terms clone_terms;
@@ -675,6 +675,7 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias, struct list_head *terms
 		 * which we don't want for implicit terms in aliases.
 		 */
 		cloned->weak = true;
+		cloned->err_term = cloned->err_val = err_loc;
 		list_add_tail(&cloned->list, &clone_terms.terms);
 	}
 	list_splice_init(&clone_terms.terms, terms);
@@ -986,8 +987,10 @@ static int pmu_max_precise(int dirfd, struct perf_pmu *pmu)
 }
 
 void __weak
-perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
+perf_pmu__arch_init(struct perf_pmu *pmu)
 {
+	if (pmu->is_core)
+		pmu->mem_events = perf_mem_events;
 }
 
 struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char *name)
@@ -1019,10 +1022,9 @@ struct perf_pmu *perf_pmu__lookup(struct list_head *pmus, int dirfd, const char
 	 * type value and format definitions. Load both right
 	 * now.
 	 */
-	if (pmu_format(pmu, dirfd, name)) {
-		free(pmu);
-		return NULL;
-	}
+	if (pmu_format(pmu, dirfd, name))
+		goto err;
+
 	pmu->is_core = is_pmu_core(name);
 	pmu->cpus = pmu_cpumask(dirfd, name, pmu->is_core);
 
@@ -1361,8 +1363,8 @@ static int pmu_config_term(const struct perf_pmu *pmu,
 
 			parse_events_error__handle(err, term->err_val,
 				asprintf(&err_str,
-				    "value too big for format, maximum is %llu",
-				    (unsigned long long)max_val) < 0
+				    "value too big for format (%s), maximum is %llu",
+				    format->name, (unsigned long long)max_val) < 0
 				    ? strdup("value too big for format")
 				    : err_str,
 				    NULL);
@@ -1516,7 +1518,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
 		alias = pmu_find_alias(pmu, term);
 		if (!alias)
 			continue;
-		ret = pmu_alias_terms(alias, &term->list);
+		ret = pmu_alias_terms(alias, term->err_term, &term->list);
 		if (ret) {
 			parse_events_error__handle(err, term->err_term,
 						strdup("Failure to duplicate terms"),
@@ -1760,6 +1762,12 @@ bool pmu__name_match(const struct perf_pmu *pmu, const char *pmu_name)
 
 bool perf_pmu__is_software(const struct perf_pmu *pmu)
 {
+	const char *known_sw_pmus[] = {
+		"kprobe",
+		"msr",
+		"uprobe",
+	};
+
 	if (pmu->is_core || pmu->is_uncore || pmu->auxtrace)
 		return false;
 	switch (pmu->type) {
@@ -1771,7 +1779,11 @@ bool perf_pmu__is_software(const struct perf_pmu *pmu)
 	case PERF_TYPE_BREAKPOINT:	return true;
 	default: break;
 	}
-	return !strcmp(pmu->name, "kprobe") || !strcmp(pmu->name, "uprobe");
+	for (size_t i = 0; i < ARRAY_SIZE(known_sw_pmus); i++) {
+		if (!strcmp(pmu->name, known_sw_pmus[i]))
+			return true;
+	}
+	return false;
 }
 
 FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name)
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 424c3fee0949..e35d985206db 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -10,6 +10,8 @@
 #include <stdio.h>
 #include "parse-events.h"
 #include "pmu-events/pmu-events.h"
+#include "map_symbol.h"
+#include "mem-events.h"
 
 struct evsel_config_term;
 struct perf_cpu_map;
@@ -162,6 +164,11 @@ struct perf_pmu {
 		 */
 		bool exclude_guest;
 	} missing_features;
+
+	/**
+	 * @mem_events: List of the supported mem events
+	 */
+	struct perf_mem_event *mem_events;
 };
 
 /** @perf_pmu__fake: A special global PMU used for testing. */
diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c
index ce4931461741..16505071d362 100644
--- a/tools/perf/util/pmus.c
+++ b/tools/perf/util/pmus.c
@@ -345,12 +345,6 @@ const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str)
 	return NULL;
 }
 
-int __weak perf_pmus__num_mem_pmus(void)
-{
-	/* All core PMUs are for mem events. */
-	return perf_pmus__num_core_pmus();
-}
-
 /** Struct for ordering events as output in perf list. */
 struct sevent {
 	/** PMU for event. */
diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h
index 4c67153ac257..94d2a08d894b 100644
--- a/tools/perf/util/pmus.h
+++ b/tools/perf/util/pmus.h
@@ -17,7 +17,6 @@ struct perf_pmu *perf_pmus__scan_core(struct perf_pmu *pmu);
 
 const struct perf_pmu *perf_pmus__pmu_for_pmu_filter(const char *str);
 
-int perf_pmus__num_mem_pmus(void);
 void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *print_state);
 bool perf_pmus__have_event(const char *pname, const char *name);
 int perf_pmus__num_core_pmus(void);
diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index 9e47712507cc..7b54e9385442 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -28,6 +28,7 @@
 #include "tracepoint.h"
 #include "pfm.h"
 #include "thread_map.h"
+#include "util.h"
 
 #define MAX_NAME_LEN 100
 
@@ -63,6 +64,8 @@ void print_tracepoint_events(const struct print_callbacks *print_cb __maybe_unus
 {
 	char *events_path = get_tracing_file("events");
 	int events_fd = open(events_path, O_PATH);
+	struct dirent **sys_namelist = NULL;
+	int sys_items;
 
 	put_tracing_file(events_path);
 	if (events_fd < 0) {
@@ -70,10 +73,7 @@ void print_tracepoint_events(const struct print_callbacks *print_cb __maybe_unus
 		return;
 	}
 
-#ifdef HAVE_SCANDIRAT_SUPPORT
-{
-	struct dirent **sys_namelist = NULL;
-	int sys_items = tracing_events__scandir_alphasort(&sys_namelist);
+	sys_items = tracing_events__scandir_alphasort(&sys_namelist);
 
 	for (int i = 0; i < sys_items; i++) {
 		struct dirent *sys_dirent = sys_namelist[i];
@@ -130,11 +130,6 @@ next_sys:
 	}
 
 	free(sys_namelist);
-}
-#else
-	printf("\nWARNING: Your libc doesn't have the scandirat function, please ask its maintainers to implement it.\n"
-	       "         As a rough fallback, please do 'ls %s' to see the available tracepoint events.\n", events_path);
-#endif
 	close(events_fd);
 }
 
@@ -232,7 +227,6 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state)
 bool is_event_supported(u8 type, u64 config)
 {
 	bool ret = true;
-	int open_return;
 	struct evsel *evsel;
 	struct perf_event_attr attr = {
 		.type = type,
@@ -246,20 +240,32 @@ bool is_event_supported(u8 type, u64 config)
 
 	evsel = evsel__new(&attr);
 	if (evsel) {
-		open_return = evsel__open(evsel, NULL, tmap);
-		ret = open_return >= 0;
+		ret = evsel__open(evsel, NULL, tmap) >= 0;
 
-		if (open_return == -EACCES) {
+		if (!ret) {
 			/*
-			 * This happens if the paranoid value
+			 * The event may fail to open if the paranoid value
 			 * /proc/sys/kernel/perf_event_paranoid is set to 2
-			 * Re-run with exclude_kernel set; we don't do that
-			 * by default as some ARM machines do not support it.
-			 *
+			 * Re-run with exclude_kernel set; we don't do that by
+			 * default as some ARM machines do not support it.
 			 */
 			evsel->core.attr.exclude_kernel = 1;
 			ret = evsel__open(evsel, NULL, tmap) >= 0;
 		}
+
+		if (!ret) {
+			/*
+			 * The event may fail to open if the PMU requires
+			 * exclude_guest to be set (e.g. as the Apple M1 PMU
+			 * requires).
+			 * Re-run with exclude_guest set; we don't do that by
+			 * default as it's equally legitimate for another PMU
+			 * driver to require that exclude_guest is clear.
+			 */
+			evsel->core.attr.exclude_guest = 1;
+			ret = evsel__open(evsel, NULL, tmap) >= 0;
+		}
+
 		evsel__delete(evsel);
 	}
 
diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c
new file mode 100644
index 000000000000..459e0e93d7b1
--- /dev/null
+++ b/tools/perf/util/print_insn.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Instruction binary disassembler based on capstone.
+ *
+ * Author(s): Changbin Du <changbin.du@huawei.com>
+ */
+#include <string.h>
+#include <stdbool.h>
+#include "debug.h"
+#include "sample.h"
+#include "symbol.h"
+#include "machine.h"
+#include "thread.h"
+#include "print_insn.h"
+
+size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp)
+{
+	int printed = 0;
+
+	for (int i = 0; i < sample->insn_len; i++) {
+		printed += fprintf(fp, "%02x", (unsigned char)sample->insn[i]);
+		if (sample->insn_len - i > 1)
+			printed += fprintf(fp, " ");
+	}
+	return printed;
+}
+
+#ifdef HAVE_LIBCAPSTONE_SUPPORT
+#include <capstone/capstone.h>
+
+static int capstone_init(struct machine *machine, csh *cs_handle)
+{
+	cs_arch arch;
+	cs_mode mode;
+
+	if (machine__is(machine, "x86_64")) {
+		arch = CS_ARCH_X86;
+		mode = CS_MODE_64;
+	} else if (machine__normalized_is(machine, "x86")) {
+		arch = CS_ARCH_X86;
+		mode = CS_MODE_32;
+	} else if (machine__normalized_is(machine, "arm64")) {
+		arch = CS_ARCH_ARM64;
+		mode = CS_MODE_ARM;
+	} else if (machine__normalized_is(machine, "arm")) {
+		arch = CS_ARCH_ARM;
+		mode = CS_MODE_ARM + CS_MODE_V8;
+	} else if (machine__normalized_is(machine, "s390")) {
+		arch = CS_ARCH_SYSZ;
+		mode = CS_MODE_BIG_ENDIAN;
+	} else {
+		return -1;
+	}
+
+	if (cs_open(arch, mode, cs_handle) != CS_ERR_OK) {
+		pr_warning_once("cs_open failed\n");
+		return -1;
+	}
+
+	if (machine__normalized_is(machine, "x86")) {
+		cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT);
+		/*
+		 * Resolving address operands to symbols is implemented
+		 * on x86 by investigating instruction details.
+		 */
+		cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON);
+	}
+
+	return 0;
+}
+
+static size_t print_insn_x86(struct perf_sample *sample, struct thread *thread,
+			     cs_insn *insn, FILE *fp)
+{
+	struct addr_location al;
+	size_t printed = 0;
+
+	if (insn->detail && insn->detail->x86.op_count == 1) {
+		cs_x86_op *op = &insn->detail->x86.operands[0];
+
+		addr_location__init(&al);
+		if (op->type == X86_OP_IMM &&
+		    thread__find_symbol(thread, sample->cpumode, op->imm, &al)) {
+			printed += fprintf(fp, "%s ", insn[0].mnemonic);
+			printed += symbol__fprintf_symname_offs(al.sym, &al, fp);
+			addr_location__exit(&al);
+			return printed;
+		}
+		addr_location__exit(&al);
+	}
+
+	printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str);
+	return printed;
+}
+
+size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread,
+				struct machine *machine, FILE *fp)
+{
+	csh cs_handle;
+	cs_insn *insn;
+	size_t count;
+	size_t printed = 0;
+	int ret;
+
+	/* TODO: Try to initiate capstone only once but need a proper place. */
+	ret = capstone_init(machine, &cs_handle);
+	if (ret < 0) {
+		/* fallback */
+		return sample__fprintf_insn_raw(sample, fp);
+	}
+
+	count = cs_disasm(cs_handle, (uint8_t *)sample->insn, sample->insn_len,
+			  sample->ip, 1, &insn);
+	if (count > 0) {
+		if (machine__normalized_is(machine, "x86"))
+			printed += print_insn_x86(sample, thread, &insn[0], fp);
+		else
+			printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str);
+		cs_free(insn, count);
+	} else {
+		printed += fprintf(fp, "illegal instruction");
+	}
+
+	cs_close(&cs_handle);
+	return printed;
+}
+#else
+size_t sample__fprintf_insn_asm(struct perf_sample *sample __maybe_unused,
+				struct thread *thread __maybe_unused,
+				struct machine *machine __maybe_unused,
+				FILE *fp __maybe_unused)
+{
+	return 0;
+}
+#endif
diff --git a/tools/perf/util/print_insn.h b/tools/perf/util/print_insn.h
new file mode 100644
index 000000000000..465bdcfcc2fd
--- /dev/null
+++ b/tools/perf/util/print_insn.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef PERF_PRINT_INSN_H
+#define PERF_PRINT_INSN_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+struct perf_sample;
+struct thread;
+struct machine;
+
+size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread,
+				struct machine *machine, FILE *fp);
+size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp);
+
+#endif /* PERF_PRINT_INSN_H */
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index a1a796043691..2a0ad9ecf0a2 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -358,6 +358,7 @@ static int kernel_get_module_dso(const char *module, struct dso **pdso)
 		map = maps__find_by_name(machine__kernel_maps(host_machine), module_name);
 		if (map) {
 			dso = map__dso(map);
+			map__put(map);
 			goto found;
 		}
 		pr_debug("Failed to find module %s.\n", module);
@@ -2273,9 +2274,7 @@ static int find_perf_probe_point_from_map(struct probe_trace_point *tp,
 	ret = pp->function ? 0 : -ENOMEM;
 
 out:
-	if (map && !is_kprobe) {
-		map__put(map);
-	}
+	map__put(map);
 
 	return ret;
 }
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index c8923375e30d..630e16c54ed5 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -186,8 +186,6 @@ static_var:
 	return ret2;
 }
 
-#define BYTES_TO_BITS(nb)	((nb) * BITS_PER_LONG / sizeof(long))
-
 static int convert_variable_type(Dwarf_Die *vr_die,
 				 struct probe_trace_arg *tvar,
 				 const char *cast, bool user_access)
@@ -217,7 +215,7 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 		total = dwarf_bytesize(vr_die);
 		if (boffs < 0 || total < 0)
 			return -ENOENT;
-		ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs,
+		ret = snprintf(buf, 16, "b%d@%d/%d", bsize, boffs,
 				BYTES_TO_BITS(total));
 		goto formatted;
 	}
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index 593b660ec75e..1bec945f4838 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -31,6 +31,7 @@ util/counts.c
 util/print_binary.c
 util/strlist.c
 util/trace-event.c
+util/trace-event-parse.c
 ../lib/rbtree.c
 util/string.c
 util/symbol_fprintf.c
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 8761f51b5c7c..075c0f79b1b9 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -181,6 +181,7 @@ int perf_bpf_filter__destroy(struct evsel *evsel __maybe_unused)
  * implementing 'verbose' and 'eprintf'.
  */
 int verbose;
+int debug_kmaps;
 int debug_peo_args;
 
 int eprintf(int level, int var, const char *fmt, ...);
diff --git a/tools/perf/util/rb_resort.h b/tools/perf/util/rb_resort.h
index 376e86cb4c3c..d927a0d25052 100644
--- a/tools/perf/util/rb_resort.h
+++ b/tools/perf/util/rb_resort.h
@@ -143,9 +143,4 @@ struct __name##_sorted *__name = __name##_sorted__new
 	DECLARE_RESORT_RB(__name)(&__ilist->rblist.entries.rb_root,		\
 				  __ilist->rblist.nr_entries)
 
-/* For 'struct machine->threads' */
-#define DECLARE_RESORT_RB_MACHINE_THREADS(__name, __machine, hash_bucket)    \
- DECLARE_RESORT_RB(__name)(&__machine->threads[hash_bucket].entries.rb_root, \
-			   __machine->threads[hash_bucket].nr)
-
 #endif /* _PERF_RESORT_RB_H_ */
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 860e1837ba96..b4f0f60e60a6 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -858,6 +858,10 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 	pydict_set_item_string_decref(dict, "ev_name", _PyUnicode_FromString(evsel__name(evsel)));
 	pydict_set_item_string_decref(dict, "attr", _PyBytes_FromStringAndSize((const char *)&evsel->core.attr, sizeof(evsel->core.attr)));
 
+	pydict_set_item_string_decref(dict_sample, "id",
+			PyLong_FromUnsignedLongLong(sample->id));
+	pydict_set_item_string_decref(dict_sample, "stream_id",
+			PyLong_FromUnsignedLongLong(sample->stream_id));
 	pydict_set_item_string_decref(dict_sample, "pid",
 			_PyLong_FromLong(sample->pid));
 	pydict_set_item_string_decref(dict_sample, "tid",
@@ -1306,7 +1310,7 @@ static void python_export_sample_table(struct db_export *dbe,
 	struct tables *tables = container_of(dbe, struct tables, dbe);
 	PyObject *t;
 
-	t = tuple_new(25);
+	t = tuple_new(27);
 
 	tuple_set_d64(t, 0, es->db_id);
 	tuple_set_d64(t, 1, es->evsel->db_id);
@@ -1333,6 +1337,8 @@ static void python_export_sample_table(struct db_export *dbe,
 	tuple_set_d64(t, 22, es->sample->insn_cnt);
 	tuple_set_d64(t, 23, es->sample->cyc_cnt);
 	tuple_set_s32(t, 24, es->sample->flags);
+	tuple_set_d64(t, 25, es->sample->id);
+	tuple_set_d64(t, 26, es->sample->stream_id);
 
 	call_object(tables->sample_handler, t, "sample_table");
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 199d3e8df315..06d0bd7fb459 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2720,6 +2720,17 @@ size_t perf_session__fprintf(struct perf_session *session, FILE *fp)
 	return machine__fprintf(&session->machines.host, fp);
 }
 
+void perf_session__dump_kmaps(struct perf_session *session)
+{
+	int save_verbose = verbose;
+
+	fflush(stdout);
+	fprintf(stderr, "Kernel and module maps:\n");
+	verbose = 0; /* Suppress verbose to print a summary only */
+	maps__fprintf(machine__kernel_maps(&session->machines.host), stderr);
+	verbose = save_verbose;
+}
+
 struct evsel *perf_session__find_first_evtype(struct perf_session *session,
 					      unsigned int type)
 {
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index ee3715e8563b..5064c6ec11e7 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -133,6 +133,8 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp
 size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp,
 				       bool skip_empty);
 
+void perf_session__dump_kmaps(struct perf_session *session);
+
 struct evsel *perf_session__find_first_evtype(struct perf_session *session,
 					    unsigned int type);
 
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index 79d5e2955f85..3107f5aa8c9a 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -85,6 +85,7 @@ if '-DHAVE_LIBTRACEEVENT' in cflags:
     extra_libraries += [ 'traceevent' ]
 else:
     ext_sources.remove('util/trace-event.c')
+    ext_sources.remove('util/trace-event-parse.c')
 
 # use full paths with source files
 ext_sources = list(map(lambda x: '%s/%s' % (src_perf, x) , ext_sources))
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 30254eb63709..92a1bd695e8a 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -3372,7 +3372,7 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 				sort_dimension_add_dynamic_header(sd);
 		}
 
-		if (sd->entry == &sort_parent) {
+		if (sd->entry == &sort_parent && parent_pattern) {
 			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
 			if (ret) {
 				char err[BUFSIZ];
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index 034b496df297..7addc34afcf5 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -399,6 +399,8 @@ static void addr2line_subprocess_cleanup(struct child_process *a2l)
 		kill(a2l->pid, SIGKILL);
 		finish_command(a2l); /* ignore result, we don't care */
 		a2l->pid = -1;
+		close(a2l->in);
+		close(a2l->out);
 	}
 
 	free(a2l);
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 8c61f8627ebc..bfc1d705f437 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -201,6 +201,9 @@ static void print_aggr_id_std(struct perf_stat_config *config,
 		snprintf(buf, sizeof(buf), "S%d-D%d-L%d-ID%d",
 			 id.socket, id.die, id.cache_lvl, id.cache);
 		break;
+	case AGGR_CLUSTER:
+		snprintf(buf, sizeof(buf), "S%d-D%d-CLS%d", id.socket, id.die, id.cluster);
+		break;
 	case AGGR_DIE:
 		snprintf(buf, sizeof(buf), "S%d-D%d", id.socket, id.die);
 		break;
@@ -251,6 +254,10 @@ static void print_aggr_id_csv(struct perf_stat_config *config,
 		fprintf(config->output, "S%d-D%d-L%d-ID%d%s%d%s",
 			id.socket, id.die, id.cache_lvl, id.cache, sep, aggr_nr, sep);
 		break;
+	case AGGR_CLUSTER:
+		fprintf(config->output, "S%d-D%d-CLS%d%s%d%s",
+			id.socket, id.die, id.cluster, sep, aggr_nr, sep);
+		break;
 	case AGGR_DIE:
 		fprintf(output, "S%d-D%d%s%d%s",
 			id.socket, id.die, sep, aggr_nr, sep);
@@ -300,6 +307,10 @@ static void print_aggr_id_json(struct perf_stat_config *config,
 		fprintf(output, "\"cache\" : \"S%d-D%d-L%d-ID%d\", \"aggregate-number\" : %d, ",
 			id.socket, id.die, id.cache_lvl, id.cache, aggr_nr);
 		break;
+	case AGGR_CLUSTER:
+		fprintf(output, "\"cluster\" : \"S%d-D%d-CLS%d\", \"aggregate-number\" : %d, ",
+			id.socket, id.die, id.cluster, aggr_nr);
+		break;
 	case AGGR_DIE:
 		fprintf(output, "\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ",
 			id.socket, id.die, aggr_nr);
@@ -560,7 +571,7 @@ static void print_metric_only(struct perf_stat_config *config,
 	if (color)
 		mlen += strlen(color) + sizeof(PERF_COLOR_RESET) - 1;
 
-	color_snprintf(str, sizeof(str), color ?: "", fmt, val);
+	color_snprintf(str, sizeof(str), color ?: "", fmt ?: "", val);
 	fprintf(out, "%*s ", mlen, str);
 	os->first = false;
 }
@@ -1126,11 +1137,16 @@ static void print_no_aggr_metric(struct perf_stat_config *config,
 			u64 ena, run, val;
 			double uval;
 			struct perf_stat_evsel *ps = counter->stats;
-			int aggr_idx = perf_cpu_map__idx(evsel__cpus(counter), cpu);
+			int aggr_idx = 0;
 
-			if (aggr_idx < 0)
+			if (!perf_cpu_map__has(evsel__cpus(counter), cpu))
 				continue;
 
+			cpu_aggr_map__for_each_idx(aggr_idx, config->aggr_map) {
+				if (config->aggr_map->map[aggr_idx].cpu.cpu == cpu.cpu)
+					break;
+			}
+
 			os->evsel = counter;
 			os->id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
 			if (first) {
@@ -1248,6 +1264,7 @@ static void print_header_interval_std(struct perf_stat_config *config,
 	case AGGR_NODE:
 	case AGGR_SOCKET:
 	case AGGR_DIE:
+	case AGGR_CLUSTER:
 	case AGGR_CACHE:
 	case AGGR_CORE:
 		fprintf(output, "#%*s %-*s cpus",
@@ -1550,6 +1567,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 	switch (config->aggr_mode) {
 	case AGGR_CORE:
 	case AGGR_CACHE:
+	case AGGR_CLUSTER:
 	case AGGR_DIE:
 	case AGGR_SOCKET:
 	case AGGR_NODE:
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index e31426167852..3466aa952442 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -355,11 +355,13 @@ static void print_nsecs(struct perf_stat_config *config,
 		print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0);
 }
 
-static int prepare_metric(struct evsel **metric_events,
-			  struct metric_ref *metric_refs,
+static int prepare_metric(const struct metric_expr *mexp,
+			  const struct evsel *evsel,
 			  struct expr_parse_ctx *pctx,
 			  int aggr_idx)
 {
+	struct evsel * const *metric_events = mexp->metric_events;
+	struct metric_ref *metric_refs = mexp->metric_refs;
 	int i;
 
 	for (i = 0; metric_events[i]; i++) {
@@ -398,12 +400,33 @@ static int prepare_metric(struct evsel **metric_events,
 			source_count = 1;
 		} else {
 			struct perf_stat_evsel *ps = metric_events[i]->stats;
-			struct perf_stat_aggr *aggr = &ps->aggr[aggr_idx];
+			struct perf_stat_aggr *aggr;
 
+			/*
+			 * If there are multiple uncore PMUs and we're not
+			 * reading the leader's stats, determine the stats for
+			 * the appropriate uncore PMU.
+			 */
+			if (evsel && evsel->metric_leader &&
+			    evsel->pmu != evsel->metric_leader->pmu &&
+			    mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {
+				struct evsel *pos;
+
+				evlist__for_each_entry(evsel->evlist, pos) {
+					if (pos->pmu != evsel->pmu)
+						continue;
+					if (pos->metric_leader != mexp->metric_events[i])
+						continue;
+					ps = pos->stats;
+					source_count = 1;
+					break;
+				}
+			}
+			aggr = &ps->aggr[aggr_idx];
 			if (!aggr)
 				break;
 
-                        if (!metric_events[i]->supported) {
+			if (!metric_events[i]->supported) {
 				/*
 				 * Not supported events will have a count of 0,
 				 * which can be confusing in a
@@ -414,13 +437,9 @@ static int prepare_metric(struct evsel **metric_events,
 				val = NAN;
 				source_count = 0;
 			} else {
-				/*
-				 * If an event was scaled during stat gathering,
-				 * reverse the scale before computing the
-				 * metric.
-				 */
-				val = aggr->counts.val * (1.0 / metric_events[i]->scale);
-				source_count = evsel__source_count(metric_events[i]);
+				val = aggr->counts.val;
+				if (!source_count)
+					source_count = evsel__source_count(metric_events[i]);
 			}
 		}
 		n = strdup(evsel__metric_id(metric_events[i]));
@@ -441,18 +460,18 @@ static int prepare_metric(struct evsel **metric_events,
 }
 
 static void generic_metric(struct perf_stat_config *config,
-			   const char *metric_expr,
-			   const char *metric_threshold,
-			   struct evsel **metric_events,
-			   struct metric_ref *metric_refs,
-			   char *name,
-			   const char *metric_name,
-			   const char *metric_unit,
-			   int runtime,
+			   struct metric_expr *mexp,
+			   struct evsel *evsel,
 			   int aggr_idx,
 			   struct perf_stat_output_ctx *out)
 {
 	print_metric_t print_metric = out->print_metric;
+	const char *metric_name = mexp->metric_name;
+	const char *metric_expr = mexp->metric_expr;
+	const char *metric_threshold = mexp->metric_threshold;
+	const char *metric_unit = mexp->metric_unit;
+	struct evsel * const *metric_events = mexp->metric_events;
+	int runtime = mexp->runtime;
 	struct expr_parse_ctx *pctx;
 	double ratio, scale, threshold;
 	int i;
@@ -467,7 +486,7 @@ static void generic_metric(struct perf_stat_config *config,
 		pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list);
 	pctx->sctx.runtime = runtime;
 	pctx->sctx.system_wide = config->system_wide;
-	i = prepare_metric(metric_events, metric_refs, pctx, aggr_idx);
+	i = prepare_metric(mexp, evsel, pctx, aggr_idx);
 	if (i < 0) {
 		expr__ctx_free(pctx);
 		return;
@@ -502,18 +521,18 @@ static void generic_metric(struct perf_stat_config *config,
 				print_metric(config, ctxp, color, "%8.2f",
 					metric_name ?
 					metric_name :
-					out->force_header ?  name : "",
+					out->force_header ?  evsel->name : "",
 					ratio);
 			}
 		} else {
 			print_metric(config, ctxp, color, /*unit=*/NULL,
 				     out->force_header ?
-				     (metric_name ? metric_name : name) : "", 0);
+				     (metric_name ?: evsel->name) : "", 0);
 		}
 	} else {
 		print_metric(config, ctxp, color, /*unit=*/NULL,
 			     out->force_header ?
-			     (metric_name ? metric_name : name) : "", 0);
+			     (metric_name ?: evsel->name) : "", 0);
 	}
 
 	expr__ctx_free(pctx);
@@ -528,7 +547,7 @@ double test_generic_metric(struct metric_expr *mexp, int aggr_idx)
 	if (!pctx)
 		return NAN;
 
-	if (prepare_metric(mexp->metric_events, mexp->metric_refs, pctx, aggr_idx) < 0)
+	if (prepare_metric(mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
 		goto out;
 
 	if (expr__parse(&ratio, pctx, mexp->metric_expr))
@@ -630,10 +649,7 @@ void *perf_stat__print_shadow_stats_metricgroup(struct perf_stat_config *config,
 
 		if ((*num)++ > 0)
 			out->new_line(config, ctxp);
-		generic_metric(config, mexp->metric_expr, mexp->metric_threshold,
-			       mexp->metric_events, mexp->metric_refs, evsel->name,
-			       mexp->metric_name, mexp->metric_unit, mexp->runtime,
-			       aggr_idx, out);
+		generic_metric(config, mexp, evsel, aggr_idx, out);
 	}
 
 	return NULL;
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 4357ba114822..d6e5c8787ba2 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -48,6 +48,7 @@ enum aggr_mode {
 	AGGR_GLOBAL,
 	AGGR_SOCKET,
 	AGGR_DIE,
+	AGGR_CLUSTER,
 	AGGR_CACHE,
 	AGGR_CORE,
 	AGGR_THREAD,
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 4b934ed3bfd1..0b91f813c4fa 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -23,6 +23,7 @@
 #include <linux/ctype.h>
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
+#include <linux/string.h>
 #include <symbol/kallsyms.h>
 #include <internal/lib.h>
 
@@ -1329,6 +1330,58 @@ out_close:
 	return -1;
 }
 
+static bool is_exe_text(int flags)
+{
+	return (flags & (SHF_ALLOC | SHF_EXECINSTR)) == (SHF_ALLOC | SHF_EXECINSTR);
+}
+
+/*
+ * Some executable module sections like .noinstr.text might be laid out with
+ * .text so they can use the same mapping (memory address to file offset).
+ * Check if that is the case. Refer to kernel layout_sections(). Return the
+ * maximum offset.
+ */
+static u64 max_text_section(Elf *elf, GElf_Ehdr *ehdr)
+{
+	Elf_Scn *sec = NULL;
+	GElf_Shdr shdr;
+	u64 offs = 0;
+
+	/* Doesn't work for some arch */
+	if (ehdr->e_machine == EM_PARISC ||
+	    ehdr->e_machine == EM_ALPHA)
+		return 0;
+
+	/* ELF is corrupted/truncated, avoid calling elf_strptr. */
+	if (!elf_rawdata(elf_getscn(elf, ehdr->e_shstrndx), NULL))
+		return 0;
+
+	while ((sec = elf_nextscn(elf, sec)) != NULL) {
+		char *sec_name;
+
+		if (!gelf_getshdr(sec, &shdr))
+			break;
+
+		if (!is_exe_text(shdr.sh_flags))
+			continue;
+
+		/* .init and .exit sections are not placed with .text */
+		sec_name = elf_strptr(elf, ehdr->e_shstrndx, shdr.sh_name);
+		if (!sec_name ||
+		    strstarts(sec_name, ".init") ||
+		    strstarts(sec_name, ".exit"))
+			break;
+
+		/* Must be next to previous, assumes .text is first */
+		if (offs && PERF_ALIGN(offs, shdr.sh_addralign ?: 1) != shdr.sh_offset)
+			break;
+
+		offs = shdr.sh_offset + shdr.sh_size;
+	}
+
+	return offs;
+}
+
 /**
  * ref_reloc_sym_not_found - has kernel relocation symbol been found.
  * @kmap: kernel maps and relocation reference symbol
@@ -1368,7 +1421,8 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
 				      struct maps *kmaps, struct kmap *kmap,
 				      struct dso **curr_dsop, struct map **curr_mapp,
 				      const char *section_name,
-				      bool adjust_kernel_syms, bool kmodule, bool *remap_kernel)
+				      bool adjust_kernel_syms, bool kmodule, bool *remap_kernel,
+				      u64 max_text_sh_offset)
 {
 	struct dso *curr_dso = *curr_dsop;
 	struct map *curr_map;
@@ -1424,6 +1478,17 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
 	if (!kmap)
 		return 0;
 
+	/*
+	 * perf does not record module section addresses except for .text, but
+	 * some sections can use the same mapping as .text.
+	 */
+	if (kmodule && adjust_kernel_syms && is_exe_text(shdr->sh_flags) &&
+	    shdr->sh_offset <= max_text_sh_offset) {
+		*curr_mapp = map;
+		*curr_dsop = dso;
+		return 0;
+	}
+
 	snprintf(dso_name, sizeof(dso_name), "%s%s", dso->short_name, section_name);
 
 	curr_map = maps__find_by_name(kmaps, dso_name);
@@ -1470,8 +1535,10 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
 		dso__set_loaded(curr_dso);
 		*curr_mapp = curr_map;
 		*curr_dsop = curr_dso;
-	} else
+	} else {
 		*curr_dsop = map__dso(curr_map);
+		map__put(curr_map);
+	}
 
 	return 0;
 }
@@ -1497,6 +1564,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 	Elf *elf;
 	int nr = 0;
 	bool remap_kernel = false, adjust_kernel_syms = false;
+	u64 max_text_sh_offset = 0;
 
 	if (kmap && !kmaps)
 		return -1;
@@ -1584,6 +1652,10 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 		remap_kernel = true;
 		adjust_kernel_syms = dso->adjust_symbols;
 	}
+
+	if (kmodule && adjust_kernel_syms)
+		max_text_sh_offset = max_text_section(runtime_ss->elf, &runtime_ss->ehdr);
+
 	elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) {
 		struct symbol *f;
 		const char *elf_name = elf_sym__name(&sym, symstrs);
@@ -1673,7 +1745,8 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 
 		if (dso->kernel) {
 			if (dso__process_kernel_symbol(dso, map, &sym, &shdr, kmaps, kmap, &curr_dso, &curr_map,
-						       section_name, adjust_kernel_syms, kmodule, &remap_kernel))
+						       section_name, adjust_kernel_syms, kmodule,
+						       &remap_kernel, max_text_sh_offset))
 				goto out_elf_end;
 		} else if ((used_opd && runtime_ss->adjust_symbols) ||
 			   (!used_opd && syms_ss->adjust_symbols)) {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index be212ba157dc..9ebdb8e13c0b 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -63,6 +63,16 @@ struct symbol_conf symbol_conf = {
 	.res_sample		= 0,
 };
 
+struct map_list_node {
+	struct list_head node;
+	struct map *map;
+};
+
+static struct map_list_node *map_list_node__new(void)
+{
+	return malloc(sizeof(struct map_list_node));
+}
+
 static enum dso_binary_type binary_type_symtab[] = {
 	DSO_BINARY_TYPE__KALLSYMS,
 	DSO_BINARY_TYPE__GUEST_KALLSYMS,
@@ -238,14 +248,31 @@ void symbols__fixup_end(struct rb_root_cached *symbols, bool is_kallsyms)
 		 * segment is very big.  Therefore do not fill this gap and do
 		 * not assign it to the kernel dso map (kallsyms).
 		 *
+		 * Also BPF code can be allocated separately from text segments
+		 * and modules.  So the last entry in a module should not fill
+		 * the gap too.
+		 *
 		 * In kallsyms, it determines module symbols using '[' character
 		 * like in:
 		 *   ffffffffc1937000 T hdmi_driver_init  [snd_hda_codec_hdmi]
 		 */
 		if (prev->end == prev->start) {
+			const char *prev_mod;
+			const char *curr_mod;
+
+			if (!is_kallsyms) {
+				prev->end = curr->start;
+				continue;
+			}
+
+			prev_mod = strchr(prev->name, '[');
+			curr_mod = strchr(curr->name, '[');
+
 			/* Last kernel/module symbol mapped to end of page */
-			if (is_kallsyms && (!strchr(prev->name, '[') !=
-					    !strchr(curr->name, '[')))
+			if (!prev_mod != !curr_mod)
+				prev->end = roundup(prev->end + 4096, 4096);
+			/* Last symbol in the previous module */
+			else if (prev_mod && strcmp(prev_mod, curr_mod))
 				prev->end = roundup(prev->end + 4096, 4096);
 			else
 				prev->end = curr->start;
@@ -757,7 +784,6 @@ static int dso__load_all_kallsyms(struct dso *dso, const char *filename)
 
 static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso)
 {
-	struct map *curr_map;
 	struct symbol *pos;
 	int count = 0;
 	struct rb_root_cached old_root = dso->symbols;
@@ -770,6 +796,7 @@ static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso)
 	*root = RB_ROOT_CACHED;
 
 	while (next) {
+		struct map *curr_map;
 		struct dso *curr_map_dso;
 		char *module;
 
@@ -796,6 +823,7 @@ static int maps__split_kallsyms_for_kcore(struct maps *kmaps, struct dso *dso)
 			pos->end -= map__start(curr_map) - map__pgoff(curr_map);
 		symbols__insert(&curr_map_dso->symbols, pos);
 		++count;
+		map__put(curr_map);
 	}
 
 	/* Symbols have been adjusted */
@@ -813,7 +841,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 				struct map *initial_map)
 {
 	struct machine *machine;
-	struct map *curr_map = initial_map;
+	struct map *curr_map = map__get(initial_map);
 	struct symbol *pos;
 	int count = 0, moved = 0;
 	struct rb_root_cached *root = &dso->symbols;
@@ -857,13 +885,14 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 					dso__set_loaded(curr_map_dso);
 				}
 
+				map__zput(curr_map);
 				curr_map = maps__find_by_name(kmaps, module);
 				if (curr_map == NULL) {
 					pr_debug("%s/proc/{kallsyms,modules} "
 					         "inconsistency while looking "
 						 "for \"%s\" module!\n",
 						 machine->root_dir, module);
-					curr_map = initial_map;
+					curr_map = map__get(initial_map);
 					goto discard_symbol;
 				}
 				curr_map_dso = map__dso(curr_map);
@@ -887,7 +916,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 			 * symbols at this point.
 			 */
 			goto discard_symbol;
-		} else if (curr_map != initial_map) {
+		} else if (!RC_CHK_EQUAL(curr_map, initial_map)) {
 			char dso_name[PATH_MAX];
 			struct dso *ndso;
 
@@ -898,7 +927,8 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 			}
 
 			if (count == 0) {
-				curr_map = initial_map;
+				map__zput(curr_map);
+				curr_map = map__get(initial_map);
 				goto add_symbol;
 			}
 
@@ -912,6 +942,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 					kernel_range++);
 
 			ndso = dso__new(dso_name);
+			map__zput(curr_map);
 			if (ndso == NULL)
 				return -1;
 
@@ -925,6 +956,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 
 			map__set_mapping_type(curr_map, MAPPING_TYPE__IDENTITY);
 			if (maps__insert(kmaps, curr_map)) {
+				map__zput(curr_map);
 				dso__put(ndso);
 				return -1;
 			}
@@ -935,7 +967,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta,
 			pos->end -= delta;
 		}
 add_symbol:
-		if (curr_map != initial_map) {
+		if (!RC_CHK_EQUAL(curr_map, initial_map)) {
 			struct dso *curr_map_dso = map__dso(curr_map);
 
 			rb_erase_cached(&pos->rb_node, root);
@@ -950,12 +982,12 @@ discard_symbol:
 		symbol__delete(pos);
 	}
 
-	if (curr_map != initial_map &&
+	if (!RC_CHK_EQUAL(curr_map, initial_map) &&
 	    dso->kernel == DSO_SPACE__KERNEL_GUEST &&
 	    machine__is_default_guest(maps__machine(kmaps))) {
 		dso__set_loaded(map__dso(curr_map));
 	}
-
+	map__put(curr_map);
 	return count + moved;
 }
 
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 89c47a5098e2..1aa8962dcf52 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -26,7 +26,7 @@ int thread__init_maps(struct thread *thread, struct machine *machine)
 	if (pid == thread__tid(thread) || pid == -1) {
 		thread__set_maps(thread, maps__new(machine));
 	} else {
-		struct thread *leader = __machine__findnew_thread(machine, pid, pid);
+		struct thread *leader = machine__findnew_thread(machine, pid, pid);
 
 		if (leader) {
 			thread__set_maps(thread, maps__get(thread__maps(leader)));
@@ -383,7 +383,7 @@ static int thread__clone_maps(struct thread *thread, struct thread *parent, bool
 	if (thread__pid(thread) == thread__pid(parent))
 		return thread__prepare_access(thread);
 
-	if (RC_CHK_EQUAL(thread__maps(thread), thread__maps(parent))) {
+	if (maps__equal(thread__maps(thread), thread__maps(parent))) {
 		pr_debug("broken map groups on thread %d/%d parent %d/%d\n",
 			 thread__pid(thread), thread__tid(thread),
 			 thread__pid(parent), thread__tid(parent));
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 0df775b5c110..8b4a3c69bad1 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -3,7 +3,6 @@
 #define __PERF_THREAD_H
 
 #include <linux/refcount.h>
-#include <linux/rbtree.h>
 #include <linux/list.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -13,7 +12,6 @@
 #include <strlist.h>
 #include <intlist.h>
 #include "rwsem.h"
-#include "event.h"
 #include "callchain.h"
 #include <internal/rc_check.h>
 
@@ -30,11 +28,6 @@ struct lbr_stitch {
 	struct callchain_cursor_node	*prev_lbr_cursor;
 };
 
-struct thread_rb_node {
-	struct rb_node rb_node;
-	struct thread *thread;
-};
-
 DECLARE_RC_STRUCT(thread) {
 	/** @maps: mmaps associated with this thread. */
 	struct maps		*maps;
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index e848579e61a8..b5f12390c355 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -109,9 +109,10 @@ static struct perf_thread_map *__thread_map__new_all_cpus(uid_t uid)
 
 		snprintf(path, sizeof(path), "/proc/%d/task", pid);
 		items = scandir(path, &namelist, filter, NULL);
-		if (items <= 0)
-			goto out_free_closedir;
-
+		if (items <= 0) {
+			pr_debug("scandir for %d returned empty, skipping\n", pid);
+			continue;
+		}
 		while (threads->nr + items >= max_threads) {
 			max_threads *= 2;
 			grow = true;
@@ -152,8 +153,6 @@ out_free_namelist:
 	for (i = 0; i < items; i++)
 		zfree(&namelist[i]);
 	free(namelist);
-
-out_free_closedir:
 	zfree(&threads);
 	goto out_closedir;
 }
@@ -280,13 +279,13 @@ struct perf_thread_map *thread_map__new_by_tid_str(const char *tid_str)
 		threads->nr = ntasks;
 	}
 out:
+	strlist__delete(slist);
 	if (threads)
 		refcount_set(&threads->refcnt, 1);
 	return threads;
 
 out_free_threads:
 	zfree(&threads);
-	strlist__delete(slist);
 	goto out;
 }
 
diff --git a/tools/perf/util/threads.c b/tools/perf/util/threads.c
new file mode 100644
index 000000000000..ff2b169e0085
--- /dev/null
+++ b/tools/perf/util/threads.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "threads.h"
+#include "machine.h"
+#include "thread.h"
+
+static struct threads_table_entry *threads__table(struct threads *threads, pid_t tid)
+{
+	/* Cast it to handle tid == -1 */
+	return &threads->table[(unsigned int)tid % THREADS__TABLE_SIZE];
+}
+
+static size_t key_hash(long key, void *ctx __maybe_unused)
+{
+	/* The table lookup removes low bit entropy, but this is just ignored here. */
+	return key;
+}
+
+static bool key_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+	return key1 == key2;
+}
+
+void threads__init(struct threads *threads)
+{
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+
+		hashmap__init(&table->shard, key_hash, key_equal, NULL);
+		init_rwsem(&table->lock);
+		table->last_match = NULL;
+	}
+}
+
+void threads__exit(struct threads *threads)
+{
+	threads__remove_all_threads(threads);
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+
+		hashmap__clear(&table->shard);
+		exit_rwsem(&table->lock);
+	}
+}
+
+size_t threads__nr(struct threads *threads)
+{
+	size_t nr = 0;
+
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+
+		down_read(&table->lock);
+		nr += hashmap__size(&table->shard);
+		up_read(&table->lock);
+	}
+	return nr;
+}
+
+/*
+ * Front-end cache - TID lookups come in blocks,
+ * so most of the time we dont have to look up
+ * the full rbtree:
+ */
+static struct thread *__threads_table_entry__get_last_match(struct threads_table_entry *table,
+							    pid_t tid)
+{
+	struct thread *th, *res = NULL;
+
+	th = table->last_match;
+	if (th != NULL) {
+		if (thread__tid(th) == tid)
+			res = thread__get(th);
+	}
+	return res;
+}
+
+static void __threads_table_entry__set_last_match(struct threads_table_entry *table,
+						  struct thread *th)
+{
+	thread__put(table->last_match);
+	table->last_match = thread__get(th);
+}
+
+static void threads_table_entry__set_last_match(struct threads_table_entry *table,
+						struct thread *th)
+{
+	down_write(&table->lock);
+	__threads_table_entry__set_last_match(table, th);
+	up_write(&table->lock);
+}
+
+struct thread *threads__find(struct threads *threads, pid_t tid)
+{
+	struct threads_table_entry *table  = threads__table(threads, tid);
+	struct thread *res;
+
+	down_read(&table->lock);
+	res = __threads_table_entry__get_last_match(table, tid);
+	if (!res) {
+		if (hashmap__find(&table->shard, tid, &res))
+			res = thread__get(res);
+	}
+	up_read(&table->lock);
+	if (res)
+		threads_table_entry__set_last_match(table, res);
+	return res;
+}
+
+struct thread *threads__findnew(struct threads *threads, pid_t pid, pid_t tid, bool *created)
+{
+	struct threads_table_entry *table  = threads__table(threads, tid);
+	struct thread *res = NULL;
+
+	*created = false;
+	down_write(&table->lock);
+	res = thread__new(pid, tid);
+	if (res) {
+		if (hashmap__add(&table->shard, tid, res)) {
+			/* Add failed. Assume a race so find other entry. */
+			thread__put(res);
+			res = NULL;
+			if (hashmap__find(&table->shard, tid, &res))
+				res = thread__get(res);
+		} else {
+			res = thread__get(res);
+			*created = true;
+		}
+		if (res)
+			__threads_table_entry__set_last_match(table, res);
+	}
+	up_write(&table->lock);
+	return res;
+}
+
+void threads__remove_all_threads(struct threads *threads)
+{
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+		struct hashmap_entry *cur, *tmp;
+		size_t bkt;
+
+		down_write(&table->lock);
+		__threads_table_entry__set_last_match(table, NULL);
+		hashmap__for_each_entry_safe((&table->shard), cur, tmp, bkt) {
+			struct thread *old_value;
+
+			hashmap__delete(&table->shard, cur->key, /*old_key=*/NULL, &old_value);
+			thread__put(old_value);
+		}
+		up_write(&table->lock);
+	}
+}
+
+void threads__remove(struct threads *threads, struct thread *thread)
+{
+	struct threads_table_entry *table  = threads__table(threads, thread__tid(thread));
+	struct thread *old_value;
+
+	down_write(&table->lock);
+	if (table->last_match && RC_CHK_EQUAL(table->last_match, thread))
+		__threads_table_entry__set_last_match(table, NULL);
+
+	hashmap__delete(&table->shard, thread__tid(thread), /*old_key=*/NULL, &old_value);
+	thread__put(old_value);
+	up_write(&table->lock);
+}
+
+int threads__for_each_thread(struct threads *threads,
+			     int (*fn)(struct thread *thread, void *data),
+			     void *data)
+{
+	for (int i = 0; i < THREADS__TABLE_SIZE; i++) {
+		struct threads_table_entry *table = &threads->table[i];
+		struct hashmap_entry *cur;
+		size_t bkt;
+
+		down_read(&table->lock);
+		hashmap__for_each_entry((&table->shard), cur, bkt) {
+			int rc = fn((struct thread *)cur->pvalue, data);
+
+			if (rc != 0) {
+				up_read(&table->lock);
+				return rc;
+			}
+		}
+		up_read(&table->lock);
+	}
+	return 0;
+
+}
diff --git a/tools/perf/util/threads.h b/tools/perf/util/threads.h
new file mode 100644
index 000000000000..da68d2223f18
--- /dev/null
+++ b/tools/perf/util/threads.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_THREADS_H
+#define __PERF_THREADS_H
+
+#include "hashmap.h"
+#include "rwsem.h"
+
+struct thread;
+
+#define THREADS__TABLE_BITS	3
+#define THREADS__TABLE_SIZE	(1 << THREADS__TABLE_BITS)
+
+struct threads_table_entry {
+	/* Key is tid, value is struct thread. */
+	struct hashmap	       shard;
+	struct rw_semaphore    lock;
+	struct thread	       *last_match;
+};
+
+struct threads {
+	struct threads_table_entry table[THREADS__TABLE_SIZE];
+};
+
+void threads__init(struct threads *threads);
+void threads__exit(struct threads *threads);
+size_t threads__nr(struct threads *threads);
+struct thread *threads__find(struct threads *threads, pid_t tid);
+struct thread *threads__findnew(struct threads *threads, pid_t pid, pid_t tid, bool *created);
+void threads__remove_all_threads(struct threads *threads);
+void threads__remove(struct threads *threads, struct thread *thread);
+int threads__for_each_thread(struct threads *threads,
+			     int (*fn)(struct thread *thread, void *data),
+			     void *data);
+
+#endif	/* __PERF_THREADS_H */
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 2d3c2576bab7..f0332bd3a501 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -122,6 +122,119 @@ void event_format__print(struct tep_event *event,
 	return event_format__fprintf(event, cpu, data, size, stdout);
 }
 
+/*
+ * prev_state is of size long, which is 32 bits on 32 bit architectures.
+ * As it needs to have the same bits for both 32 bit and 64 bit architectures
+ * we can just assume that the flags we care about will all be within
+ * the 32 bits.
+ */
+#define MAX_STATE_BITS 32
+
+static const char *convert_sym(struct tep_print_flag_sym *sym)
+{
+	static char save_states[MAX_STATE_BITS + 1];
+
+	memset(save_states, 0, sizeof(save_states));
+
+	/* This is the flags for the prev_state_field, now make them into a string */
+	for (; sym; sym = sym->next) {
+		long bitmask = strtoul(sym->value, NULL, 0);
+		int i;
+
+		for (i = 0; !(bitmask & 1); i++)
+			bitmask >>= 1;
+
+		if (i >= MAX_STATE_BITS)
+			continue;
+
+		save_states[i] = sym->str[0];
+	}
+
+	return save_states;
+}
+
+static struct tep_print_arg_field *
+find_arg_field(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
+{
+	struct tep_print_arg_field *field;
+
+	if (!arg)
+		return NULL;
+
+	if (arg->type == TEP_PRINT_FIELD)
+		return &arg->field;
+
+	if (arg->type == TEP_PRINT_OP) {
+		field = find_arg_field(prev_state_field, arg->op.left);
+		if (field && field->field == prev_state_field)
+			return field;
+		field = find_arg_field(prev_state_field, arg->op.right);
+		if (field && field->field == prev_state_field)
+			return field;
+	}
+	return NULL;
+}
+
+static struct tep_print_flag_sym *
+test_flags(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
+{
+	struct tep_print_arg_field *field;
+
+	field = find_arg_field(prev_state_field, arg->flags.field);
+	if (!field)
+		return NULL;
+
+	return arg->flags.flags;
+}
+
+static struct tep_print_flag_sym *
+search_op(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
+{
+	struct tep_print_flag_sym *sym = NULL;
+
+	if (!arg)
+		return NULL;
+
+	if (arg->type == TEP_PRINT_OP) {
+		sym = search_op(prev_state_field, arg->op.left);
+		if (sym)
+			return sym;
+
+		sym = search_op(prev_state_field, arg->op.right);
+		if (sym)
+			return sym;
+	} else if (arg->type == TEP_PRINT_FLAGS) {
+		sym = test_flags(prev_state_field, arg);
+	}
+
+	return sym;
+}
+
+const char *parse_task_states(struct tep_format_field *state_field)
+{
+	struct tep_print_flag_sym *sym;
+	struct tep_print_arg *arg;
+	struct tep_event *event;
+
+	event = state_field->event;
+
+	/*
+	 * Look at the event format fields, and search for where
+	 * the prev_state is parsed via the format flags.
+	 */
+	for (arg = event->print_fmt.args; arg; arg = arg->next) {
+		/*
+		 * Currently, the __print_flags() for the prev_state
+		 * is embedded in operations, so they too must be
+		 * searched.
+		 */
+		sym = search_op(state_field, arg);
+		if (sym)
+			return convert_sym(sym);
+	}
+	return NULL;
+}
+
 void parse_ftrace_printk(struct tep_handle *pevent,
 			 char *file, unsigned int size __maybe_unused)
 {
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index a69ee29419f3..bbf8b26bc8da 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -15,6 +15,7 @@ struct perf_tool;
 struct thread;
 struct tep_plugin_list;
 struct evsel;
+struct tep_format_field;
 
 struct trace_event {
 	struct tep_handle	*pevent;
@@ -51,6 +52,8 @@ int parse_event_file(struct tep_handle *pevent,
 unsigned long long
 raw_field_value(struct tep_event *event, const char *name, void *data);
 
+const char *parse_task_states(struct tep_format_field *state_field);
+
 void parse_proc_kallsyms(struct tep_handle *pevent, char *file, unsigned int size);
 void parse_ftrace_printk(struct tep_handle *pevent, char *file, unsigned int size);
 void parse_saved_cmdline(struct tep_handle *pevent, char *file, unsigned int size);
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 6013335a8dae..b38d322734b4 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -263,7 +263,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	struct unwind_info *ui, ui_buf = {
 		.sample		= data,
 		.thread		= thread,
-		.machine	= RC_CHK_ACCESS(thread__maps(thread))->machine,
+		.machine	= maps__machine((thread__maps(thread))),
 		.cb		= cb,
 		.arg		= arg,
 		.max_stack	= max_stack,
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index dac536e28360..6a5ac0faa6f4 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -706,7 +706,7 @@ static int _unwind__prepare_access(struct maps *maps)
 {
 	void *addr_space = unw_create_addr_space(&accessors, 0);
 
-	RC_CHK_ACCESS(maps)->addr_space = addr_space;
+	maps__set_addr_space(maps, addr_space);
 	if (!addr_space) {
 		pr_err("unwind: Can't create unwind address space.\n");
 		return -ENOMEM;
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 76cd63de80a8..2728eb4f13ea 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -12,11 +12,6 @@ struct unwind_libunwind_ops __weak *local_unwind_libunwind_ops;
 struct unwind_libunwind_ops __weak *x86_32_unwind_libunwind_ops;
 struct unwind_libunwind_ops __weak *arm64_unwind_libunwind_ops;
 
-static void unwind__register_ops(struct maps *maps, struct unwind_libunwind_ops *ops)
-{
-	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = ops;
-}
-
 int unwind__prepare_access(struct maps *maps, struct map *map, bool *initialized)
 {
 	const char *arch;
@@ -60,7 +55,7 @@ int unwind__prepare_access(struct maps *maps, struct map *map, bool *initialized
 		return 0;
 	}
 out_register:
-	unwind__register_ops(maps, ops);
+	maps__set_unwind_libunwind_ops(maps, ops);
 
 	err = maps__unwind_libunwind_ops(maps)->prepare_access(maps);
 	if (initialized)
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index c1fd9ba6d697..4f561e5e4162 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -552,3 +552,22 @@ int sched_getcpu(void)
 	return -1;
 }
 #endif
+
+#ifndef HAVE_SCANDIRAT_SUPPORT
+int scandirat(int dirfd, const char *dirp,
+	      struct dirent ***namelist,
+	      int (*filter)(const struct dirent *),
+	      int (*compar)(const struct dirent **, const struct dirent **))
+{
+	char path[PATH_MAX];
+	int err, fd = openat(dirfd, dirp, O_PATH);
+
+	if (fd < 0)
+		return fd;
+
+	snprintf(path, sizeof(path), "/proc/%d/fd/%d", getpid(), fd);
+	err = scandir(path, namelist, filter, compar);
+	close(fd);
+	return err;
+}
+#endif
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 7c8915d92dca..9966c21aaf04 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -6,6 +6,7 @@
 /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */
 #define _DEFAULT_SOURCE 1
 
+#include <dirent.h>
 #include <fcntl.h>
 #include <stdbool.h>
 #include <stddef.h>
@@ -56,6 +57,13 @@ int perf_tip(char **strp, const char *dirpath);
 int sched_getcpu(void);
 #endif
 
+#ifndef HAVE_SCANDIRAT_SUPPORT
+int scandirat(int dirfd, const char *dirp,
+	      struct dirent ***namelist,
+	      int (*filter)(const struct dirent *),
+	      int (*compar)(const struct dirent **, const struct dirent **));
+#endif
+
 extern bool perf_singlethreaded;
 
 void perf_set_singlethreaded(void);
diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c
index 388c9e3ad040..44a9ecbd91e8 100644
--- a/tools/power/acpi/tools/pfrut/pfrut.c
+++ b/tools/power/acpi/tools/pfrut/pfrut.c
@@ -174,6 +174,8 @@ void print_cap(struct pfru_update_cap_info *cap)
 		exit(1);
 	}
 
+	printf("update capability:%d\n", cap->update_cap);
+
 	uuid_unparse(cap->code_type, uuid);
 	printf("code injection image type:%s\n", uuid);
 	printf("fw_version:%d\n", cap->fw_version);
diff --git a/tools/power/cpupower/man/cpupower-frequency-info.1 b/tools/power/cpupower/man/cpupower-frequency-info.1
index dd545b499480..47fdd7218748 100644
--- a/tools/power/cpupower/man/cpupower-frequency-info.1
+++ b/tools/power/cpupower/man/cpupower-frequency-info.1
@@ -32,7 +32,7 @@ Gets the currently used cpufreq policy.
 \fB\-g\fR \fB\-\-governors\fR
 Determines available cpufreq governors.
 .TP  
-\fB\-a\fR \fB\-\-related\-cpus\fR
+\fB\-r\fR \fB\-\-related\-cpus\fR
 Determines which CPUs run at the same hardware frequency.
 .TP  
 \fB\-a\fR \fB\-\-affected\-cpus\fR
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 8f08c3fd498d..0d3672e5d9ed 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -67,6 +67,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
 .PP
 \fB--quiet\fP Do not decode and print the system configuration header information.
 .PP
++\fB--no-msr\fP Disable all the uses of the MSR driver.
++.PP
++\fB--no-perf\fP Disable all the uses of the perf API.
++.PP
 \fB--interval seconds\fP overrides the default 5.0 second measurement interval.
 .PP
 \fB--num_iterations num\fP number of the measurement iterations.
@@ -125,9 +129,17 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 .PP
 \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor.
 .PP
-\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms.
+\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used.
 .PP
-\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz.
+\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used.
+.PP
+\fBGFXAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used.
+.PP
+\fBSAM%mc6\fP The percentage of time the SA Media is in the "module C6" state, mc6, during the measurement interval. From /sys/class/drm/card0/gt/gt1/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used.
+.PP
+\fBSAMMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used.
+.PP
+\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used.
 .PP
 \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states.  These numbers are from hardware residency counters.
 .PP
@@ -370,7 +382,7 @@ below the processor's base frequency.
 
 Busy% = MPERF_delta/TSC_delta
 
-Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval
+Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval
 
 Note that these calculations depend on TSC_delta, so they
 are not reliable during intervals when TSC_MHz is not running at the base frequency.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 7a334377f92b..98256468e248 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -3,7 +3,7 @@
  * turbostat -- show CPU frequency and C-state residency
  * on modern Intel and AMD processors.
  *
- * Copyright (c) 2023 Intel Corporation.
+ * Copyright (c) 2024 Intel Corporation.
  * Len Brown <len.brown@intel.com>
  */
 
@@ -36,6 +36,8 @@
 #include <linux/perf_event.h>
 #include <asm/unistd.h>
 #include <stdbool.h>
+#include <assert.h>
+#include <linux/kernel.h>
 
 #define UNUSED(x) (void)(x)
 
@@ -53,9 +55,13 @@
 #define	NAME_BYTES 20
 #define PATH_BYTES 128
 
+#define MAX_NOFILE 0x8000
+
 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
 enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC };
 enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT };
+enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR };
+enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR };
 
 struct msr_counter {
 	unsigned int msr_num;
@@ -127,6 +133,9 @@ struct msr_counter bic[] = {
 	{ 0x0, "IPC", "", 0, 0, 0, NULL, 0 },
 	{ 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 },
 	{ 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 },
+	{ 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 },
+	{ 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 },
+	{ 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 },
 };
 
 #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
@@ -185,11 +194,14 @@ struct msr_counter bic[] = {
 #define	BIC_IPC		(1ULL << 52)
 #define	BIC_CORE_THROT_CNT	(1ULL << 53)
 #define	BIC_UNCORE_MHZ		(1ULL << 54)
+#define	BIC_SAM_mc6		(1ULL << 55)
+#define	BIC_SAMMHz		(1ULL << 56)
+#define	BIC_SAMACTMHz		(1ULL << 57)
 
 #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )
 #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
-#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ)
-#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX)
+#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
+#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6)
 #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
 
 #define BIC_DISABLED_BY_DEFAULT	(BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)
@@ -204,10 +216,13 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC
 #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT)
 #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT)
 
+struct amperf_group_fd;
+
 char *proc_stat = "/proc/stat";
 FILE *outf;
 int *fd_percpu;
 int *fd_instr_count_percpu;
+struct amperf_group_fd *fd_amperf_percpu;	/* File descriptors for perf group with APERF and MPERF counters. */
 struct timeval interval_tv = { 5, 0 };
 struct timespec interval_ts = { 5, 0 };
 
@@ -242,11 +257,8 @@ char *output_buffer, *outp;
 unsigned int do_dts;
 unsigned int do_ptm;
 unsigned int do_ipc;
-unsigned long long gfx_cur_rc6_ms;
 unsigned long long cpuidle_cur_cpu_lpi_us;
 unsigned long long cpuidle_cur_sys_lpi_us;
-unsigned int gfx_cur_mhz;
-unsigned int gfx_act_mhz;
 unsigned int tj_max;
 unsigned int tj_max_override;
 double rapl_power_units, rapl_time_units;
@@ -263,6 +275,28 @@ unsigned int has_hwp_epp;	/* IA32_HWP_REQUEST[bits 31:24] */
 unsigned int has_hwp_pkg;	/* IA32_HWP_REQUEST_PKG */
 unsigned int first_counter_read = 1;
 int ignore_stdin;
+bool no_msr;
+bool no_perf;
+enum amperf_source amperf_source;
+
+enum gfx_sysfs_idx {
+	GFX_rc6,
+	GFX_MHz,
+	GFX_ACTMHz,
+	SAM_mc6,
+	SAM_MHz,
+	SAM_ACTMHz,
+	GFX_MAX
+};
+
+struct gfx_sysfs_info {
+	const char *path;
+	FILE *fp;
+	unsigned int val;
+	unsigned long long val_ull;
+};
+
+static struct gfx_sysfs_info gfx_info[GFX_MAX];
 
 int get_msr(int cpu, off_t offset, unsigned long long *msr);
 
@@ -652,6 +686,7 @@ static const struct platform_features icx_features = {
 	.bclk_freq = BCLK_100MHZ,
 	.supported_cstates = CC1 | CC6 | PC2 | PC6,
 	.cst_limit = CST_LIMIT_ICX,
+	.has_msr_core_c1_res = 1,
 	.has_irtl_msrs = 1,
 	.has_cst_prewake_bit = 1,
 	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
@@ -948,6 +983,175 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi
 #define MAX_ADDED_THREAD_COUNTERS 24
 #define BITMASK_SIZE 32
 
+/* Indexes used to map data read from perf and MSRs into global variables */
+enum rapl_rci_index {
+	RAPL_RCI_INDEX_ENERGY_PKG = 0,
+	RAPL_RCI_INDEX_ENERGY_CORES = 1,
+	RAPL_RCI_INDEX_DRAM = 2,
+	RAPL_RCI_INDEX_GFX = 3,
+	RAPL_RCI_INDEX_PKG_PERF_STATUS = 4,
+	RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5,
+	RAPL_RCI_INDEX_CORE_ENERGY = 6,
+	NUM_RAPL_COUNTERS,
+};
+
+enum rapl_unit {
+	RAPL_UNIT_INVALID,
+	RAPL_UNIT_JOULES,
+	RAPL_UNIT_WATTS,
+};
+
+struct rapl_counter_info_t {
+	unsigned long long data[NUM_RAPL_COUNTERS];
+	enum rapl_source source[NUM_RAPL_COUNTERS];
+	unsigned long long flags[NUM_RAPL_COUNTERS];
+	double scale[NUM_RAPL_COUNTERS];
+	enum rapl_unit unit[NUM_RAPL_COUNTERS];
+
+	union {
+		/* Active when source == RAPL_SOURCE_MSR */
+		struct {
+			unsigned long long msr[NUM_RAPL_COUNTERS];
+			unsigned long long msr_mask[NUM_RAPL_COUNTERS];
+			int msr_shift[NUM_RAPL_COUNTERS];
+		};
+	};
+
+	int fd_perf;
+};
+
+/* struct rapl_counter_info_t for each RAPL domain */
+struct rapl_counter_info_t *rapl_counter_info_perdomain;
+
+#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1)
+
+struct rapl_counter_arch_info {
+	int feature_mask;	/* Mask for testing if the counter is supported on host */
+	const char *perf_subsys;
+	const char *perf_name;
+	unsigned long long msr;
+	unsigned long long msr_mask;
+	int msr_shift;		/* Positive mean shift right, negative mean shift left */
+	double *platform_rapl_msr_scale;	/* Scale applied to values read by MSR (platform dependent, filled at runtime) */
+	unsigned int rci_index;	/* Maps data from perf counters to global variables */
+	unsigned long long bic;
+	double compat_scale;	/* Some counters require constant scaling to be in the same range as other, similar ones */
+	unsigned long long flags;
+};
+
+static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
+	{
+	 .feature_mask = RAPL_PKG,
+	 .perf_subsys = "power",
+	 .perf_name = "energy-pkg",
+	 .msr = MSR_PKG_ENERGY_STATUS,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_energy_units,
+	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
+	 .bic = BIC_PkgWatt | BIC_Pkg_J,
+	 .compat_scale = 1.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_AMD_F17H,
+	 .perf_subsys = "power",
+	 .perf_name = "energy-pkg",
+	 .msr = MSR_PKG_ENERGY_STAT,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_energy_units,
+	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
+	 .bic = BIC_PkgWatt | BIC_Pkg_J,
+	 .compat_scale = 1.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_CORE_ENERGY_STATUS,
+	 .perf_subsys = "power",
+	 .perf_name = "energy-cores",
+	 .msr = MSR_PP0_ENERGY_STATUS,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_energy_units,
+	 .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
+	 .bic = BIC_CorWatt | BIC_Cor_J,
+	 .compat_scale = 1.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_DRAM,
+	 .perf_subsys = "power",
+	 .perf_name = "energy-ram",
+	 .msr = MSR_DRAM_ENERGY_STATUS,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_dram_energy_units,
+	 .rci_index = RAPL_RCI_INDEX_DRAM,
+	 .bic = BIC_RAMWatt | BIC_RAM_J,
+	 .compat_scale = 1.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_GFX,
+	 .perf_subsys = "power",
+	 .perf_name = "energy-gpu",
+	 .msr = MSR_PP1_ENERGY_STATUS,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_energy_units,
+	 .rci_index = RAPL_RCI_INDEX_GFX,
+	 .bic = BIC_GFXWatt | BIC_GFX_J,
+	 .compat_scale = 1.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_PKG_PERF_STATUS,
+	 .perf_subsys = NULL,
+	 .perf_name = NULL,
+	 .msr = MSR_PKG_PERF_STATUS,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_time_units,
+	 .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS,
+	 .bic = BIC_PKG__,
+	 .compat_scale = 100.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_DRAM_PERF_STATUS,
+	 .perf_subsys = NULL,
+	 .perf_name = NULL,
+	 .msr = MSR_DRAM_PERF_STATUS,
+	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_time_units,
+	 .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS,
+	 .bic = BIC_RAM__,
+	 .compat_scale = 100.0,
+	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
+	  },
+	{
+	 .feature_mask = RAPL_AMD_F17H,
+	 .perf_subsys = NULL,
+	 .perf_name = NULL,
+	 .msr = MSR_CORE_ENERGY_STAT,
+	 .msr_mask = 0xFFFFFFFF,
+	 .msr_shift = 0,
+	 .platform_rapl_msr_scale = &rapl_energy_units,
+	 .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
+	 .bic = BIC_CorWatt | BIC_Cor_J,
+	 .compat_scale = 1.0,
+	 .flags = 0,
+	  },
+};
+
+struct rapl_counter {
+	unsigned long long raw_value;
+	enum rapl_unit unit;
+	double scale;
+};
+
 struct thread_data {
 	struct timeval tv_begin;
 	struct timeval tv_end;
@@ -974,7 +1178,7 @@ struct core_data {
 	unsigned long long c7;
 	unsigned long long mc6_us;	/* duplicate as per-core for now, even though per module */
 	unsigned int core_temp_c;
-	unsigned int core_energy;	/* MSR_CORE_ENERGY_STAT */
+	struct rapl_counter core_energy;	/* MSR_CORE_ENERGY_STAT */
 	unsigned int core_id;
 	unsigned long long core_throt_cnt;
 	unsigned long long counter[MAX_ADDED_COUNTERS];
@@ -989,8 +1193,8 @@ struct pkg_data {
 	unsigned long long pc8;
 	unsigned long long pc9;
 	unsigned long long pc10;
-	unsigned long long cpu_lpi;
-	unsigned long long sys_lpi;
+	long long cpu_lpi;
+	long long sys_lpi;
 	unsigned long long pkg_wtd_core_c0;
 	unsigned long long pkg_any_core_c0;
 	unsigned long long pkg_any_gfxe_c0;
@@ -998,13 +1202,16 @@ struct pkg_data {
 	long long gfx_rc6_ms;
 	unsigned int gfx_mhz;
 	unsigned int gfx_act_mhz;
+	long long sam_mc6_ms;
+	unsigned int sam_mhz;
+	unsigned int sam_act_mhz;
 	unsigned int package_id;
-	unsigned long long energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
-	unsigned long long energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
-	unsigned long long energy_cores;	/* MSR_PP0_ENERGY_STATUS */
-	unsigned long long energy_gfx;	/* MSR_PP1_ENERGY_STATUS */
-	unsigned long long rapl_pkg_perf_status;	/* MSR_PKG_PERF_STATUS */
-	unsigned long long rapl_dram_perf_status;	/* MSR_DRAM_PERF_STATUS */
+	struct rapl_counter energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
+	struct rapl_counter energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
+	struct rapl_counter energy_cores;	/* MSR_PP0_ENERGY_STATUS */
+	struct rapl_counter energy_gfx;	/* MSR_PP1_ENERGY_STATUS */
+	struct rapl_counter rapl_pkg_perf_status;	/* MSR_PKG_PERF_STATUS */
+	struct rapl_counter rapl_dram_perf_status;	/* MSR_DRAM_PERF_STATUS */
 	unsigned int pkg_temp_c;
 	unsigned int uncore_mhz;
 	unsigned long long counter[MAX_ADDED_COUNTERS];
@@ -1150,6 +1357,38 @@ struct sys_counters {
 	struct msr_counter *pp;
 } sys;
 
+void free_sys_counters(void)
+{
+	struct msr_counter *p = sys.tp, *pnext = NULL;
+
+	while (p) {
+		pnext = p->next;
+		free(p);
+		p = pnext;
+	}
+
+	p = sys.cp, pnext = NULL;
+	while (p) {
+		pnext = p->next;
+		free(p);
+		p = pnext;
+	}
+
+	p = sys.pp, pnext = NULL;
+	while (p) {
+		pnext = p->next;
+		free(p);
+		p = pnext;
+	}
+
+	sys.added_thread_counters = 0;
+	sys.added_core_counters = 0;
+	sys.added_package_counters = 0;
+	sys.tp = NULL;
+	sys.cp = NULL;
+	sys.pp = NULL;
+}
+
 struct system_summary {
 	struct thread_data threads;
 	struct core_data cores;
@@ -1280,34 +1519,60 @@ int get_msr_fd(int cpu)
 	sprintf(pathname, "/dev/cpu/%d/msr", cpu);
 	fd = open(pathname, O_RDONLY);
 	if (fd < 0)
-		err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname);
+		err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
+		    "or run with --no-msr, or run as root", pathname);
 
 	fd_percpu[cpu] = fd;
 
 	return fd;
 }
 
+static void bic_disable_msr_access(void)
+{
+	const unsigned long bic_msrs =
+	    BIC_SMI |
+	    BIC_CPU_c1 |
+	    BIC_CPU_c3 |
+	    BIC_CPU_c6 |
+	    BIC_CPU_c7 |
+	    BIC_Mod_c6 |
+	    BIC_CoreTmp |
+	    BIC_Totl_c0 |
+	    BIC_Any_c0 |
+	    BIC_GFX_c0 |
+	    BIC_CPUGFX |
+	    BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp;
+
+	bic_enabled &= ~bic_msrs;
+
+	free_sys_counters();
+}
+
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
 {
+	assert(!no_perf);
+
 	return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
 }
 
-static int perf_instr_count_open(int cpu_num)
+static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format)
 {
-	struct perf_event_attr pea;
-	int fd;
+	struct perf_event_attr attr;
+	const pid_t pid = -1;
+	const unsigned long flags = 0;
 
-	memset(&pea, 0, sizeof(struct perf_event_attr));
-	pea.type = PERF_TYPE_HARDWARE;
-	pea.size = sizeof(struct perf_event_attr);
-	pea.config = PERF_COUNT_HW_INSTRUCTIONS;
+	assert(!no_perf);
 
-	/* counter for cpu_num, including user + kernel and all processes */
-	fd = perf_event_open(&pea, -1, cpu_num, -1, 0);
-	if (fd == -1) {
-		warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname);
-		BIC_NOT_PRESENT(BIC_IPC);
-	}
+	memset(&attr, 0, sizeof(struct perf_event_attr));
+
+	attr.type = type;
+	attr.size = sizeof(struct perf_event_attr);
+	attr.config = config;
+	attr.disabled = 0;
+	attr.sample_type = PERF_SAMPLE_IDENTIFIER;
+	attr.read_format = read_format;
+
+	const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
 
 	return fd;
 }
@@ -1317,7 +1582,7 @@ int get_instr_count_fd(int cpu)
 	if (fd_instr_count_percpu[cpu])
 		return fd_instr_count_percpu[cpu];
 
-	fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu);
+	fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
 
 	return fd_instr_count_percpu[cpu];
 }
@@ -1326,6 +1591,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
 {
 	ssize_t retval;
 
+	assert(!no_msr);
+
 	retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
 
 	if (retval != sizeof *msr)
@@ -1334,6 +1601,21 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr)
 	return 0;
 }
 
+int probe_msr(int cpu, off_t offset)
+{
+	ssize_t retval;
+	unsigned long long dummy;
+
+	assert(!no_msr);
+
+	retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset);
+
+	if (retval != sizeof(dummy))
+		return 1;
+
+	return 0;
+}
+
 #define MAX_DEFERRED 16
 char *deferred_add_names[MAX_DEFERRED];
 char *deferred_skip_names[MAX_DEFERRED];
@@ -1369,6 +1651,8 @@ void help(void)
 		"		Override default 5-second measurement interval\n"
 		"  -J, --Joules	displays energy in Joules instead of Watts\n"
 		"  -l, --list	list column headers only\n"
+		"  -M, --no-msr Disable all uses of the MSR driver\n"
+		"  -P, --no-perf Disable all uses of the perf API\n"
 		"  -n, --num_iterations num\n"
 		"		number of the measurement iterations\n"
 		"  -N, --header_iterations num\n"
@@ -1573,6 +1857,15 @@ void print_header(char *delim)
 	if (DO_BIC(BIC_GFXACTMHz))
 		outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));
 
+	if (DO_BIC(BIC_SAM_mc6))
+		outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : ""));
+
+	if (DO_BIC(BIC_SAMMHz))
+		outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : ""));
+
+	if (DO_BIC(BIC_SAMACTMHz))
+		outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : ""));
+
 	if (DO_BIC(BIC_Totl_c0))
 		outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
 	if (DO_BIC(BIC_Any_c0))
@@ -1671,26 +1964,35 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p
 			outp += sprintf(outp, "SMI: %d\n", t->smi_count);
 
 		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
-			outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]);
+			outp +=
+			    sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
+				    t->counter[i], mp->path);
 		}
 	}
 
-	if (c) {
+	if (c && is_cpu_first_thread_in_core(t, c, p)) {
 		outp += sprintf(outp, "core: %d\n", c->core_id);
 		outp += sprintf(outp, "c3: %016llX\n", c->c3);
 		outp += sprintf(outp, "c6: %016llX\n", c->c6);
 		outp += sprintf(outp, "c7: %016llX\n", c->c7);
 		outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
 		outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);
-		outp += sprintf(outp, "Joules: %0X\n", c->core_energy);
+
+		const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale;
+		const double energy_scale = c->core_energy.scale;
+
+		if (c->core_energy.unit == RAPL_UNIT_JOULES)
+			outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale);
 
 		for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
-			outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]);
+			outp +=
+			    sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
+				    c->counter[i], mp->path);
 		}
 		outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
 	}
 
-	if (p) {
+	if (p && is_cpu_first_core_in_package(t, c, p)) {
 		outp += sprintf(outp, "package: %d\n", p->package_id);
 
 		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
@@ -1710,16 +2012,18 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p
 		outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
 		outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
 		outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
-		outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg);
-		outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores);
-		outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx);
-		outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram);
-		outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status);
-		outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status);
+		outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value);
+		outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value);
+		outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value);
+		outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value);
+		outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value);
+		outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value);
 		outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);
 
 		for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
-			outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]);
+			outp +=
+			    sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
+				    p->counter[i], mp->path);
 		}
 	}
 
@@ -1728,6 +2032,23 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p
 	return 0;
 }
 
+double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval)
+{
+	assert(desired_unit != RAPL_UNIT_INVALID);
+
+	/*
+	 * For now we don't expect anything other than joules,
+	 * so just simplify the logic.
+	 */
+	assert(c->unit == RAPL_UNIT_JOULES);
+
+	const double scaled = c->raw_value * c->scale;
+
+	if (desired_unit == RAPL_UNIT_WATTS)
+		return scaled / interval;
+	return scaled;
+}
+
 /*
  * column formatting convention & formats
  */
@@ -1921,9 +2242,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 
 	if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
 		outp +=
-		    sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float);
+		    sprintf(outp, fmt8, (printed++ ? delim : ""),
+			    rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units);
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
+				rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float));
 
 	/* print per-package data only for 1st core in package */
 	if (!is_cpu_first_core_in_package(t, c, p))
@@ -1951,6 +2274,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	if (DO_BIC(BIC_GFXACTMHz))
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);
 
+	/* SAMmc6 */
+	if (DO_BIC(BIC_SAM_mc6)) {
+		if (p->sam_mc6_ms == -1) {	/* detect GFX counter reset */
+			outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
+		} else {
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
+					p->sam_mc6_ms / 10.0 / interval_float);
+		}
+	}
+
+	/* SAMMHz */
+	if (DO_BIC(BIC_SAMMHz))
+		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz);
+
+	/* SAMACTMHz */
+	if (DO_BIC(BIC_SAMACTMHz))
+		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz);
+
 	/* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
 	if (DO_BIC(BIC_Totl_c0))
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
@@ -1976,43 +2317,59 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	if (DO_BIC(BIC_Pkgpc10))
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
 
-	if (DO_BIC(BIC_CPU_LPI))
-		outp +=
-		    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float);
-	if (DO_BIC(BIC_SYS_LPI))
-		outp +=
-		    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float);
+	if (DO_BIC(BIC_CPU_LPI)) {
+		if (p->cpu_lpi >= 0)
+			outp +=
+			    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
+				    100.0 * p->cpu_lpi / 1000000.0 / interval_float);
+		else
+			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
+	}
+	if (DO_BIC(BIC_SYS_LPI)) {
+		if (p->sys_lpi >= 0)
+			outp +=
+			    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
+				    100.0 * p->sys_lpi / 1000000.0 / interval_float);
+		else
+			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
+	}
 
 	if (DO_BIC(BIC_PkgWatt))
 		outp +=
-		    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float);
-
+		    sprintf(outp, fmt8, (printed++ ? delim : ""),
+			    rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
 		outp +=
-		    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float);
+		    sprintf(outp, fmt8, (printed++ ? delim : ""),
+			    rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_GFXWatt))
 		outp +=
-		    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float);
+		    sprintf(outp, fmt8, (printed++ ? delim : ""),
+			    rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_RAMWatt))
 		outp +=
 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
-			    p->energy_dram * rapl_dram_energy_units / interval_float);
+			    rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_Pkg_J))
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units);
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
+				rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float));
 	if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units);
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
+				rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float));
 	if (DO_BIC(BIC_GFX_J))
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units);
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
+				rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float));
 	if (DO_BIC(BIC_RAM_J))
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units);
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
+				rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float));
 	if (DO_BIC(BIC_PKG__))
 		outp +=
 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
-			    100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float);
+			    rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_RAM__))
 		outp +=
 		    sprintf(outp, fmt8, (printed++ ? delim : ""),
-			    100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float);
+			    rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
 	/* UncMHz */
 	if (DO_BIC(BIC_UNCORE_MHZ))
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);
@@ -2121,12 +2478,22 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
 	old->gfx_mhz = new->gfx_mhz;
 	old->gfx_act_mhz = new->gfx_act_mhz;
 
-	old->energy_pkg = new->energy_pkg - old->energy_pkg;
-	old->energy_cores = new->energy_cores - old->energy_cores;
-	old->energy_gfx = new->energy_gfx - old->energy_gfx;
-	old->energy_dram = new->energy_dram - old->energy_dram;
-	old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status;
-	old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status;
+	/* flag an error when mc6 counter resets/wraps */
+	if (old->sam_mc6_ms > new->sam_mc6_ms)
+		old->sam_mc6_ms = -1;
+	else
+		old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms;
+
+	old->sam_mhz = new->sam_mhz;
+	old->sam_act_mhz = new->sam_act_mhz;
+
+	old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value;
+	old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value;
+	old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value;
+	old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value;
+	old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value;
+	old->rapl_dram_perf_status.raw_value =
+	    new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value;
 
 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
@@ -2150,7 +2517,7 @@ void delta_core(struct core_data *new, struct core_data *old)
 	old->core_throt_cnt = new->core_throt_cnt;
 	old->mc6_us = new->mc6_us - old->mc6_us;
 
-	DELTA_WRAP32(new->core_energy, old->core_energy);
+	DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value);
 
 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
@@ -2277,6 +2644,13 @@ int delta_cpu(struct thread_data *t, struct core_data *c,
 	return retval;
 }
 
+void rapl_counter_clear(struct rapl_counter *c)
+{
+	c->raw_value = 0;
+	c->scale = 0.0;
+	c->unit = RAPL_UNIT_INVALID;
+}
+
 void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
 	int i;
@@ -2304,7 +2678,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	c->c7 = 0;
 	c->mc6_us = 0;
 	c->core_temp_c = 0;
-	c->core_energy = 0;
+	rapl_counter_clear(&c->core_energy);
 	c->core_throt_cnt = 0;
 
 	p->pkg_wtd_core_c0 = 0;
@@ -2325,18 +2699,21 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	p->cpu_lpi = 0;
 	p->sys_lpi = 0;
 
-	p->energy_pkg = 0;
-	p->energy_dram = 0;
-	p->energy_cores = 0;
-	p->energy_gfx = 0;
-	p->rapl_pkg_perf_status = 0;
-	p->rapl_dram_perf_status = 0;
+	rapl_counter_clear(&p->energy_pkg);
+	rapl_counter_clear(&p->energy_dram);
+	rapl_counter_clear(&p->energy_cores);
+	rapl_counter_clear(&p->energy_gfx);
+	rapl_counter_clear(&p->rapl_pkg_perf_status);
+	rapl_counter_clear(&p->rapl_dram_perf_status);
 	p->pkg_temp_c = 0;
 
 	p->gfx_rc6_ms = 0;
 	p->uncore_mhz = 0;
 	p->gfx_mhz = 0;
 	p->gfx_act_mhz = 0;
+	p->sam_mc6_ms = 0;
+	p->sam_mhz = 0;
+	p->sam_act_mhz = 0;
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
 		t->counter[i] = 0;
 
@@ -2347,6 +2724,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 		p->counter[i] = 0;
 }
 
+void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
+{
+	/* Copy unit and scale from src if dst is not initialized */
+	if (dst->unit == RAPL_UNIT_INVALID) {
+		dst->unit = src->unit;
+		dst->scale = src->scale;
+	}
+
+	assert(dst->unit == src->unit);
+	assert(dst->scale == src->scale);
+
+	dst->raw_value += src->raw_value;
+}
+
 int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
 	int i;
@@ -2393,7 +2784,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
 	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);
 
-	average.cores.core_energy += c->core_energy;
+	rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy);
 
 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
@@ -2428,25 +2819,29 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	average.packages.cpu_lpi = p->cpu_lpi;
 	average.packages.sys_lpi = p->sys_lpi;
 
-	average.packages.energy_pkg += p->energy_pkg;
-	average.packages.energy_dram += p->energy_dram;
-	average.packages.energy_cores += p->energy_cores;
-	average.packages.energy_gfx += p->energy_gfx;
+	rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg);
+	rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram);
+	rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores);
+	rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx);
 
 	average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
 	average.packages.uncore_mhz = p->uncore_mhz;
 	average.packages.gfx_mhz = p->gfx_mhz;
 	average.packages.gfx_act_mhz = p->gfx_act_mhz;
+	average.packages.sam_mc6_ms = p->sam_mc6_ms;
+	average.packages.sam_mhz = p->sam_mhz;
+	average.packages.sam_act_mhz = p->sam_act_mhz;
 
 	average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);
 
-	average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status;
-	average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status;
+	rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
+	rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status);
 
 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
-		if (mp->format == FORMAT_RAW)
-			continue;
-		average.packages.counter[i] += p->counter[i];
+		if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0))
+			average.packages.counter[i] = p->counter[i];
+		else
+			average.packages.counter[i] += p->counter[i];
 	}
 	return 0;
 }
@@ -2578,6 +2973,7 @@ unsigned long long snapshot_sysfs_counter(char *path)
 int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp)
 {
 	if (mp->msr_num != 0) {
+		assert(!no_msr);
 		if (get_msr(cpu, mp->msr_num, counterp))
 			return -1;
 	} else {
@@ -2599,7 +2995,7 @@ unsigned long long get_uncore_mhz(int package, int die)
 {
 	char path[128];
 
-	sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package,
+	sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package,
 		die);
 
 	return (snapshot_sysfs_counter(path) / 1000);
@@ -2627,6 +3023,9 @@ int get_epb(int cpu)
 	return epb;
 
 msr_fallback:
+	if (no_msr)
+		return -1;
+
 	get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
 
 	return msr & 0xf;
@@ -2700,6 +3099,351 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt)
 	return 0;
 }
 
+struct amperf_group_fd {
+	int aperf;		/* Also the group descriptor */
+	int mperf;
+};
+
+static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr)
+{
+	int fdmt;
+	int bytes_read;
+	char buf[64];
+	int ret = -1;
+
+	fdmt = open(path, O_RDONLY, 0);
+	if (fdmt == -1) {
+		if (debug)
+			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
+		ret = -1;
+		goto cleanup_and_exit;
+	}
+
+	bytes_read = read(fdmt, buf, sizeof(buf) - 1);
+	if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) {
+		if (debug)
+			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
+		ret = -1;
+		goto cleanup_and_exit;
+	}
+
+	buf[bytes_read] = '\0';
+
+	if (sscanf(buf, parse_format, value_ptr) != 1) {
+		if (debug)
+			fprintf(stderr, "Failed to parse perf counter info %s\n", path);
+		ret = -1;
+		goto cleanup_and_exit;
+	}
+
+	ret = 0;
+
+cleanup_and_exit:
+	close(fdmt);
+	return ret;
+}
+
+static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format)
+{
+	unsigned int v;
+	int status;
+
+	status = read_perf_counter_info(path, parse_format, &v);
+	if (status)
+		v = -1;
+
+	return v;
+}
+
+static unsigned int read_msr_type(void)
+{
+	const char *const path = "/sys/bus/event_source/devices/msr/type";
+	const char *const format = "%u";
+
+	return read_perf_counter_info_n(path, format);
+}
+
+static unsigned int read_aperf_config(void)
+{
+	const char *const path = "/sys/bus/event_source/devices/msr/events/aperf";
+	const char *const format = "event=%x";
+
+	return read_perf_counter_info_n(path, format);
+}
+
+static unsigned int read_mperf_config(void)
+{
+	const char *const path = "/sys/bus/event_source/devices/msr/events/mperf";
+	const char *const format = "event=%x";
+
+	return read_perf_counter_info_n(path, format);
+}
+
+static unsigned int read_perf_type(const char *subsys)
+{
+	const char *const path_format = "/sys/bus/event_source/devices/%s/type";
+	const char *const format = "%u";
+	char path[128];
+
+	snprintf(path, sizeof(path), path_format, subsys);
+
+	return read_perf_counter_info_n(path, format);
+}
+
+static unsigned int read_rapl_config(const char *subsys, const char *event_name)
+{
+	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s";
+	const char *const format = "event=%x";
+	char path[128];
+
+	snprintf(path, sizeof(path), path_format, subsys, event_name);
+
+	return read_perf_counter_info_n(path, format);
+}
+
+static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
+{
+	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit";
+	const char *const format = "%s";
+	char path[128];
+	char unit_buffer[16];
+
+	snprintf(path, sizeof(path), path_format, subsys, event_name);
+
+	read_perf_counter_info(path, format, &unit_buffer);
+	if (strcmp("Joules", unit_buffer) == 0)
+		return RAPL_UNIT_JOULES;
+
+	return RAPL_UNIT_INVALID;
+}
+
+static double read_perf_rapl_scale(const char *subsys, const char *event_name)
+{
+	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale";
+	const char *const format = "%lf";
+	char path[128];
+	double scale;
+
+	snprintf(path, sizeof(path), path_format, subsys, event_name);
+
+	if (read_perf_counter_info(path, format, &scale))
+		return 0.0;
+
+	return scale;
+}
+
+static struct amperf_group_fd open_amperf_fd(int cpu)
+{
+	const unsigned int msr_type = read_msr_type();
+	const unsigned int aperf_config = read_aperf_config();
+	const unsigned int mperf_config = read_mperf_config();
+	struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 };
+
+	fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
+	fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP);
+
+	return fds;
+}
+
+static int get_amperf_fd(int cpu)
+{
+	assert(fd_amperf_percpu);
+
+	if (fd_amperf_percpu[cpu].aperf)
+		return fd_amperf_percpu[cpu].aperf;
+
+	fd_amperf_percpu[cpu] = open_amperf_fd(cpu);
+
+	return fd_amperf_percpu[cpu].aperf;
+}
+
+/* Read APERF, MPERF and TSC using the perf API. */
+static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
+{
+	union {
+		struct {
+			unsigned long nr_entries;
+			unsigned long aperf;
+			unsigned long mperf;
+		};
+
+		unsigned long as_array[3];
+	} cnt;
+
+	const int fd_amperf = get_amperf_fd(cpu);
+
+	/*
+	 * Read the TSC with rdtsc, because we want the absolute value and not
+	 * the offset from the start of the counter.
+	 */
+	t->tsc = rdtsc();
+
+	const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array));
+
+	if (n != sizeof(cnt.as_array))
+		return -2;
+
+	t->aperf = cnt.aperf * aperf_mperf_multiplier;
+	t->mperf = cnt.mperf * aperf_mperf_multiplier;
+
+	return 0;
+}
+
+/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */
+static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
+{
+	unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
+	int aperf_mperf_retry_count = 0;
+
+	/*
+	 * The TSC, APERF and MPERF must be read together for
+	 * APERF/MPERF and MPERF/TSC to give accurate results.
+	 *
+	 * Unfortunately, APERF and MPERF are read by
+	 * individual system call, so delays may occur
+	 * between them.  If the time to read them
+	 * varies by a large amount, we re-read them.
+	 */
+
+	/*
+	 * This initial dummy APERF read has been seen to
+	 * reduce jitter in the subsequent reads.
+	 */
+
+	if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
+		return -3;
+
+retry:
+	t->tsc = rdtsc();	/* re-read close to APERF */
+
+	tsc_before = t->tsc;
+
+	if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
+		return -3;
+
+	tsc_between = rdtsc();
+
+	if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
+		return -4;
+
+	tsc_after = rdtsc();
+
+	aperf_time = tsc_between - tsc_before;
+	mperf_time = tsc_after - tsc_between;
+
+	/*
+	 * If the system call latency to read APERF and MPERF
+	 * differ by more than 2x, then try again.
+	 */
+	if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
+		aperf_mperf_retry_count++;
+		if (aperf_mperf_retry_count < 5)
+			goto retry;
+		else
+			warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
+	}
+	aperf_mperf_retry_count = 0;
+
+	t->aperf = t->aperf * aperf_mperf_multiplier;
+	t->mperf = t->mperf * aperf_mperf_multiplier;
+
+	return 0;
+}
+
+size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
+{
+	size_t ret = 0;
+
+	for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
+		if (rci->source[i] == RAPL_SOURCE_PERF)
+			++ret;
+
+	return ret;
+}
+
+void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx)
+{
+	rc->raw_value = rci->data[idx];
+	rc->unit = rci->unit[idx];
+	rc->scale = rci->scale[idx];
+}
+
+int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p)
+{
+	unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
+	struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain];
+
+	if (debug)
+		fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);
+
+	assert(rapl_counter_info_perdomain);
+
+	/*
+	 * If we have any perf counters to read, read them all now, in bulk
+	 */
+	if (rci->fd_perf != -1) {
+		size_t num_perf_counters = rapl_counter_info_count_perf(rci);
+		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
+		const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data));
+
+		if (actual_read_size != expected_read_size)
+			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
+			    actual_read_size);
+	}
+
+	for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
+		switch (rci->source[i]) {
+		case RAPL_SOURCE_NONE:
+			break;
+
+		case RAPL_SOURCE_PERF:
+			assert(pi < ARRAY_SIZE(perf_data));
+			assert(rci->fd_perf != -1);
+
+			if (debug)
+				fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
+					i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);
+
+			rci->data[i] = perf_data[pi];
+
+			++pi;
+			break;
+
+		case RAPL_SOURCE_MSR:
+			if (debug)
+				fprintf(stderr, "Reading rapl counter via msr at %u\n", i);
+
+			assert(!no_msr);
+			if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) {
+				if (get_msr_sum(cpu, rci->msr[i], &rci->data[i]))
+					return -13 - i;
+			} else {
+				if (get_msr(cpu, rci->msr[i], &rci->data[i]))
+					return -13 - i;
+			}
+
+			rci->data[i] &= rci->msr_mask[i];
+			if (rci->msr_shift[i] >= 0)
+				rci->data[i] >>= abs(rci->msr_shift[i]);
+			else
+				rci->data[i] <<= abs(rci->msr_shift[i]);
+
+			break;
+		}
+	}
+
+	_Static_assert(NUM_RAPL_COUNTERS == 7);
+	write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG);
+	write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES);
+	write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM);
+	write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX);
+	write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS);
+	write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS);
+	write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY);
+
+	return 0;
+}
+
 /*
  * get_counters(...)
  * migrate to cpu
@@ -2709,12 +3453,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
 	int cpu = t->cpu_id;
 	unsigned long long msr;
-	int aperf_mperf_retry_count = 0;
 	struct msr_counter *mp;
 	int i;
+	int status;
 
 	if (cpu_migrate(cpu)) {
-		fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu);
+		fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu);
 		return -1;
 	}
 
@@ -2722,63 +3466,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
 	if (first_counter_read)
 		get_apic_id(t);
-retry:
+
 	t->tsc = rdtsc();	/* we are running on local CPU of interest */
 
 	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
 	    || soft_c1_residency_display(BIC_Avg_MHz)) {
-		unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
-
-		/*
-		 * The TSC, APERF and MPERF must be read together for
-		 * APERF/MPERF and MPERF/TSC to give accurate results.
-		 *
-		 * Unfortunately, APERF and MPERF are read by
-		 * individual system call, so delays may occur
-		 * between them.  If the time to read them
-		 * varies by a large amount, we re-read them.
-		 */
-
-		/*
-		 * This initial dummy APERF read has been seen to
-		 * reduce jitter in the subsequent reads.
-		 */
-
-		if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
-			return -3;
-
-		t->tsc = rdtsc();	/* re-read close to APERF */
-
-		tsc_before = t->tsc;
+		int status = -1;
 
-		if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
-			return -3;
+		assert(!no_perf || !no_msr);
 
-		tsc_between = rdtsc();
-
-		if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
-			return -4;
-
-		tsc_after = rdtsc();
-
-		aperf_time = tsc_between - tsc_before;
-		mperf_time = tsc_after - tsc_between;
-
-		/*
-		 * If the system call latency to read APERF and MPERF
-		 * differ by more than 2x, then try again.
-		 */
-		if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
-			aperf_mperf_retry_count++;
-			if (aperf_mperf_retry_count < 5)
-				goto retry;
-			else
-				warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
+		switch (amperf_source) {
+		case AMPERF_SOURCE_PERF:
+			status = read_aperf_mperf_tsc_perf(t, cpu);
+			break;
+		case AMPERF_SOURCE_MSR:
+			status = read_aperf_mperf_tsc_msr(t, cpu);
+			break;
 		}
-		aperf_mperf_retry_count = 0;
 
-		t->aperf = t->aperf * aperf_mperf_multiplier;
-		t->mperf = t->mperf * aperf_mperf_multiplier;
+		if (status != 0)
+			return status;
 	}
 
 	if (DO_BIC(BIC_IPC))
@@ -2806,6 +3513,12 @@ retry:
 	if (!is_cpu_first_thread_in_core(t, c, p))
 		goto done;
 
+	if (platform->has_per_core_rapl) {
+		status = get_rapl_counters(cpu, c->core_id, c, p);
+		if (status != 0)
+			return status;
+	}
+
 	if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) {
 		if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3))
 			return -6;
@@ -2846,12 +3559,6 @@ retry:
 	if (DO_BIC(BIC_CORE_THROT_CNT))
 		get_core_throt_cnt(cpu, &c->core_throt_cnt);
 
-	if (platform->rapl_msrs & RAPL_AMD_F17H) {
-		if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr))
-			return -14;
-		c->core_energy = msr & 0xFFFFFFFF;
-	}
-
 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
 		if (get_mp(cpu, mp, &c->counter[i]))
 			return -10;
@@ -2911,59 +3618,39 @@ retry:
 	if (DO_BIC(BIC_SYS_LPI))
 		p->sys_lpi = cpuidle_cur_sys_lpi_us;
 
-	if (platform->rapl_msrs & RAPL_PKG) {
-		if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
-			return -13;
-		p->energy_pkg = msr;
-	}
-	if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) {
-		if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
-			return -14;
-		p->energy_cores = msr;
-	}
-	if (platform->rapl_msrs & RAPL_DRAM) {
-		if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
-			return -15;
-		p->energy_dram = msr;
-	}
-	if (platform->rapl_msrs & RAPL_GFX) {
-		if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
-			return -16;
-		p->energy_gfx = msr;
-	}
-	if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) {
-		if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
-			return -16;
-		p->rapl_pkg_perf_status = msr;
-	}
-	if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) {
-		if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
-			return -16;
-		p->rapl_dram_perf_status = msr;
-	}
-	if (platform->rapl_msrs & RAPL_AMD_F17H) {
-		if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
-			return -13;
-		p->energy_pkg = msr;
+	if (!platform->has_per_core_rapl) {
+		status = get_rapl_counters(cpu, p->package_id, c, p);
+		if (status != 0)
+			return status;
 	}
+
 	if (DO_BIC(BIC_PkgTmp)) {
 		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
 			return -17;
 		p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
 	}
 
-	if (DO_BIC(BIC_GFX_rc6))
-		p->gfx_rc6_ms = gfx_cur_rc6_ms;
-
 	/* n.b. assume die0 uncore frequency applies to whole package */
 	if (DO_BIC(BIC_UNCORE_MHZ))
 		p->uncore_mhz = get_uncore_mhz(p->package_id, 0);
 
+	if (DO_BIC(BIC_GFX_rc6))
+		p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull;
+
 	if (DO_BIC(BIC_GFXMHz))
-		p->gfx_mhz = gfx_cur_mhz;
+		p->gfx_mhz = gfx_info[GFX_MHz].val;
 
 	if (DO_BIC(BIC_GFXACTMHz))
-		p->gfx_act_mhz = gfx_act_mhz;
+		p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val;
+
+	if (DO_BIC(BIC_SAM_mc6))
+		p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull;
+
+	if (DO_BIC(BIC_SAMMHz))
+		p->sam_mhz = gfx_info[SAM_MHz].val;
+
+	if (DO_BIC(BIC_SAMACTMHz))
+		p->sam_act_mhz = gfx_info[SAM_ACTMHz].val;
 
 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
 		if (get_mp(cpu, mp, &p->counter[i]))
@@ -3053,7 +3740,7 @@ void probe_cst_limit(void)
 	unsigned long long msr;
 	int *pkg_cstate_limits;
 
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
 	switch (platform->cst_limit) {
@@ -3097,7 +3784,7 @@ static void dump_platform_info(void)
 	unsigned long long msr;
 	unsigned int ratio;
 
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
 	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
@@ -3115,7 +3802,7 @@ static void dump_power_ctl(void)
 {
 	unsigned long long msr;
 
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
 	get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
@@ -3321,7 +4008,7 @@ static void dump_cst_cfg(void)
 {
 	unsigned long long msr;
 
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
 	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
@@ -3393,7 +4080,7 @@ void print_irtl(void)
 {
 	unsigned long long msr;
 
-	if (!platform->has_irtl_msrs)
+	if (!platform->has_irtl_msrs || no_msr)
 		return;
 
 	if (platform->supported_cstates & PC3) {
@@ -3443,12 +4130,64 @@ void free_fd_percpu(void)
 {
 	int i;
 
+	if (!fd_percpu)
+		return;
+
 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
 		if (fd_percpu[i] != 0)
 			close(fd_percpu[i]);
 	}
 
 	free(fd_percpu);
+	fd_percpu = NULL;
+}
+
+void free_fd_amperf_percpu(void)
+{
+	int i;
+
+	if (!fd_amperf_percpu)
+		return;
+
+	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
+		if (fd_amperf_percpu[i].mperf != 0)
+			close(fd_amperf_percpu[i].mperf);
+
+		if (fd_amperf_percpu[i].aperf != 0)
+			close(fd_amperf_percpu[i].aperf);
+	}
+
+	free(fd_amperf_percpu);
+	fd_amperf_percpu = NULL;
+}
+
+void free_fd_instr_count_percpu(void)
+{
+	if (!fd_instr_count_percpu)
+		return;
+
+	for (int i = 0; i < topo.max_cpu_num + 1; ++i) {
+		if (fd_instr_count_percpu[i] != 0)
+			close(fd_instr_count_percpu[i]);
+	}
+
+	free(fd_instr_count_percpu);
+	fd_instr_count_percpu = NULL;
+}
+
+void free_fd_rapl_percpu(void)
+{
+	if (!rapl_counter_info_perdomain)
+		return;
+
+	const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages;
+
+	for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
+		if (rapl_counter_info_perdomain[domain_id].fd_perf != -1)
+			close(rapl_counter_info_perdomain[domain_id].fd_perf);
+	}
+
+	free(rapl_counter_info_perdomain);
 }
 
 void free_all_buffers(void)
@@ -3492,6 +4231,9 @@ void free_all_buffers(void)
 	outp = NULL;
 
 	free_fd_percpu();
+	free_fd_instr_count_percpu();
+	free_fd_amperf_percpu();
+	free_fd_rapl_percpu();
 
 	free(irq_column_2_cpu);
 	free(irqs_per_cpu);
@@ -3825,11 +4567,17 @@ static void update_effective_set(bool startup)
 		err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
 }
 
+void linux_perf_init(void);
+void rapl_perf_init(void);
+
 void re_initialize(void)
 {
 	free_all_buffers();
 	setup_all_buffers(false);
-	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus);
+	linux_perf_init();
+	rapl_perf_init();
+	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
+		topo.allowed_cpus);
 }
 
 void set_max_cpu_num(void)
@@ -3940,85 +4688,43 @@ int snapshot_proc_interrupts(void)
 }
 
 /*
- * snapshot_gfx_rc6_ms()
+ * snapshot_graphics()
  *
- * record snapshot of
- * /sys/class/drm/card0/power/rc6_residency_ms
+ * record snapshot of specified graphics sysfs knob
  *
  * return 1 if config change requires a restart, else return 0
  */
-int snapshot_gfx_rc6_ms(void)
+int snapshot_graphics(int idx)
 {
 	FILE *fp;
 	int retval;
 
-	fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r");
-
-	retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms);
-	if (retval != 1)
-		err(1, "GFX rc6");
-
-	fclose(fp);
-
-	return 0;
-}
-
-/*
- * snapshot_gfx_mhz()
- *
- * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz
- * when /sys/class/drm/card0/gt_cur_freq_mhz is not available.
- *
- * return 1 if config change requires a restart, else return 0
- */
-int snapshot_gfx_mhz(void)
-{
-	static FILE *fp;
-	int retval;
-
-	if (fp == NULL) {
-		fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r");
-		if (!fp)
-			fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
-	} else {
-		rewind(fp);
-		fflush(fp);
-	}
-
-	retval = fscanf(fp, "%d", &gfx_cur_mhz);
-	if (retval != 1)
-		err(1, "GFX MHz");
-
-	return 0;
-}
-
-/*
- * snapshot_gfx_cur_mhz()
- *
- * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz
- * when /sys/class/drm/card0/gt_act_freq_mhz is not available.
- *
- * return 1 if config change requires a restart, else return 0
- */
-int snapshot_gfx_act_mhz(void)
-{
-	static FILE *fp;
-	int retval;
-
-	if (fp == NULL) {
-		fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r");
-		if (!fp)
-			fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r");
-	} else {
-		rewind(fp);
-		fflush(fp);
+	switch (idx) {
+	case GFX_rc6:
+	case SAM_mc6:
+		fp = fopen_or_die(gfx_info[idx].path, "r");
+		retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull);
+		if (retval != 1)
+			err(1, "rc6");
+		fclose(fp);
+		return 0;
+	case GFX_MHz:
+	case GFX_ACTMHz:
+	case SAM_MHz:
+	case SAM_ACTMHz:
+		if (gfx_info[idx].fp == NULL) {
+			gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r");
+		} else {
+			rewind(gfx_info[idx].fp);
+			fflush(gfx_info[idx].fp);
+		}
+		retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val);
+		if (retval != 1)
+			err(1, "MHz");
+		return 0;
+	default:
+		return -EINVAL;
 	}
-
-	retval = fscanf(fp, "%d", &gfx_act_mhz);
-	if (retval != 1)
-		err(1, "GFX ACT MHz");
-
-	return 0;
 }
 
 /*
@@ -4083,13 +4789,22 @@ int snapshot_proc_sysfs_files(void)
 			return 1;
 
 	if (DO_BIC(BIC_GFX_rc6))
-		snapshot_gfx_rc6_ms();
+		snapshot_graphics(GFX_rc6);
 
 	if (DO_BIC(BIC_GFXMHz))
-		snapshot_gfx_mhz();
+		snapshot_graphics(GFX_MHz);
 
 	if (DO_BIC(BIC_GFXACTMHz))
-		snapshot_gfx_act_mhz();
+		snapshot_graphics(GFX_ACTMHz);
+
+	if (DO_BIC(BIC_SAM_mc6))
+		snapshot_graphics(SAM_mc6);
+
+	if (DO_BIC(BIC_SAMMHz))
+		snapshot_graphics(SAM_MHz);
+
+	if (DO_BIC(BIC_SAMACTMHz))
+		snapshot_graphics(SAM_ACTMHz);
 
 	if (DO_BIC(BIC_CPU_LPI))
 		snapshot_cpu_lpi_us();
@@ -4173,6 +4888,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
 	int ret, idx;
 	unsigned long long msr_cur, msr_last;
 
+	assert(!no_msr);
+
 	if (!per_cpu_msr_sum)
 		return 1;
 
@@ -4201,6 +4918,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
 	UNUSED(c);
 	UNUSED(p);
 
+	assert(!no_msr);
+
 	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
 		unsigned long long msr_cur, msr_last;
 		off_t offset;
@@ -4280,7 +4999,8 @@ release_msr:
 
 /*
  * set_my_sched_priority(pri)
- * return previous
+ * return previous priority on success
+ * return value < -20 on failure
  */
 int set_my_sched_priority(int priority)
 {
@@ -4290,16 +5010,16 @@ int set_my_sched_priority(int priority)
 	errno = 0;
 	original_priority = getpriority(PRIO_PROCESS, 0);
 	if (errno && (original_priority == -1))
-		err(errno, "getpriority");
+		return -21;
 
 	retval = setpriority(PRIO_PROCESS, 0, priority);
 	if (retval)
-		errx(retval, "capget(CAP_SYS_NICE) failed,try \"# setcap cap_sys_nice=ep %s\"", progname);
+		return -21;
 
 	errno = 0;
 	retval = getpriority(PRIO_PROCESS, 0);
 	if (retval != priority)
-		err(retval, "getpriority(%d) != setpriority(%d)", retval, priority);
+		return -21;
 
 	return original_priority;
 }
@@ -4314,6 +5034,9 @@ void turbostat_loop()
 
 	/*
 	 * elevate own priority for interval mode
+	 *
+	 * ignore on error - we probably don't have permission to set it, but
+	 * it's not a big deal
 	 */
 	set_my_sched_priority(-20);
 
@@ -4399,10 +5122,13 @@ void check_dev_msr()
 	struct stat sb;
 	char pathname[32];
 
+	if (no_msr)
+		return;
+
 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
 	if (stat(pathname, &sb))
 		if (system("/sbin/modprobe msr > /dev/null 2>&1"))
-			err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" ");
+			no_msr = 1;
 }
 
 /*
@@ -4414,47 +5140,51 @@ int check_for_cap_sys_rawio(void)
 {
 	cap_t caps;
 	cap_flag_value_t cap_flag_value;
+	int ret = 0;
 
 	caps = cap_get_proc();
 	if (caps == NULL)
-		err(-6, "cap_get_proc\n");
+		return 1;
 
-	if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value))
-		err(-6, "cap_get\n");
+	if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) {
+		ret = 1;
+		goto free_and_exit;
+	}
 
 	if (cap_flag_value != CAP_SET) {
-		warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname);
-		return 1;
+		ret = 1;
+		goto free_and_exit;
 	}
 
+free_and_exit:
 	if (cap_free(caps) == -1)
 		err(-6, "cap_free\n");
 
-	return 0;
+	return ret;
 }
 
-void check_permissions(void)
+void check_msr_permission(void)
 {
-	int do_exit = 0;
+	int failed = 0;
 	char pathname[32];
 
+	if (no_msr)
+		return;
+
 	/* check for CAP_SYS_RAWIO */
-	do_exit += check_for_cap_sys_rawio();
+	failed += check_for_cap_sys_rawio();
 
 	/* test file permissions */
 	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
 	if (euidaccess(pathname, R_OK)) {
-		do_exit++;
-		warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr");
+		failed++;
 	}
 
-	/* if all else fails, thell them to be root */
-	if (do_exit)
-		if (getuid() != 0)
-			warnx("... or simply run as root");
-
-	if (do_exit)
-		exit(-6);
+	if (failed) {
+		warnx("Failed to access %s. Some of the counters may not be available\n"
+		      "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr");
+		no_msr = 1;
+	}
 }
 
 void probe_bclk(void)
@@ -4462,7 +5192,7 @@ void probe_bclk(void)
 	unsigned long long msr;
 	unsigned int base_ratio;
 
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
 	if (platform->bclk_freq == BCLK_100MHZ)
@@ -4502,7 +5232,7 @@ static void dump_turbo_ratio_info(void)
 	if (!has_turbo)
 		return;
 
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
 	if (platform->trl_msrs & TRL_LIMIT2)
@@ -4567,20 +5297,15 @@ static void dump_sysfs_file(char *path)
 static void probe_intel_uncore_frequency(void)
 {
 	int i, j;
-	char path[128];
+	char path[256];
 
 	if (!genuine_intel)
 		return;
 
-	if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK))
-		return;
-
-	/* Cluster level sysfs not supported yet. */
-	if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK))
-		return;
+	if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK))
+		goto probe_cluster;
 
-	if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK))
-		BIC_PRESENT(BIC_UNCORE_MHZ);
+	BIC_PRESENT(BIC_UNCORE_MHZ);
 
 	if (quiet)
 		return;
@@ -4588,40 +5313,178 @@ static void probe_intel_uncore_frequency(void)
 	for (i = 0; i < topo.num_packages; ++i) {
 		for (j = 0; j < topo.num_die; ++j) {
 			int k, l;
+			char path_base[128];
+
+			sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i,
+				j);
 
-			sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/min_freq_khz",
-				i, j);
+			sprintf(path, "%s/min_freq_khz", path_base);
 			k = read_sysfs_int(path);
-			sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/max_freq_khz",
-				i, j);
+			sprintf(path, "%s/max_freq_khz", path_base);
 			l = read_sysfs_int(path);
-			fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000);
+			fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000);
 
-			sprintf(path,
-				"/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_min_freq_khz",
-				i, j);
+			sprintf(path, "%s/initial_min_freq_khz", path_base);
 			k = read_sysfs_int(path);
-			sprintf(path,
-				"/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_max_freq_khz",
-				i, j);
+			sprintf(path, "%s/initial_max_freq_khz", path_base);
 			l = read_sysfs_int(path);
-			fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000);
+			fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);
+
+			sprintf(path, "%s/current_freq_khz", path_base);
+			k = read_sysfs_int(path);
+			fprintf(outf, " %d MHz\n", k / 1000);
 		}
 	}
+	return;
+
+probe_cluster:
+	if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK))
+		return;
+
+	if (quiet)
+		return;
+
+	for (i = 0;; ++i) {
+		int k, l;
+		char path_base[128];
+		int package_id, domain_id, cluster_id;
+
+		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);
+
+		if (access(path_base, R_OK))
+			break;
+
+		sprintf(path, "%s/package_id", path_base);
+		package_id = read_sysfs_int(path);
+
+		sprintf(path, "%s/domain_id", path_base);
+		domain_id = read_sysfs_int(path);
+
+		sprintf(path, "%s/fabric_cluster_id", path_base);
+		cluster_id = read_sysfs_int(path);
+
+		sprintf(path, "%s/min_freq_khz", path_base);
+		k = read_sysfs_int(path);
+		sprintf(path, "%s/max_freq_khz", path_base);
+		l = read_sysfs_int(path);
+		fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id,
+			cluster_id, k / 1000, l / 1000);
+
+		sprintf(path, "%s/initial_min_freq_khz", path_base);
+		k = read_sysfs_int(path);
+		sprintf(path, "%s/initial_max_freq_khz", path_base);
+		l = read_sysfs_int(path);
+		fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);
+
+		sprintf(path, "%s/current_freq_khz", path_base);
+		k = read_sysfs_int(path);
+		fprintf(outf, " %d MHz\n", k / 1000);
+	}
 }
 
 static void probe_graphics(void)
 {
+	/* Xe graphics sysfs knobs */
+	if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) {
+		FILE *fp;
+		char buf[8];
+		bool gt0_is_gt;
+		int idx;
+
+		fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r");
+		if (!fp)
+			goto next;
+
+		if (!fread(buf, sizeof(char), 7, fp)) {
+			fclose(fp);
+			goto next;
+		}
+		fclose(fp);
+
+		if (!strncmp(buf, "gt0-rc", strlen("gt0-rc")))
+			gt0_is_gt = true;
+		else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc")))
+			gt0_is_gt = false;
+		else
+			goto next;
+
+		idx = gt0_is_gt ? GFX_rc6 : SAM_mc6;
+		gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms";
+
+		idx = gt0_is_gt ? GFX_MHz : SAM_MHz;
+		if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK))
+			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq";
+
+		idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz;
+		if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK))
+			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq";
+
+		idx = gt0_is_gt ? SAM_mc6 : GFX_rc6;
+		if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK))
+			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms";
+
+		idx = gt0_is_gt ? SAM_MHz : GFX_MHz;
+		if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK))
+			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq";
+
+		idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz;
+		if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK))
+			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq";
+
+		goto end;
+	}
+
+next:
+	/* New i915 graphics sysfs knobs */
+	if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) {
+		gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms";
+
+		if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK))
+			gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz";
+
+		if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK))
+			gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz";
+
+		if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK))
+			gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms";
+
+		if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK))
+			gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz";
+
+		if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK))
+			gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz";
+
+		goto end;
+	}
+
+	/* Fall back to traditional i915 graphics sysfs knobs */
 	if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
-		BIC_PRESENT(BIC_GFX_rc6);
+		gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms";
+
+	if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK))
+		gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz";
+	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
+		gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz";
 
-	if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) ||
-	    !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
-		BIC_PRESENT(BIC_GFXMHz);
 
-	if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) ||
-	    !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
+	if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK))
+		gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz";
+	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
+		gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz";
+
+end:
+	if (gfx_info[GFX_rc6].path)
+		BIC_PRESENT(BIC_GFX_rc6);
+	if (gfx_info[GFX_MHz].path)
+		BIC_PRESENT(BIC_GFXMHz);
+	if (gfx_info[GFX_ACTMHz].path)
 		BIC_PRESENT(BIC_GFXACTMHz);
+	if (gfx_info[SAM_mc6].path)
+		BIC_PRESENT(BIC_SAM_mc6);
+	if (gfx_info[SAM_MHz].path)
+		BIC_PRESENT(BIC_SAMMHz);
+	if (gfx_info[SAM_ACTMHz].path)
+		BIC_PRESENT(BIC_SAMACTMHz);
 }
 
 static void dump_sysfs_cstate_config(void)
@@ -4783,6 +5646,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	UNUSED(c);
 	UNUSED(p);
 
+	if (no_msr)
+		return 0;
+
 	if (!has_hwp)
 		return 0;
 
@@ -4869,6 +5735,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
 	UNUSED(c);
 	UNUSED(p);
 
+	if (no_msr)
+		return 0;
+
 	cpu = t->cpu_id;
 
 	/* per-package */
@@ -4983,31 +5852,18 @@ void rapl_probe_intel(void)
 	unsigned long long msr;
 	unsigned int time_unit;
 	double tdp;
+	const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt;
+	const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J;
 
-	if (rapl_joules) {
-		if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS)
-			BIC_PRESENT(BIC_Pkg_J);
-		if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS)
-			BIC_PRESENT(BIC_Cor_J);
-		if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS)
-			BIC_PRESENT(BIC_RAM_J);
-		if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS)
-			BIC_PRESENT(BIC_GFX_J);
-	} else {
-		if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS)
-			BIC_PRESENT(BIC_PkgWatt);
-		if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS)
-			BIC_PRESENT(BIC_CorWatt);
-		if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS)
-			BIC_PRESENT(BIC_RAMWatt);
-		if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS)
-			BIC_PRESENT(BIC_GFXWatt);
-	}
+	if (rapl_joules)
+		bic_enabled &= ~bic_watt_bits;
+	else
+		bic_enabled &= ~bic_joules_bits;
 
-	if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS)
-		BIC_PRESENT(BIC_PKG__);
-	if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)
-		BIC_PRESENT(BIC_RAM__);
+	if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
+		bic_enabled &= ~BIC_PKG__;
+	if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
+		bic_enabled &= ~BIC_RAM__;
 
 	/* units on package 0, verify later other packages match */
 	if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
@@ -5041,14 +5897,13 @@ void rapl_probe_amd(void)
 {
 	unsigned long long msr;
 	double tdp;
+	const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt;
+	const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J;
 
-	if (rapl_joules) {
-		BIC_PRESENT(BIC_Pkg_J);
-		BIC_PRESENT(BIC_Cor_J);
-	} else {
-		BIC_PRESENT(BIC_PkgWatt);
-		BIC_PRESENT(BIC_CorWatt);
-	}
+	if (rapl_joules)
+		bic_enabled &= ~bic_watt_bits;
+	else
+		bic_enabled &= ~bic_joules_bits;
 
 	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
 		return;
@@ -5202,7 +6057,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
  */
 void probe_rapl(void)
 {
-	if (!platform->rapl_msrs)
+	if (!platform->rapl_msrs || no_msr)
 		return;
 
 	if (genuine_intel)
@@ -5258,7 +6113,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
 	}
 
 	/* Temperature Target MSR is Nehalem and newer only */
-	if (!platform->has_nhm_msrs)
+	if (!platform->has_nhm_msrs || no_msr)
 		goto guess;
 
 	if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
@@ -5305,6 +6160,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
 	UNUSED(c);
 	UNUSED(p);
 
+	if (no_msr)
+		return 0;
+
 	if (!(do_dts || do_ptm))
 		return 0;
 
@@ -5402,6 +6260,9 @@ void decode_feature_control_msr(void)
 {
 	unsigned long long msr;
 
+	if (no_msr)
+		return;
+
 	if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
 		fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
 			base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
@@ -5411,6 +6272,9 @@ void decode_misc_enable_msr(void)
 {
 	unsigned long long msr;
 
+	if (no_msr)
+		return;
+
 	if (!genuine_intel)
 		return;
 
@@ -5428,6 +6292,9 @@ void decode_misc_feature_control(void)
 {
 	unsigned long long msr;
 
+	if (no_msr)
+		return;
+
 	if (!platform->has_msr_misc_feature_control)
 		return;
 
@@ -5449,6 +6316,9 @@ void decode_misc_pwr_mgmt_msr(void)
 {
 	unsigned long long msr;
 
+	if (no_msr)
+		return;
+
 	if (!platform->has_msr_misc_pwr_mgmt)
 		return;
 
@@ -5468,6 +6338,9 @@ void decode_c6_demotion_policy_msr(void)
 {
 	unsigned long long msr;
 
+	if (no_msr)
+		return;
+
 	if (!platform->has_msr_c6_demotion_policy_config)
 		return;
 
@@ -5489,7 +6362,8 @@ void print_dev_latency(void)
 
 	fd = open(path, O_RDONLY);
 	if (fd < 0) {
-		warnx("capget(CAP_SYS_ADMIN) failed, try \"# setcap cap_sys_admin=ep %s\"", progname);
+		if (debug)
+			warnx("Read %s failed", path);
 		return;
 	}
 
@@ -5504,23 +6378,260 @@ void print_dev_latency(void)
 	close(fd);
 }
 
+static int has_instr_count_access(void)
+{
+	int fd;
+	int has_access;
+
+	if (no_perf)
+		return 0;
+
+	fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
+	has_access = fd != -1;
+
+	if (fd != -1)
+		close(fd);
+
+	if (!has_access)
+		warnx("Failed to access %s. Some of the counters may not be available\n"
+		      "\tRun as root to enable them or use %s to disable the access explicitly",
+		      "instructions retired perf counter", "--no-perf");
+
+	return has_access;
+}
+
+bool is_aperf_access_required(void)
+{
+	return BIC_IS_ENABLED(BIC_Avg_MHz)
+	    || BIC_IS_ENABLED(BIC_Busy)
+	    || BIC_IS_ENABLED(BIC_Bzy_MHz)
+	    || BIC_IS_ENABLED(BIC_IPC);
+}
+
+int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
+			   double *scale_, enum rapl_unit *unit_)
+{
+	if (no_perf)
+		return -1;
+
+	const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name);
+
+	if (scale == 0.0)
+		return -1;
+
+	const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);
+
+	if (unit == RAPL_UNIT_INVALID)
+		return -1;
+
+	const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
+	const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+
+	const int fd_counter =
+	    open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
+	if (fd_counter == -1)
+		return -1;
+
+	/* If it's the first counter opened, make it a group descriptor */
+	if (rci->fd_perf == -1)
+		rci->fd_perf = fd_counter;
+
+	*scale_ = scale;
+	*unit_ = unit;
+	return fd_counter;
+}
+
+int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
+			  double *scale, enum rapl_unit *unit)
+{
+	int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
+
+	if (debug)
+		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
+
+	return ret;
+}
+
 /*
  * Linux-perf manages the HW instructions-retired counter
  * by enabling when requested, and hiding rollover
  */
 void linux_perf_init(void)
 {
-	if (!BIC_IS_ENABLED(BIC_IPC))
-		return;
-
 	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
 		return;
 
-	fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
-	if (fd_instr_count_percpu == NULL)
-		err(-1, "calloc fd_instr_count_percpu");
+	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) {
+		fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
+		if (fd_instr_count_percpu == NULL)
+			err(-1, "calloc fd_instr_count_percpu");
+	}
+
+	const bool aperf_required = is_aperf_access_required();
+
+	if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) {
+		fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu));
+		if (fd_amperf_percpu == NULL)
+			err(-1, "calloc fd_amperf_percpu");
+	}
+}
+
+void rapl_perf_init(void)
+{
+	const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages;
+	bool *domain_visited = calloc(num_domains, sizeof(bool));
+
+	rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
+	if (rapl_counter_info_perdomain == NULL)
+		err(-1, "calloc rapl_counter_info_percpu");
+
+	/*
+	 * Initialize rapl_counter_info_percpu
+	 */
+	for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
+		struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id];
+
+		rci->fd_perf = -1;
+		for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
+			rci->data[i] = 0;
+			rci->source[i] = RAPL_SOURCE_NONE;
+		}
+	}
 
-	BIC_PRESENT(BIC_IPC);
+	/*
+	 * Open/probe the counters
+	 * If can't get it via perf, fallback to MSR
+	 */
+	for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) {
+
+		const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i];
+		bool has_counter = 0;
+		double scale;
+		enum rapl_unit unit;
+		int next_domain;
+
+		memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
+
+		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
+
+			if (cpu_is_not_allowed(cpu))
+				continue;
+
+			/* Skip already seen and handled RAPL domains */
+			next_domain =
+			    platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id;
+
+			if (domain_visited[next_domain])
+				continue;
+
+			domain_visited[next_domain] = 1;
+
+			struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
+
+			/* Check if the counter is enabled and accessible */
+			if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) {
+
+				/* Use perf API for this counter */
+				if (!no_perf && cai->perf_name
+				    && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
+					rci->source[cai->rci_index] = RAPL_SOURCE_PERF;
+					rci->scale[cai->rci_index] = scale * cai->compat_scale;
+					rci->unit[cai->rci_index] = unit;
+					rci->flags[cai->rci_index] = cai->flags;
+
+					/* Use MSR for this counter */
+				} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
+					rci->source[cai->rci_index] = RAPL_SOURCE_MSR;
+					rci->msr[cai->rci_index] = cai->msr;
+					rci->msr_mask[cai->rci_index] = cai->msr_mask;
+					rci->msr_shift[cai->rci_index] = cai->msr_shift;
+					rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
+					rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
+					rci->flags[cai->rci_index] = cai->flags;
+				}
+			}
+
+			if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE)
+				has_counter = 1;
+		}
+
+		/* If any CPU has access to the counter, make it present */
+		if (has_counter)
+			BIC_PRESENT(cai->bic);
+	}
+
+	free(domain_visited);
+}
+
+static int has_amperf_access_via_msr(void)
+{
+	if (no_msr)
+		return 0;
+
+	if (probe_msr(base_cpu, MSR_IA32_APERF))
+		return 0;
+
+	if (probe_msr(base_cpu, MSR_IA32_MPERF))
+		return 0;
+
+	return 1;
+}
+
+static int has_amperf_access_via_perf(void)
+{
+	struct amperf_group_fd fds;
+
+	/*
+	 * Cache the last result, so we don't warn the user multiple times
+	 *
+	 * Negative means cached, no access
+	 * Zero means not cached
+	 * Positive means cached, has access
+	 */
+	static int has_access_cached;
+
+	if (no_perf)
+		return 0;
+
+	if (has_access_cached != 0)
+		return has_access_cached > 0;
+
+	fds = open_amperf_fd(base_cpu);
+	has_access_cached = (fds.aperf != -1) && (fds.mperf != -1);
+
+	if (fds.aperf == -1)
+		warnx("Failed to access %s. Some of the counters may not be available\n"
+		      "\tRun as root to enable them or use %s to disable the access explicitly",
+		      "APERF perf counter", "--no-perf");
+	else
+		close(fds.aperf);
+
+	if (fds.mperf == -1)
+		warnx("Failed to access %s. Some of the counters may not be available\n"
+		      "\tRun as root to enable them or use %s to disable the access explicitly",
+		      "MPERF perf counter", "--no-perf");
+	else
+		close(fds.mperf);
+
+	if (has_access_cached == 0)
+		has_access_cached = -1;
+
+	return has_access_cached > 0;
+}
+
+/* Check if we can access APERF and MPERF */
+static int has_amperf_access(void)
+{
+	if (!is_aperf_access_required())
+		return 0;
+
+	if (!no_msr && has_amperf_access_via_msr())
+		return 1;
+
+	if (!no_perf && has_amperf_access_via_perf())
+		return 1;
+
+	return 0;
 }
 
 void probe_cstates(void)
@@ -5563,7 +6674,7 @@ void probe_cstates(void)
 	if (platform->has_msr_module_c6_res_ms)
 		BIC_PRESENT(BIC_Mod_c6);
 
-	if (platform->has_ext_cst_msrs) {
+	if (platform->has_ext_cst_msrs && !no_msr) {
 		BIC_PRESENT(BIC_Totl_c0);
 		BIC_PRESENT(BIC_Any_c0);
 		BIC_PRESENT(BIC_GFX_c0);
@@ -5623,6 +6734,7 @@ void process_cpuid()
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
 	unsigned long long ucode_patch = 0;
+	bool ucode_patch_valid = false;
 
 	eax = ebx = ecx = edx = 0;
 
@@ -5650,8 +6762,12 @@ void process_cpuid()
 	ecx_flags = ecx;
 	edx_flags = edx;
 
-	if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
-		warnx("get_msr(UCODE)");
+	if (!no_msr) {
+		if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
+			warnx("get_msr(UCODE)");
+		else
+			ucode_patch_valid = true;
+	}
 
 	/*
 	 * check max extended function levels of CPUID.
@@ -5662,9 +6778,12 @@ void process_cpuid()
 	__cpuid(0x80000000, max_extended_level, ebx, ecx, edx);
 
 	if (!quiet) {
-		fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n",
-			family, model, stepping, family, model, stepping,
-			(unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
+		fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
+			family, model, stepping, family, model, stepping);
+		if (ucode_patch_valid)
+			fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
+		fputc('\n', outf);
+
 		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
 		fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
 			ecx_flags & (1 << 0) ? "SSE3" : "-",
@@ -5700,10 +6819,11 @@ void process_cpuid()
 
 	__cpuid(0x6, eax, ebx, ecx, edx);
 	has_aperf = ecx & (1 << 0);
-	if (has_aperf) {
+	if (has_aperf && has_amperf_access()) {
 		BIC_PRESENT(BIC_Avg_MHz);
 		BIC_PRESENT(BIC_Busy);
 		BIC_PRESENT(BIC_Bzy_MHz);
+		BIC_PRESENT(BIC_IPC);
 	}
 	do_dts = eax & (1 << 0);
 	if (do_dts)
@@ -5786,6 +6906,15 @@ void process_cpuid()
 		base_mhz = max_mhz = bus_mhz = edx = 0;
 
 		__cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx);
+
+		bclk = bus_mhz;
+
+		base_hz = base_mhz * 1000000;
+		has_base_hz = 1;
+
+		if (platform->enable_tsc_tweak)
+			tsc_tweak = base_hz / tsc_hz;
+
 		if (!quiet)
 			fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n",
 				base_mhz, max_mhz, bus_mhz);
@@ -5814,7 +6943,7 @@ void probe_pm_features(void)
 
 	probe_thermal();
 
-	if (platform->has_nhm_msrs)
+	if (platform->has_nhm_msrs && !no_msr)
 		BIC_PRESENT(BIC_SMI);
 
 	if (!quiet)
@@ -6142,6 +7271,7 @@ void topology_update(void)
 	topo.allowed_packages = 0;
 	for_all_cpus(update_topo, ODD_COUNTERS);
 }
+
 void setup_all_buffers(bool startup)
 {
 	topology_probe(startup);
@@ -6169,21 +7299,129 @@ void set_base_cpu(void)
 	err(-ENODEV, "No valid cpus found");
 }
 
+static void set_amperf_source(void)
+{
+	amperf_source = AMPERF_SOURCE_PERF;
+
+	const bool aperf_required = is_aperf_access_required();
+
+	if (no_perf || !aperf_required || !has_amperf_access_via_perf())
+		amperf_source = AMPERF_SOURCE_MSR;
+
+	if (quiet || !debug)
+		return;
+
+	fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf");
+}
+
+bool has_added_counters(void)
+{
+	/*
+	 * It only makes sense to call this after the command line is parsed,
+	 * otherwise sys structure is not populated.
+	 */
+
+	return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
+}
+
+bool is_msr_access_required(void)
+{
+	if (no_msr)
+		return false;
+
+	if (has_added_counters())
+		return true;
+
+	return BIC_IS_ENABLED(BIC_SMI)
+	    || BIC_IS_ENABLED(BIC_CPU_c1)
+	    || BIC_IS_ENABLED(BIC_CPU_c3)
+	    || BIC_IS_ENABLED(BIC_CPU_c6)
+	    || BIC_IS_ENABLED(BIC_CPU_c7)
+	    || BIC_IS_ENABLED(BIC_Mod_c6)
+	    || BIC_IS_ENABLED(BIC_CoreTmp)
+	    || BIC_IS_ENABLED(BIC_Totl_c0)
+	    || BIC_IS_ENABLED(BIC_Any_c0)
+	    || BIC_IS_ENABLED(BIC_GFX_c0)
+	    || BIC_IS_ENABLED(BIC_CPUGFX)
+	    || BIC_IS_ENABLED(BIC_Pkgpc3)
+	    || BIC_IS_ENABLED(BIC_Pkgpc6)
+	    || BIC_IS_ENABLED(BIC_Pkgpc2)
+	    || BIC_IS_ENABLED(BIC_Pkgpc7)
+	    || BIC_IS_ENABLED(BIC_Pkgpc8)
+	    || BIC_IS_ENABLED(BIC_Pkgpc9)
+	    || BIC_IS_ENABLED(BIC_Pkgpc10)
+	    /* TODO: Multiplex access with perf */
+	    || BIC_IS_ENABLED(BIC_CorWatt)
+	    || BIC_IS_ENABLED(BIC_Cor_J)
+	    || BIC_IS_ENABLED(BIC_PkgWatt)
+	    || BIC_IS_ENABLED(BIC_CorWatt)
+	    || BIC_IS_ENABLED(BIC_GFXWatt)
+	    || BIC_IS_ENABLED(BIC_RAMWatt)
+	    || BIC_IS_ENABLED(BIC_Pkg_J)
+	    || BIC_IS_ENABLED(BIC_Cor_J)
+	    || BIC_IS_ENABLED(BIC_GFX_J)
+	    || BIC_IS_ENABLED(BIC_RAM_J)
+	    || BIC_IS_ENABLED(BIC_PKG__)
+	    || BIC_IS_ENABLED(BIC_RAM__)
+	    || BIC_IS_ENABLED(BIC_PkgTmp)
+	    || (is_aperf_access_required() && !has_amperf_access_via_perf());
+}
+
+void check_msr_access(void)
+{
+	if (!is_msr_access_required())
+		no_msr = 1;
+
+	check_dev_msr();
+	check_msr_permission();
+
+	if (no_msr)
+		bic_disable_msr_access();
+}
+
+void check_perf_access(void)
+{
+	const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC);
+
+	if (no_perf || !intrcount_required || !has_instr_count_access())
+		bic_enabled &= ~BIC_IPC;
+
+	const bool aperf_required = is_aperf_access_required();
+
+	if (!aperf_required || !has_amperf_access()) {
+		bic_enabled &= ~BIC_Avg_MHz;
+		bic_enabled &= ~BIC_Busy;
+		bic_enabled &= ~BIC_Bzy_MHz;
+		bic_enabled &= ~BIC_IPC;
+	}
+}
+
 void turbostat_init()
 {
 	setup_all_buffers(true);
 	set_base_cpu();
-	check_dev_msr();
-	check_permissions();
+	check_msr_access();
+	check_perf_access();
 	process_cpuid();
 	probe_pm_features();
+	set_amperf_source();
 	linux_perf_init();
+	rapl_perf_init();
 
 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
 
 	if (DO_BIC(BIC_IPC))
 		(void)get_instr_count_fd(base_cpu);
+
+	/*
+	 * If TSC tweak is needed, but couldn't get it,
+	 * disable more BICs, since it can't be reported accurately.
+	 */
+	if (platform->enable_tsc_tweak && !has_base_hz) {
+		bic_enabled &= ~BIC_Busy;
+		bic_enabled &= ~BIC_Bzy_MHz;
+	}
 }
 
 int fork_it(char **argv)
@@ -6259,7 +7497,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-	fprintf(outf, "turbostat version 2023.11.07 - Len Brown <lenb@kernel.org>\n");
+	fprintf(outf, "turbostat version 2024.04.08 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
@@ -6291,6 +7529,9 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 {
 	struct msr_counter *msrp;
 
+	if (no_msr && msr_num)
+		errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
+
 	msrp = calloc(1, sizeof(struct msr_counter));
 	if (msrp == NULL) {
 		perror("calloc");
@@ -6595,6 +7836,8 @@ void cmdline(int argc, char **argv)
 		{ "list", no_argument, 0, 'l' },
 		{ "out", required_argument, 0, 'o' },
 		{ "quiet", no_argument, 0, 'q' },
+		{ "no-msr", no_argument, 0, 'M' },
+		{ "no-perf", no_argument, 0, 'P' },
 		{ "show", required_argument, 0, 's' },
 		{ "Summary", no_argument, 0, 'S' },
 		{ "TCC", required_argument, 0, 'T' },
@@ -6604,7 +7847,25 @@ void cmdline(int argc, char **argv)
 
 	progname = argv[0];
 
-	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) {
+	/*
+	 * Parse some options early, because they may make other options invalid,
+	 * like adding the MSR counter with --add and at the same time using --no-msr.
+	 */
+	while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) {
+		switch (opt) {
+		case 'M':
+			no_msr = 1;
+			break;
+		case 'P':
+			no_perf = 1;
+			break;
+		default:
+			break;
+		}
+	}
+	optind = 0;
+
+	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
 		switch (opt) {
 		case 'a':
 			parse_add_command(optarg);
@@ -6662,6 +7923,10 @@ void cmdline(int argc, char **argv)
 		case 'q':
 			quiet = 1;
 			break;
+		case 'M':
+		case 'P':
+			/* Parsed earlier */
+			break;
 		case 'n':
 			num_iterations = strtod(optarg, NULL);
 
@@ -6704,6 +7969,22 @@ void cmdline(int argc, char **argv)
 	}
 }
 
+void set_rlimit(void)
+{
+	struct rlimit limit;
+
+	if (getrlimit(RLIMIT_NOFILE, &limit) < 0)
+		err(1, "Failed to get rlimit");
+
+	if (limit.rlim_max < MAX_NOFILE)
+		limit.rlim_max = MAX_NOFILE;
+	if (limit.rlim_cur < MAX_NOFILE)
+		limit.rlim_cur = MAX_NOFILE;
+
+	if (setrlimit(RLIMIT_NOFILE, &limit) < 0)
+		err(1, "Failed to set rlimit");
+}
+
 int main(int argc, char **argv)
 {
 	int fd, ret;
@@ -6729,9 +8010,13 @@ skip_cgroup_setting:
 
 	probe_sysfs();
 
+	if (!getuid())
+		set_rlimit();
+
 	turbostat_init();
 
-	msr_sum_record();
+	if (!no_msr)
+		msr_sum_record();
 
 	/* dump counters and exit */
 	if (dump_only)
diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c
index 5fd9e594079c..ebda9c366b2b 100644
--- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c
+++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c
@@ -1241,6 +1241,7 @@ unsigned int get_pkg_num(int cpu)
 	retval = fscanf(fp, "%d\n", &pkg);
 	if (retval != 1)
 		errx(1, "%s: failed to parse", pathname);
+	fclose(fp);
 	return pkg;
 }
 
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index 6fba29f3222d..0aa4005017c7 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 ifneq ($(O),)
 ifeq ($(origin O), command line)
-	dummy := $(if $(shell cd $(PWD); test -d $(O) || echo $(O)),$(error O=$(O) does not exist),)
+        $(if $(shell cd $(PWD); test -d $(O) || echo $(O)),$(error O=$(O) does not exist),)
 	ABSOLUTE_O := $(shell cd $(PWD); cd $(O) ; pwd)
 	OUTPUT := $(ABSOLUTE_O)/$(if $(subdir),$(subdir)/)
 	COMMAND_O := O=$(ABSOLUTE_O)
diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph
new file mode 100755
index 000000000000..57d78f6df041
--- /dev/null
+++ b/tools/sound/dapm-graph
@@ -0,0 +1,303 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate a graph of the current DAPM state for an audio card
+#
+# Copyright 2024 Bootlin
+# Author: Luca Ceresoli <luca.ceresol@bootlin.com>
+
+set -eu
+
+STYLE_NODE_ON="shape=box,style=bold,color=green4"
+STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95"
+
+# Print usage and exit
+#
+# $1 = exit return value
+# $2 = error string (required if $1 != 0)
+usage()
+{
+    if [  "${1}" -ne 0 ]; then
+	echo "${2}" >&2
+    fi
+
+    echo "
+Generate a graph of the current DAPM state for an audio card.
+
+The DAPM state can be obtained via debugfs for a card on the local host or
+a remote target, or from a local copy of the debugfs tree for the card.
+
+Usage:
+    $(basename $0) [options] -c CARD                  - Local sound card
+    $(basename $0) [options] -c CARD -r REMOTE_TARGET - Card on remote system
+    $(basename $0) [options] -d STATE_DIR             - Local directory
+
+Options:
+    -c CARD             Sound card to get DAPM state of
+    -r REMOTE_TARGET    Get DAPM state from REMOTE_TARGET via SSH and SCP
+                        instead of using a local sound card
+    -d STATE_DIR        Get DAPM state from a local copy of a debugfs tree
+    -o OUT_FILE         Output file (default: dapm.dot)
+    -D                  Show verbose debugging info
+    -h                  Print this help and exit
+
+The output format is implied by the extension of OUT_FILE:
+
+ * Use the .dot extension to generate a text graph representation in
+   graphviz dot syntax.
+ * Any other extension is assumed to be a format supported by graphviz for
+   rendering, e.g. 'png', 'svg', and will produce both the .dot file and a
+   picture from it. This requires the 'dot' program from the graphviz
+   package.
+"
+
+    exit ${1}
+}
+
+# Connect to a remote target via SSH, collect all DAPM files from debufs
+# into a tarball and get the tarball via SCP into $3/dapm.tar
+#
+# $1 = target as used by ssh and scp, e.g. "root@192.168.1.1"
+# $2 = sound card name
+# $3 = temp dir path (present on the host, created on the target)
+# $4 = local directory to extract the tarball into
+#
+# Requires an ssh+scp server, find and tar+gz on the target
+#
+# Note: the tarball is needed because plain 'scp -r' from debugfs would
+# copy only empty files
+grab_remote_files()
+{
+    echo "Collecting DAPM state from ${1}"
+    dbg_echo "Collected DAPM state in ${3}"
+
+    ssh "${1}" "
+set -eu &&
+cd \"/sys/kernel/debug/asoc/${2}\" &&
+find * -type d -exec mkdir -p ${3}/dapm-tree/{} \; &&
+find * -type f -exec cp \"{}\" \"${3}/dapm-tree/{}\" \; &&
+cd ${3}/dapm-tree &&
+tar cf ${3}/dapm.tar ."
+    scp -q "${1}:${3}/dapm.tar" "${3}"
+
+    mkdir -p "${4}"
+    tar xf "${tmp_dir}/dapm.tar" -C "${4}"
+}
+
+# Parse a widget file and generate graph description in graphviz dot format
+#
+# Skips any file named "bias_level".
+#
+# $1 = temporary work dir
+# $2 = component name
+# $3 = widget filename
+process_dapm_widget()
+{
+    local tmp_dir="${1}"
+    local c_name="${2}"
+    local w_file="${3}"
+    local dot_file="${tmp_dir}/main.dot"
+    local links_file="${tmp_dir}/links.dot"
+
+    local w_name="$(basename "${w_file}")"
+    local w_tag="${c_name}_${w_name}"
+
+    if [ "${w_name}" = "bias_level" ]; then
+	return 0
+    fi
+
+    dbg_echo "   + Widget: ${w_name}"
+
+    cat "${w_file}" | (
+ 	read line
+
+ 	if echo "${line}" | grep -q ': On '
+	then local node_style="${STYLE_NODE_ON}"
+	else local node_style="${STYLE_NODE_OFF}"
+ 	fi
+
+	local w_type=""
+	while read line; do
+	    # Collect widget type if present
+	    if echo "${line}" | grep -q '^widget-type '; then
+		local w_type_raw="$(echo "$line" | cut -d ' ' -f 2)"
+		dbg_echo "     - Widget type: ${w_type_raw}"
+
+		# Note: escaping '\n' is tricky to get working with both
+		# bash and busybox ash, so use a '%' here and replace it
+		# later
+		local w_type="%n[${w_type_raw}]"
+	    fi
+
+	    # Collect any links. We could use "in" links or "out" links,
+	    # let's use "in" links
+	    if echo "${line}" | grep -q '^in '; then
+		local w_src=$(echo "$line" |
+				  awk -F\" '{print $6 "_" $4}' |
+				  sed  's/^(null)_/ROOT_/')
+		dbg_echo "     - Input route from: ${w_src}"
+		echo "  \"${w_src}\" -> \"$w_tag\"" >> "${links_file}"
+	    fi
+	done
+
+	echo "    \"${w_tag}\" [label=\"${w_name}${w_type}\",${node_style}]" |
+	    tr '%' '\\' >> "${dot_file}"
+   )
+}
+
+# Parse the DAPM tree for a sound card component and generate graph
+# description in graphviz dot format
+#
+# $1 = temporary work dir
+# $2 = component directory
+# $3 = forced component name (extracted for path if empty)
+process_dapm_component()
+{
+    local tmp_dir="${1}"
+    local c_dir="${2}"
+    local c_name="${3}"
+    local dot_file="${tmp_dir}/main.dot"
+    local links_file="${tmp_dir}/links.dot"
+
+    if [ -z "${c_name}" ]; then
+	# Extract directory name into component name:
+	#   "./cs42l51.0-004a/dapm" -> "cs42l51.0-004a"
+	c_name="$(basename $(dirname "${c_dir}"))"
+    fi
+
+    dbg_echo " * Component: ${c_name}"
+
+    echo ""                           >> "${dot_file}"
+    echo "  subgraph \"${c_name}\" {" >> "${dot_file}"
+    echo "    cluster = true"         >> "${dot_file}"
+    echo "    label = \"${c_name}\""  >> "${dot_file}"
+    echo "    color=dodgerblue"       >> "${dot_file}"
+
+    # Create empty file to ensure it will exist in all cases
+    >"${links_file}"
+
+    # Iterate over widgets in the component dir
+    for w_file in ${c_dir}/*; do
+	process_dapm_widget "${tmp_dir}" "${c_name}" "${w_file}"
+    done
+
+    echo "  }" >> "${dot_file}"
+
+    cat "${links_file}" >> "${dot_file}"
+}
+
+# Parse the DAPM tree for a sound card and generate graph description in
+# graphviz dot format
+#
+# $1 = temporary work dir
+# $2 = directory tree with DAPM state (either in debugfs or a mirror)
+process_dapm_tree()
+{
+    local tmp_dir="${1}"
+    local dapm_dir="${2}"
+    local dot_file="${tmp_dir}/main.dot"
+
+    echo "digraph G {" > "${dot_file}"
+    echo "  fontname=\"sans-serif\"" >> "${dot_file}"
+    echo "  node [fontname=\"sans-serif\"]" >> "${dot_file}"
+
+
+    # Process root directory (no component)
+    process_dapm_component "${tmp_dir}" "${dapm_dir}/dapm" "ROOT"
+
+    # Iterate over components
+    for c_dir in "${dapm_dir}"/*/dapm
+    do
+	process_dapm_component "${tmp_dir}" "${c_dir}" ""
+    done
+
+    echo "}" >> "${dot_file}"
+}
+
+main()
+{
+    # Parse command line
+    local out_file="dapm.dot"
+    local card_name=""
+    local remote_target=""
+    local dapm_tree=""
+    local dbg_on=""
+    while getopts "c:r:d:o:Dh" arg; do
+	case $arg in
+	    c)  card_name="${OPTARG}"      ;;
+	    r)  remote_target="${OPTARG}"  ;;
+	    d)  dapm_tree="${OPTARG}"      ;;
+	    o)  out_file="${OPTARG}"       ;;
+	    D)  dbg_on="1"                 ;;
+	    h)  usage 0                    ;;
+	    *)  usage 1                    ;;
+	esac
+    done
+    shift $(($OPTIND - 1))
+
+    if [ -n "${dapm_tree}" ]; then
+	if [ -n "${card_name}${remote_target}" ]; then
+	    usage 1 "Cannot use -c and -r with -d"
+	fi
+	echo "Using local tree: ${dapm_tree}"
+    elif [ -n "${remote_target}" ]; then
+	if [ -z "${card_name}" ]; then
+	    usage 1 "-r requires -c"
+	fi
+	echo "Using card ${card_name} from remote target ${remote_target}"
+    elif [ -n "${card_name}" ]; then
+	echo "Using local card: ${card_name}"
+    else
+	usage 1 "Please choose mode using -c, -r or -d"
+    fi
+
+    # Define logging function
+    if [ "${dbg_on}" ]; then
+	dbg_echo() {
+	    echo "$*" >&2
+	}
+    else
+	dbg_echo() {
+	    :
+	}
+    fi
+
+    # Filename must have a dot in order the infer the format from the
+    # extension
+    if ! echo "${out_file}" | grep -qE '\.'; then
+	echo "Missing extension in output filename ${out_file}" >&2
+	usage
+	exit 1
+    fi
+
+    local out_fmt="${out_file##*.}"
+    local dot_file="${out_file%.*}.dot"
+
+    dbg_echo "dot file:      $dot_file"
+    dbg_echo "Output file:   $out_file"
+    dbg_echo "Output format: $out_fmt"
+
+    tmp_dir="$(mktemp -d /tmp/$(basename $0).XXXXXX)"
+    trap "{ rm -fr ${tmp_dir}; }" INT TERM EXIT
+
+    if [ -z "${dapm_tree}" ]
+    then
+	dapm_tree="/sys/kernel/debug/asoc/${card_name}"
+    fi
+    if [ -n "${remote_target}" ]; then
+	dapm_tree="${tmp_dir}/dapm-tree"
+	grab_remote_files "${remote_target}" "${card_name}" "${tmp_dir}" "${dapm_tree}"
+    fi
+    # In all cases now ${dapm_tree} contains the DAPM state
+
+    process_dapm_tree "${tmp_dir}" "${dapm_tree}"
+    cp "${tmp_dir}/main.dot" "${dot_file}"
+
+    if [ "${out_file}" != "${dot_file}" ]; then
+	dot -T"${out_fmt}" "${dot_file}" -o "${out_file}"
+    fi
+
+    echo "Generated file ${out_file}"
+}
+
+main "${@}"
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 908e0d083936..3482248aa344 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -986,10 +986,12 @@ static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range,
 {
 	dpa_perf->qos_class = FAKE_QTG_ID;
 	dpa_perf->dpa_range = *range;
-	dpa_perf->coord.read_latency = 500;
-	dpa_perf->coord.write_latency = 500;
-	dpa_perf->coord.read_bandwidth = 1000;
-	dpa_perf->coord.write_bandwidth = 1000;
+	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+		dpa_perf->coord[i].read_latency = 500;
+		dpa_perf->coord[i].write_latency = 500;
+		dpa_perf->coord[i].read_bandwidth = 1000;
+		dpa_perf->coord[i].write_bandwidth = 1000;
+	}
 }
 
 static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port)
@@ -999,6 +1001,7 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port)
 	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+	struct access_coordinate ep_c[ACCESS_COORDINATE_MAX];
 	struct range pmem_range = {
 		.start = cxlds->pmem_res.start,
 		.end = cxlds->pmem_res.end,
@@ -1018,6 +1021,12 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port)
 		dpa_perf_setup(port, &pmem_range, &mds->pmem_perf);
 
 	cxl_memdev_update_perf(cxlmd);
+
+	/*
+	 * This function is here to only test the topology iterator. It serves
+	 * no other purpose.
+	 */
+	cxl_endpoint_get_perf_coordinates(port, ep_c);
 }
 
 static struct cxl_mock_ops cxl_mock_ops = {
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 829f5bdfd2e4..eb31cd9c977b 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -792,13 +792,13 @@ sub process_variables {
     my $retval = "";
 
     # We want to check for '\', and it is just easier
-    # to check the previous characet of '$' and not need
+    # to check the previous character of '$' and not need
     # to worry if '$' is the first character. By adding
     # a space to $value, we can just check [^\\]\$ and
     # it will still work.
     $value = " $value";
 
-    while ($value =~ /(.*?[^\\])\$\{(.*?)\}(.*)/) {
+    while ($value =~ /(.*?[^\\])\$\{([^\{]*?)\}(.*)/) {
 	my $begin = $1;
 	my $var = $2;
 	my $end = $3;
@@ -818,16 +818,20 @@ sub process_variables {
 	    # we simple convert to 0
 	    $retval = "${retval}0";
 	} else {
-	    # put back the origin piece.
-	    $retval = "$retval\$\{$var\}";
+	    # put back the origin piece, but with $#### to not reprocess it
+	    $retval = "$retval\$####\{$var\}";
 	    # This could be an option that is used later, save
 	    # it so we don't warn if this option is not one of
 	    # ktests options.
 	    $used_options{$var} = 1;
 	}
-	$value = $end;
+	$value = "$retval$end";
+	$retval = "";
     }
-    $retval = "$retval$value";
+    $retval = $value;
+
+    # Convert the saved variables with $####{var} back to ${var}
+    $retval =~ s/\$####/\$/g;
 
     # remove the space added in the beginning
     $retval =~ s/ //;
@@ -843,6 +847,7 @@ sub set_value {
     if ($lvalue =~ /^(TEST|BISECT|CONFIG_BISECT)_TYPE(\[.*\])?$/ &&
 	$prvalue !~ /^(config_|)bisect$/ &&
 	$prvalue !~ /^build$/ &&
+	$prvalue !~ /^make_warnings_file$/ &&
 	$buildonly) {
 
 	# Note if a test is something other than build, then we
diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config
index a6cf69a665e8..b3b00269a52a 100644
--- a/tools/testing/kunit/configs/all_tests.config
+++ b/tools/testing/kunit/configs/all_tests.config
@@ -28,6 +28,8 @@ CONFIG_MCTP_FLOWS=y
 CONFIG_INET=y
 CONFIG_MPTCP=y
 
+CONFIG_NETDEVICES=y
+CONFIG_WLAN=y
 CONFIG_CFG80211=y
 CONFIG_MAC80211=y
 CONFIG_WLAN_VENDOR_INTEL=y
@@ -38,11 +40,13 @@ CONFIG_DAMON_VADDR=y
 CONFIG_DAMON_PADDR=y
 CONFIG_DEBUG_FS=y
 CONFIG_DAMON_DBGFS=y
+CONFIG_DAMON_DBGFS_DEPRECATED=y
 
 CONFIG_REGMAP_BUILD=y
 
 CONFIG_SECURITY=y
 CONFIG_SECURITY_APPARMOR=y
+CONFIG_SECURITY_LANDLOCK=y
 
 CONFIG_SOUND=y
 CONFIG_SND=y
diff --git a/tools/testing/kunit/qemu_configs/riscv.py b/tools/testing/kunit/qemu_configs/riscv.py
index 12a1d525978a..c87758030ff7 100644
--- a/tools/testing/kunit/qemu_configs/riscv.py
+++ b/tools/testing/kunit/qemu_configs/riscv.py
@@ -13,7 +13,7 @@ if not os.path.isfile(OPENSBI_PATH):
 
 QEMU_ARCH = QemuArchParams(linux_arch='riscv',
 			   kconfig='''
-CONFIG_SOC_VIRT=y
+CONFIG_ARCH_VIRT=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index c5c9d05f29da..c0a2bb785b92 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -18,6 +18,8 @@
 #define pr_info printk
 #define pr_debug printk
 #define pr_cont printk
+#define schedule()
+#define PAGE_SHIFT	12
 
 #define __acquires(x)
 #define __releases(x)
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d117e8a96ded..f0431e6cb67e 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -13,11 +13,14 @@ TARGETS += core
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += damon
+TARGETS += devices
 TARGETS += dmabuf-heaps
 TARGETS += drivers/dma-buf
 TARGETS += drivers/s390x/uvdevice
+TARGETS += drivers/net
 TARGETS += drivers/net/bonding
 TARGETS += drivers/net/team
+TARGETS += drivers/net/virtio_net
 TARGETS += dt
 TARGETS += efivarfs
 TARGETS += exec
@@ -62,7 +65,7 @@ TARGETS += net/hsr
 TARGETS += net/mptcp
 TARGETS += net/openvswitch
 TARGETS += net/tcp_ao
-TARGETS += netfilter
+TARGETS += net/netfilter
 TARGETS += nsfs
 TARGETS += perf_events
 TARGETS += pidfd
@@ -115,6 +118,13 @@ TARGETS += zram
 TARGETS_HOTPLUG = cpu-hotplug
 TARGETS_HOTPLUG += memory-hotplug
 
+# Networking tests want the net/lib target, include it automatically
+ifneq ($(filter net drivers/net drivers/net/hw,$(TARGETS)),)
+ifeq ($(filter net/lib,$(TARGETS)),)
+	INSTALL_DEP_TARGETS := net/lib
+endif
+endif
+
 # User can optionally provide a TARGETS skiplist.  By default we skip
 # BPF since it has cutting edge build time dependencies which require
 # more effort to install.
@@ -160,11 +170,11 @@ ifneq ($(KBUILD_OUTPUT),)
   # $(realpath ...) resolves symlinks
   abs_objtree := $(realpath $(abs_objtree))
   BUILD := $(abs_objtree)/kselftest
-  KHDR_INCLUDES := -isystem ${abs_objtree}/usr/include
+  KHDR_INCLUDES := -D_GNU_SOURCE -isystem ${abs_objtree}/usr/include
 else
   BUILD := $(CURDIR)
   abs_srctree := $(shell cd $(top_srcdir) && pwd)
-  KHDR_INCLUDES := -isystem ${abs_srctree}/usr/include
+  KHDR_INCLUDES := -D_GNU_SOURCE -isystem ${abs_srctree}/usr/include
   DEFAULT_INSTALL_HDR_PATH := 1
 endif
 
@@ -244,7 +254,7 @@ ifdef INSTALL_PATH
 	install -m 744 run_kselftest.sh $(INSTALL_PATH)/
 	rm -f $(TEST_LIST)
 	@ret=1;	\
-	for TARGET in $(TARGETS); do \
+	for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
 		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \
 				INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \
diff --git a/tools/testing/selftests/alsa/conf.c b/tools/testing/selftests/alsa/conf.c
index 89e3656a042d..e2b3a5810f47 100644
--- a/tools/testing/selftests/alsa/conf.c
+++ b/tools/testing/selftests/alsa/conf.c
@@ -105,7 +105,7 @@ static struct card_cfg_data *conf_data_by_card(int card, bool msg)
 	return NULL;
 }
 
-static int dump_config_tree(snd_config_t *top)
+static void dump_config_tree(snd_config_t *top)
 {
 	snd_output_t *out;
 	int err;
diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index 1189e77c8152..d8909b2b535a 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -58,11 +58,46 @@ static void cssc_sigill(void)
 	asm volatile(".inst 0xdac01c00" : : : "x0");
 }
 
+static void f8cvt_sigill(void)
+{
+	/* FSCALE V0.4H, V0.4H, V0.4H */
+	asm volatile(".inst 0x2ec03c00");
+}
+
+static void f8dp2_sigill(void)
+{
+	/* FDOT V0.4H, V0.4H, V0.5H */
+	asm volatile(".inst 0xe40fc00");
+}
+
+static void f8dp4_sigill(void)
+{
+	/* FDOT V0.2S, V0.2S, V0.2S */
+	asm volatile(".inst 0xe00fc00");
+}
+
+static void f8fma_sigill(void)
+{
+	/* FMLALB V0.8H, V0.16B, V0.16B */
+	asm volatile(".inst 0xec0fc00");
+}
+
+static void faminmax_sigill(void)
+{
+	/* FAMIN V0.4H, V0.4H, V0.4H */
+	asm volatile(".inst 0x2ec01c00");
+}
+
 static void fp_sigill(void)
 {
 	asm volatile("fmov s0, #1");
 }
 
+static void fpmr_sigill(void)
+{
+	asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0");
+}
+
 static void ilrcpc_sigill(void)
 {
 	/* LDAPUR W0, [SP, #8] */
@@ -95,6 +130,12 @@ static void lse128_sigill(void)
 		     : "cc", "memory");
 }
 
+static void lut_sigill(void)
+{
+	/* LUTI2 V0.16B, { V0.16B }, V[0] */
+	asm volatile(".inst 0x4e801000");
+}
+
 static void mops_sigill(void)
 {
 	char dst[1], src[1];
@@ -216,6 +257,78 @@ static void smef16f16_sigill(void)
 	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
 }
 
+static void smef8f16_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* FDOT ZA.H[W0, 0], Z0.B-Z1.B, Z0.B-Z1.B */
+	asm volatile(".inst 0xc1a01020" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smef8f32_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* FDOT ZA.S[W0, 0], { Z0.B-Z1.B }, Z0.B[0] */
+	asm volatile(".inst 0xc1500038" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smelutv2_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* LUTI4 { Z0.B-Z3.B }, ZT0, { Z0-Z1 } */
+	asm volatile(".inst 0xc08b0000" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smesf8dp2_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* FDOT Z0.H, Z0.B, Z0.B[0] */
+	asm volatile(".inst 0x64204400" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smesf8dp4_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* FDOT Z0.S, Z0.B, Z0.B[0] */
+	asm volatile(".inst 0xc1a41C00" : : : );
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
+static void smesf8fma_sigill(void)
+{
+	/* SMSTART */
+	asm volatile("msr S0_3_C4_C7_3, xzr" : : : );
+
+	/* FMLALB V0.8H, V0.16B, V0.16B */
+	asm volatile(".inst 0xec0fc00");
+
+	/* SMSTOP */
+	asm volatile("msr S0_3_C4_C6_3, xzr" : : : );
+}
+
 static void sve_sigill(void)
 {
 	/* RDVL x0, #0 */
@@ -354,6 +467,53 @@ static const struct hwcap_data {
 		.sigill_fn = cssc_sigill,
 	},
 	{
+		.name = "F8CVT",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_F8CVT,
+		.cpuinfo = "f8cvt",
+		.sigill_fn = f8cvt_sigill,
+	},
+	{
+		.name = "F8DP4",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_F8DP4,
+		.cpuinfo = "f8dp4",
+		.sigill_fn = f8dp4_sigill,
+	},
+	{
+		.name = "F8DP2",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_F8DP2,
+		.cpuinfo = "f8dp4",
+		.sigill_fn = f8dp2_sigill,
+	},
+	{
+		.name = "F8E5M2",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_F8E5M2,
+		.cpuinfo = "f8e5m2",
+	},
+	{
+		.name = "F8E4M3",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_F8E4M3,
+		.cpuinfo = "f8e4m3",
+	},
+	{
+		.name = "F8FMA",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_F8FMA,
+		.cpuinfo = "f8fma",
+		.sigill_fn = f8fma_sigill,
+	},
+	{
+		.name = "FAMINMAX",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_FAMINMAX,
+		.cpuinfo = "faminmax",
+		.sigill_fn = faminmax_sigill,
+	},
+	{
 		.name = "FP",
 		.at_hwcap = AT_HWCAP,
 		.hwcap_bit = HWCAP_FP,
@@ -361,6 +521,14 @@ static const struct hwcap_data {
 		.sigill_fn = fp_sigill,
 	},
 	{
+		.name = "FPMR",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_FPMR,
+		.cpuinfo = "fpmr",
+		.sigill_fn = fpmr_sigill,
+		.sigill_reliable = true,
+	},
+	{
 		.name = "JSCVT",
 		.at_hwcap = AT_HWCAP,
 		.hwcap_bit = HWCAP_JSCVT,
@@ -412,6 +580,13 @@ static const struct hwcap_data {
 		.sigill_fn = lse128_sigill,
 	},
 	{
+		.name = "LUT",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_LUT,
+		.cpuinfo = "lut",
+		.sigill_fn = lut_sigill,
+	},
+	{
 		.name = "MOPS",
 		.at_hwcap = AT_HWCAP2,
 		.hwcap_bit = HWCAP2_MOPS,
@@ -512,6 +687,48 @@ static const struct hwcap_data {
 		.sigill_fn = smef16f16_sigill,
 	},
 	{
+		.name = "SME F8F16",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_F8F16,
+		.cpuinfo = "smef8f16",
+		.sigill_fn = smef8f16_sigill,
+	},
+	{
+		.name = "SME F8F32",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_F8F32,
+		.cpuinfo = "smef8f32",
+		.sigill_fn = smef8f32_sigill,
+	},
+	{
+		.name = "SME LUTV2",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_LUTV2,
+		.cpuinfo = "smelutv2",
+		.sigill_fn = smelutv2_sigill,
+	},
+	{
+		.name = "SME SF8FMA",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_SF8FMA,
+		.cpuinfo = "smesf8fma",
+		.sigill_fn = smesf8fma_sigill,
+	},
+	{
+		.name = "SME SF8DP2",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_SF8DP2,
+		.cpuinfo = "smesf8dp2",
+		.sigill_fn = smesf8dp2_sigill,
+	},
+	{
+		.name = "SME SF8DP4",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SME_SF8DP4,
+		.cpuinfo = "smesf8dp4",
+		.sigill_fn = smesf8dp4_sigill,
+	},
+	{
 		.name = "SVE",
 		.at_hwcap = AT_HWCAP,
 		.hwcap_bit = HWCAP_SVE,
diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c
index 02ee3a91b780..285c47dd42f6 100644
--- a/tools/testing/selftests/arm64/abi/tpidr2.c
+++ b/tools/testing/selftests/arm64/abi/tpidr2.c
@@ -262,7 +262,7 @@ static int write_clone_read(void)
 
 int main(int argc, char **argv)
 {
-	int ret, i;
+	int ret;
 
 	putstr("TAP version 13\n");
 	putstr("1..");
diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
index ebc86757bdd8..00e52c966281 100644
--- a/tools/testing/selftests/arm64/fp/.gitignore
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -1,4 +1,5 @@
 fp-pidbench
+fp-ptrace
 fp-stress
 fpsimd-test
 rdvl-sme
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index b413b0af07f9..55d4f00d9e8e 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -5,7 +5,9 @@ top_srcdir = $(realpath ../../../../../)
 
 CFLAGS += $(KHDR_INCLUDES)
 
-TEST_GEN_PROGS := fp-stress \
+TEST_GEN_PROGS := \
+	fp-ptrace \
+	fp-stress \
 	sve-ptrace sve-probe-vls \
 	vec-syscfg \
 	za-fork za-ptrace
@@ -24,6 +26,7 @@ EXTRA_CLEAN += $(OUTPUT)/asm-utils.o $(OUTPUT)/rdvl.o $(OUTPUT)/za-fork-asm.o
 # Build with nolibc to avoid effects due to libc's clone() support
 $(OUTPUT)/fp-pidbench: fp-pidbench.S $(OUTPUT)/asm-utils.o
 	$(CC) -nostdlib $^ -o $@
+$(OUTPUT)/fp-ptrace: fp-ptrace.c fp-ptrace-asm.S
 $(OUTPUT)/fpsimd-test: fpsimd-test.S $(OUTPUT)/asm-utils.o
 	$(CC) -nostdlib $^ -o $@
 $(OUTPUT)/rdvl-sve: rdvl-sve.c $(OUTPUT)/rdvl.o
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
new file mode 100644
index 000000000000..7ad59d92d02b
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021-3 ARM Limited.
+//
+// Assembly portion of the FP ptrace test
+
+//
+// Load values from memory into registers, break on a breakpoint, then
+// break on a further breakpoint
+//
+
+#include "fp-ptrace.h"
+#include "sme-inst.h"
+
+.arch_extension sve
+
+// Load and save register values with pauses for ptrace
+//
+// x0 - SVE in use
+// x1 - SME in use
+// x2 - SME2 in use
+// x3 - FA64 supported
+
+.globl load_and_save
+load_and_save:
+	stp	x11, x12, [sp, #-0x10]!
+
+	// This should be redundant in the SVE case
+	ldr	x7, =v_in
+	ldp	q0, q1, [x7]
+	ldp	q2, q3, [x7, #16 * 2]
+	ldp	q4, q5, [x7, #16 * 4]
+	ldp	q6, q7, [x7, #16 * 6]
+	ldp	q8, q9, [x7, #16 * 8]
+	ldp	q10, q11, [x7, #16 * 10]
+	ldp	q12, q13, [x7, #16 * 12]
+	ldp	q14, q15, [x7, #16 * 14]
+	ldp	q16, q17, [x7, #16 * 16]
+	ldp	q18, q19, [x7, #16 * 18]
+	ldp	q20, q21, [x7, #16 * 20]
+	ldp	q22, q23, [x7, #16 * 22]
+	ldp	q24, q25, [x7, #16 * 24]
+	ldp	q26, q27, [x7, #16 * 26]
+	ldp	q28, q29, [x7, #16 * 28]
+	ldp	q30, q31, [x7, #16 * 30]
+
+	// SME?
+	cbz	x1, check_sve_in
+
+	adrp	x7, svcr_in
+	ldr	x7, [x7, :lo12:svcr_in]
+	// SVCR is 0 by default, avoid triggering SME if not in use
+	cbz	x7, check_sve_in
+	msr	S3_3_C4_C2_2, x7
+
+	// ZA?
+	tbz	x7, #SVCR_ZA_SHIFT, check_sm_in
+	rdsvl	11, 1
+	mov	w12, #0
+	ldr	x6, =za_in
+1:	_ldr_za 12, 6
+	add	x6, x6, x11
+	add	x12, x12, #1
+	cmp	x11, x12
+	bne	1b
+
+	// ZT?
+	cbz	x2, check_sm_in
+	adrp	x6, zt_in
+	add	x6, x6, :lo12:zt_in
+	_ldr_zt 6
+
+	// In streaming mode?
+check_sm_in:
+	tbz	x7, #SVCR_SM_SHIFT, check_sve_in
+	mov	x4, x3		// Load FFR if we have FA64
+	b	load_sve
+
+	// SVE?
+check_sve_in:
+	cbz	x0, wait_for_writes
+	mov	x4, #1
+
+load_sve:
+	ldr	x7, =z_in
+	ldr	z0, [x7, #0, MUL VL]
+	ldr	z1, [x7, #1, MUL VL]
+	ldr	z2, [x7, #2, MUL VL]
+	ldr	z3, [x7, #3, MUL VL]
+	ldr	z4, [x7, #4, MUL VL]
+	ldr	z5, [x7, #5, MUL VL]
+	ldr	z6, [x7, #6, MUL VL]
+	ldr	z7, [x7, #7, MUL VL]
+	ldr	z8, [x7, #8, MUL VL]
+	ldr	z9, [x7, #9, MUL VL]
+	ldr	z10, [x7, #10, MUL VL]
+	ldr	z11, [x7, #11, MUL VL]
+	ldr	z12, [x7, #12, MUL VL]
+	ldr	z13, [x7, #13, MUL VL]
+	ldr	z14, [x7, #14, MUL VL]
+	ldr	z15, [x7, #15, MUL VL]
+	ldr	z16, [x7, #16, MUL VL]
+	ldr	z17, [x7, #17, MUL VL]
+	ldr	z18, [x7, #18, MUL VL]
+	ldr	z19, [x7, #19, MUL VL]
+	ldr	z20, [x7, #20, MUL VL]
+	ldr	z21, [x7, #21, MUL VL]
+	ldr	z22, [x7, #22, MUL VL]
+	ldr	z23, [x7, #23, MUL VL]
+	ldr	z24, [x7, #24, MUL VL]
+	ldr	z25, [x7, #25, MUL VL]
+	ldr	z26, [x7, #26, MUL VL]
+	ldr	z27, [x7, #27, MUL VL]
+	ldr	z28, [x7, #28, MUL VL]
+	ldr	z29, [x7, #29, MUL VL]
+	ldr	z30, [x7, #30, MUL VL]
+	ldr	z31, [x7, #31, MUL VL]
+
+	// FFR is not present in base SME
+	cbz	x4, 1f
+	ldr	x7, =ffr_in
+	ldr	p0, [x7]
+	ldr	x7, [x7, #0]
+	cbz	x7, 1f
+	wrffr	p0.b
+1:
+
+	ldr	x7, =p_in
+	ldr	p0, [x7, #0, MUL VL]
+	ldr	p1, [x7, #1, MUL VL]
+	ldr	p2, [x7, #2, MUL VL]
+	ldr	p3, [x7, #3, MUL VL]
+	ldr	p4, [x7, #4, MUL VL]
+	ldr	p5, [x7, #5, MUL VL]
+	ldr	p6, [x7, #6, MUL VL]
+	ldr	p7, [x7, #7, MUL VL]
+	ldr	p8, [x7, #8, MUL VL]
+	ldr	p9, [x7, #9, MUL VL]
+	ldr	p10, [x7, #10, MUL VL]
+	ldr	p11, [x7, #11, MUL VL]
+	ldr	p12, [x7, #12, MUL VL]
+	ldr	p13, [x7, #13, MUL VL]
+	ldr	p14, [x7, #14, MUL VL]
+	ldr	p15, [x7, #15, MUL VL]
+
+wait_for_writes:
+	// Wait for the parent
+	brk #0
+
+	// Save values
+	ldr	x7, =v_out
+	stp	q0, q1, [x7]
+	stp	q2, q3, [x7, #16 * 2]
+	stp	q4, q5, [x7, #16 * 4]
+	stp	q6, q7, [x7, #16 * 6]
+	stp	q8, q9, [x7, #16 * 8]
+	stp	q10, q11, [x7, #16 * 10]
+	stp	q12, q13, [x7, #16 * 12]
+	stp	q14, q15, [x7, #16 * 14]
+	stp	q16, q17, [x7, #16 * 16]
+	stp	q18, q19, [x7, #16 * 18]
+	stp	q20, q21, [x7, #16 * 20]
+	stp	q22, q23, [x7, #16 * 22]
+	stp	q24, q25, [x7, #16 * 24]
+	stp	q26, q27, [x7, #16 * 26]
+	stp	q28, q29, [x7, #16 * 28]
+	stp	q30, q31, [x7, #16 * 30]
+
+	// SME?
+	cbz	x1, check_sve_out
+
+	rdsvl	11, 1
+	adrp	x6, sme_vl_out
+	str	x11, [x6, :lo12:sme_vl_out]
+
+	mrs	x7, S3_3_C4_C2_2
+	adrp	x6, svcr_out
+	str	x7, [x6, :lo12:svcr_out]
+
+	// ZA?
+	tbz	x7, #SVCR_ZA_SHIFT, check_sm_out
+	mov	w12, #0
+	ldr	x6, =za_out
+1:	_str_za 12, 6
+	add	x6, x6, x11
+	add	x12, x12, #1
+	cmp	x11, x12
+	bne	1b
+
+	// ZT?
+	cbz	x2, check_sm_out
+	adrp	x6, zt_out
+	add	x6, x6, :lo12:zt_out
+	_str_zt 6
+
+	// In streaming mode?
+check_sm_out:
+	tbz	x7, #SVCR_SM_SHIFT, check_sve_out
+	mov	x4, x3				// FFR?
+	b	read_sve
+
+	// SVE?
+check_sve_out:
+	cbz	x0, wait_for_reads
+	mov	x4, #1
+
+	rdvl	x7, #1
+	adrp	x6, sve_vl_out
+	str	x7, [x6, :lo12:sve_vl_out]
+
+read_sve:
+	ldr	x7, =z_out
+	str	z0, [x7, #0, MUL VL]
+	str	z1, [x7, #1, MUL VL]
+	str	z2, [x7, #2, MUL VL]
+	str	z3, [x7, #3, MUL VL]
+	str	z4, [x7, #4, MUL VL]
+	str	z5, [x7, #5, MUL VL]
+	str	z6, [x7, #6, MUL VL]
+	str	z7, [x7, #7, MUL VL]
+	str	z8, [x7, #8, MUL VL]
+	str	z9, [x7, #9, MUL VL]
+	str	z10, [x7, #10, MUL VL]
+	str	z11, [x7, #11, MUL VL]
+	str	z12, [x7, #12, MUL VL]
+	str	z13, [x7, #13, MUL VL]
+	str	z14, [x7, #14, MUL VL]
+	str	z15, [x7, #15, MUL VL]
+	str	z16, [x7, #16, MUL VL]
+	str	z17, [x7, #17, MUL VL]
+	str	z18, [x7, #18, MUL VL]
+	str	z19, [x7, #19, MUL VL]
+	str	z20, [x7, #20, MUL VL]
+	str	z21, [x7, #21, MUL VL]
+	str	z22, [x7, #22, MUL VL]
+	str	z23, [x7, #23, MUL VL]
+	str	z24, [x7, #24, MUL VL]
+	str	z25, [x7, #25, MUL VL]
+	str	z26, [x7, #26, MUL VL]
+	str	z27, [x7, #27, MUL VL]
+	str	z28, [x7, #28, MUL VL]
+	str	z29, [x7, #29, MUL VL]
+	str	z30, [x7, #30, MUL VL]
+	str	z31, [x7, #31, MUL VL]
+
+	ldr	x7, =p_out
+	str	p0, [x7, #0, MUL VL]
+	str	p1, [x7, #1, MUL VL]
+	str	p2, [x7, #2, MUL VL]
+	str	p3, [x7, #3, MUL VL]
+	str	p4, [x7, #4, MUL VL]
+	str	p5, [x7, #5, MUL VL]
+	str	p6, [x7, #6, MUL VL]
+	str	p7, [x7, #7, MUL VL]
+	str	p8, [x7, #8, MUL VL]
+	str	p9, [x7, #9, MUL VL]
+	str	p10, [x7, #10, MUL VL]
+	str	p11, [x7, #11, MUL VL]
+	str	p12, [x7, #12, MUL VL]
+	str	p13, [x7, #13, MUL VL]
+	str	p14, [x7, #14, MUL VL]
+	str	p15, [x7, #15, MUL VL]
+
+	// Only save FFR if it exists
+	cbz	x4, wait_for_reads
+	ldr	x7, =ffr_out
+	rdffr	p0.b
+	str	p0, [x7]
+
+wait_for_reads:
+	// Wait for the parent
+	brk #0
+
+	// Ensure we don't leave ourselves in streaming mode
+	cbz	x1, out
+	msr	S3_3_C4_C2_2, xzr
+
+out:
+	ldp	x11, x12, [sp, #-0x10]
+	ret
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
new file mode 100644
index 000000000000..c7ceafe5f471
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -0,0 +1,1503 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ * Original author: Mark Brown <broonie@kernel.org>
+ */
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+
+#include <linux/kernel.h>
+
+#include <asm/sigcontext.h>
+#include <asm/sve_context.h>
+#include <asm/ptrace.h>
+
+#include "../../kselftest.h"
+
+#include "fp-ptrace.h"
+
+/* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
+#ifndef NT_ARM_SVE
+#define NT_ARM_SVE 0x405
+#endif
+
+#ifndef NT_ARM_SSVE
+#define NT_ARM_SSVE 0x40b
+#endif
+
+#ifndef NT_ARM_ZA
+#define NT_ARM_ZA 0x40c
+#endif
+
+#ifndef NT_ARM_ZT
+#define NT_ARM_ZT 0x40d
+#endif
+
+#define ARCH_VQ_MAX 256
+
+/* VL 128..2048 in powers of 2 */
+#define MAX_NUM_VLS 5
+
+#define NUM_FPR 32
+__uint128_t v_in[NUM_FPR];
+__uint128_t v_expected[NUM_FPR];
+__uint128_t v_out[NUM_FPR];
+
+char z_in[__SVE_ZREGS_SIZE(ARCH_VQ_MAX)];
+char z_expected[__SVE_ZREGS_SIZE(ARCH_VQ_MAX)];
+char z_out[__SVE_ZREGS_SIZE(ARCH_VQ_MAX)];
+
+char p_in[__SVE_PREGS_SIZE(ARCH_VQ_MAX)];
+char p_expected[__SVE_PREGS_SIZE(ARCH_VQ_MAX)];
+char p_out[__SVE_PREGS_SIZE(ARCH_VQ_MAX)];
+
+char ffr_in[__SVE_PREG_SIZE(ARCH_VQ_MAX)];
+char ffr_expected[__SVE_PREG_SIZE(ARCH_VQ_MAX)];
+char ffr_out[__SVE_PREG_SIZE(ARCH_VQ_MAX)];
+
+char za_in[ZA_SIG_REGS_SIZE(ARCH_VQ_MAX)];
+char za_expected[ZA_SIG_REGS_SIZE(ARCH_VQ_MAX)];
+char za_out[ZA_SIG_REGS_SIZE(ARCH_VQ_MAX)];
+
+char zt_in[ZT_SIG_REG_BYTES];
+char zt_expected[ZT_SIG_REG_BYTES];
+char zt_out[ZT_SIG_REG_BYTES];
+
+uint64_t sve_vl_out;
+uint64_t sme_vl_out;
+uint64_t svcr_in, svcr_expected, svcr_out;
+
+void load_and_save(int sve, int sme, int sme2, int fa64);
+
+static bool got_alarm;
+
+static void handle_alarm(int sig, siginfo_t *info, void *context)
+{
+	got_alarm = true;
+}
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+static __uint128_t arm64_cpu_to_le128(__uint128_t x)
+{
+	u64 a = swab64(x);
+	u64 b = swab64(x >> 64);
+
+	return ((__uint128_t)a << 64) | b;
+}
+#else
+static __uint128_t arm64_cpu_to_le128(__uint128_t x)
+{
+	return x;
+}
+#endif
+
+#define arm64_le128_to_cpu(x) arm64_cpu_to_le128(x)
+
+static bool sve_supported(void)
+{
+	return getauxval(AT_HWCAP) & HWCAP_SVE;
+}
+
+static bool sme_supported(void)
+{
+	return getauxval(AT_HWCAP2) & HWCAP2_SME;
+}
+
+static bool sme2_supported(void)
+{
+	return getauxval(AT_HWCAP2) & HWCAP2_SME2;
+}
+
+static bool fa64_supported(void)
+{
+	return getauxval(AT_HWCAP2) & HWCAP2_SME_FA64;
+}
+
+static bool compare_buffer(const char *name, void *out,
+			   void *expected, size_t size)
+{
+	void *tmp;
+
+	if (memcmp(out, expected, size) == 0)
+		return true;
+
+	ksft_print_msg("Mismatch in %s\n", name);
+
+	/* Did we just get zeros back? */
+	tmp = malloc(size);
+	if (!tmp) {
+		ksft_print_msg("OOM allocating %lu bytes for %s\n",
+			       size, name);
+		ksft_exit_fail();
+	}
+	memset(tmp, 0, size);
+
+	if (memcmp(out, tmp, size) == 0)
+		ksft_print_msg("%s is zero\n", name);
+
+	free(tmp);
+
+	return false;
+}
+
+struct test_config {
+	int sve_vl_in;
+	int sve_vl_expected;
+	int sme_vl_in;
+	int sme_vl_expected;
+	int svcr_in;
+	int svcr_expected;
+};
+
+struct test_definition {
+	const char *name;
+	bool sve_vl_change;
+	bool (*supported)(struct test_config *config);
+	void (*set_expected_values)(struct test_config *config);
+	void (*modify_values)(pid_t child, struct test_config *test_config);
+};
+
+static int vl_in(struct test_config *config)
+{
+	int vl;
+
+	if (config->svcr_in & SVCR_SM)
+		vl = config->sme_vl_in;
+	else
+		vl = config->sve_vl_in;
+
+	return vl;
+}
+
+static int vl_expected(struct test_config *config)
+{
+	int vl;
+
+	if (config->svcr_expected & SVCR_SM)
+		vl = config->sme_vl_expected;
+	else
+		vl = config->sve_vl_expected;
+
+	return vl;
+}
+
+static void run_child(struct test_config *config)
+{
+	int ret;
+
+	/* Let the parent attach to us */
+	ret = ptrace(PTRACE_TRACEME, 0, 0, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg("PTRACE_TRACEME failed: %s (%d)\n",
+				   strerror(errno), errno);
+
+	/* VL setup */
+	if (sve_supported()) {
+		ret = prctl(PR_SVE_SET_VL, config->sve_vl_in);
+		if (ret != config->sve_vl_in) {
+			ksft_print_msg("Failed to set SVE VL %d: %d\n",
+				       config->sve_vl_in, ret);
+		}
+	}
+
+	if (sme_supported()) {
+		ret = prctl(PR_SME_SET_VL, config->sme_vl_in);
+		if (ret != config->sme_vl_in) {
+			ksft_print_msg("Failed to set SME VL %d: %d\n",
+				       config->sme_vl_in, ret);
+		}
+	}
+
+	/* Load values and wait for the parent */
+	load_and_save(sve_supported(), sme_supported(),
+		      sme2_supported(), fa64_supported());
+
+	exit(0);
+}
+
+static void read_one_child_regs(pid_t child, char *name,
+				struct iovec *iov_parent,
+				struct iovec *iov_child)
+{
+	int len = iov_parent->iov_len;
+	int ret;
+
+	ret = process_vm_readv(child, iov_parent, 1, iov_child, 1, 0);
+	if (ret == -1)
+		ksft_print_msg("%s read failed: %s (%d)\n",
+			       name, strerror(errno), errno);
+	else if (ret != len)
+		ksft_print_msg("Short read of %s: %d\n", name, ret);
+}
+
+static void read_child_regs(pid_t child)
+{
+	struct iovec iov_parent, iov_child;
+
+	/*
+	 * Since the child fork()ed from us the buffer addresses are
+	 * the same in parent and child.
+	 */
+	iov_parent.iov_base = &v_out;
+	iov_parent.iov_len = sizeof(v_out);
+	iov_child.iov_base = &v_out;
+	iov_child.iov_len = sizeof(v_out);
+	read_one_child_regs(child, "FPSIMD", &iov_parent, &iov_child);
+
+	if (sve_supported() || sme_supported()) {
+		iov_parent.iov_base = &sve_vl_out;
+		iov_parent.iov_len = sizeof(sve_vl_out);
+		iov_child.iov_base = &sve_vl_out;
+		iov_child.iov_len = sizeof(sve_vl_out);
+		read_one_child_regs(child, "SVE VL", &iov_parent, &iov_child);
+
+		iov_parent.iov_base = &z_out;
+		iov_parent.iov_len = sizeof(z_out);
+		iov_child.iov_base = &z_out;
+		iov_child.iov_len = sizeof(z_out);
+		read_one_child_regs(child, "Z", &iov_parent, &iov_child);
+
+		iov_parent.iov_base = &p_out;
+		iov_parent.iov_len = sizeof(p_out);
+		iov_child.iov_base = &p_out;
+		iov_child.iov_len = sizeof(p_out);
+		read_one_child_regs(child, "P", &iov_parent, &iov_child);
+
+		iov_parent.iov_base = &ffr_out;
+		iov_parent.iov_len = sizeof(ffr_out);
+		iov_child.iov_base = &ffr_out;
+		iov_child.iov_len = sizeof(ffr_out);
+		read_one_child_regs(child, "FFR", &iov_parent, &iov_child);
+	}
+
+	if (sme_supported()) {
+		iov_parent.iov_base = &sme_vl_out;
+		iov_parent.iov_len = sizeof(sme_vl_out);
+		iov_child.iov_base = &sme_vl_out;
+		iov_child.iov_len = sizeof(sme_vl_out);
+		read_one_child_regs(child, "SME VL", &iov_parent, &iov_child);
+
+		iov_parent.iov_base = &svcr_out;
+		iov_parent.iov_len = sizeof(svcr_out);
+		iov_child.iov_base = &svcr_out;
+		iov_child.iov_len = sizeof(svcr_out);
+		read_one_child_regs(child, "SVCR", &iov_parent, &iov_child);
+
+		iov_parent.iov_base = &za_out;
+		iov_parent.iov_len = sizeof(za_out);
+		iov_child.iov_base = &za_out;
+		iov_child.iov_len = sizeof(za_out);
+		read_one_child_regs(child, "ZA", &iov_parent, &iov_child);
+	}
+
+	if (sme2_supported()) {
+		iov_parent.iov_base = &zt_out;
+		iov_parent.iov_len = sizeof(zt_out);
+		iov_child.iov_base = &zt_out;
+		iov_child.iov_len = sizeof(zt_out);
+		read_one_child_regs(child, "ZT", &iov_parent, &iov_child);
+	}
+}
+
+static bool continue_breakpoint(pid_t child,
+				enum __ptrace_request restart_type)
+{
+	struct user_pt_regs pt_regs;
+	struct iovec iov;
+	int ret;
+
+	/* Get PC */
+	iov.iov_base = &pt_regs;
+	iov.iov_len = sizeof(pt_regs);
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov);
+	if (ret < 0) {
+		ksft_print_msg("Failed to get PC: %s (%d)\n",
+			       strerror(errno), errno);
+		return false;
+	}
+
+	/* Skip over the BRK */
+	pt_regs.pc += 4;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, &iov);
+	if (ret < 0) {
+		ksft_print_msg("Failed to skip BRK: %s (%d)\n",
+			       strerror(errno), errno);
+		return false;
+	}
+
+	/* Restart */
+	ret = ptrace(restart_type, child, 0, 0);
+	if (ret < 0) {
+		ksft_print_msg("Failed to restart child: %s (%d)\n",
+			       strerror(errno), errno);
+		return false;
+	}
+
+	return true;
+}
+
+static bool check_ptrace_values_sve(pid_t child, struct test_config *config)
+{
+	struct user_sve_header *sve;
+	struct user_fpsimd_state *fpsimd;
+	struct iovec iov;
+	int ret, vq;
+	bool pass = true;
+
+	if (!sve_supported())
+		return true;
+
+	vq = __sve_vq_from_vl(config->sve_vl_in);
+
+	iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
+	iov.iov_base = malloc(iov.iov_len);
+	if (!iov.iov_base) {
+		ksft_print_msg("OOM allocating %lu byte SVE buffer\n",
+			       iov.iov_len);
+		return false;
+	}
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_SVE, &iov);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read initial SVE: %s (%d)\n",
+			       strerror(errno), errno);
+		pass = false;
+		goto out;
+	}
+
+	sve = iov.iov_base;
+
+	if (sve->vl != config->sve_vl_in) {
+		ksft_print_msg("Mismatch in initial SVE VL: %d != %d\n",
+			       sve->vl, config->sve_vl_in);
+		pass = false;
+	}
+
+	/* If we are in streaming mode we should just read FPSIMD */
+	if ((config->svcr_in & SVCR_SM) && (sve->flags & SVE_PT_REGS_SVE)) {
+		ksft_print_msg("NT_ARM_SVE reports SVE with PSTATE.SM\n");
+		pass = false;
+	}
+
+	if (sve->size != SVE_PT_SIZE(vq, sve->flags)) {
+		ksft_print_msg("Mismatch in SVE header size: %d != %lu\n",
+			       sve->size, SVE_PT_SIZE(vq, sve->flags));
+		pass = false;
+	}
+
+	/* The registers might be in completely different formats! */
+	if (sve->flags & SVE_PT_REGS_SVE) {
+		if (!compare_buffer("initial SVE Z",
+				    iov.iov_base + SVE_PT_SVE_ZREG_OFFSET(vq, 0),
+				    z_in, SVE_PT_SVE_ZREGS_SIZE(vq)))
+			pass = false;
+
+		if (!compare_buffer("initial SVE P",
+				    iov.iov_base + SVE_PT_SVE_PREG_OFFSET(vq, 0),
+				    p_in, SVE_PT_SVE_PREGS_SIZE(vq)))
+			pass = false;
+
+		if (!compare_buffer("initial SVE FFR",
+				    iov.iov_base + SVE_PT_SVE_FFR_OFFSET(vq),
+				    ffr_in, SVE_PT_SVE_PREG_SIZE(vq)))
+			pass = false;
+	} else {
+		fpsimd = iov.iov_base + SVE_PT_FPSIMD_OFFSET;
+		if (!compare_buffer("initial V via SVE", &fpsimd->vregs[0],
+				    v_in, sizeof(v_in)))
+			pass = false;
+	}
+
+out:
+	free(iov.iov_base);
+	return pass;
+}
+
+static bool check_ptrace_values_ssve(pid_t child, struct test_config *config)
+{
+	struct user_sve_header *sve;
+	struct user_fpsimd_state *fpsimd;
+	struct iovec iov;
+	int ret, vq;
+	bool pass = true;
+
+	if (!sme_supported())
+		return true;
+
+	vq = __sve_vq_from_vl(config->sme_vl_in);
+
+	iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
+	iov.iov_base = malloc(iov.iov_len);
+	if (!iov.iov_base) {
+		ksft_print_msg("OOM allocating %lu byte SSVE buffer\n",
+			       iov.iov_len);
+		return false;
+	}
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_SSVE, &iov);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read initial SSVE: %s (%d)\n",
+			       strerror(errno), errno);
+		pass = false;
+		goto out;
+	}
+
+	sve = iov.iov_base;
+
+	if (sve->vl != config->sme_vl_in) {
+		ksft_print_msg("Mismatch in initial SSVE VL: %d != %d\n",
+			       sve->vl, config->sme_vl_in);
+		pass = false;
+	}
+
+	if ((config->svcr_in & SVCR_SM) && !(sve->flags & SVE_PT_REGS_SVE)) {
+		ksft_print_msg("NT_ARM_SSVE reports FPSIMD with PSTATE.SM\n");
+		pass = false;
+	}
+
+	if (sve->size != SVE_PT_SIZE(vq, sve->flags)) {
+		ksft_print_msg("Mismatch in SSVE header size: %d != %lu\n",
+			       sve->size, SVE_PT_SIZE(vq, sve->flags));
+		pass = false;
+	}
+
+	/* The registers might be in completely different formats! */
+	if (sve->flags & SVE_PT_REGS_SVE) {
+		if (!compare_buffer("initial SSVE Z",
+				    iov.iov_base + SVE_PT_SVE_ZREG_OFFSET(vq, 0),
+				    z_in, SVE_PT_SVE_ZREGS_SIZE(vq)))
+			pass = false;
+
+		if (!compare_buffer("initial SSVE P",
+				    iov.iov_base + SVE_PT_SVE_PREG_OFFSET(vq, 0),
+				    p_in, SVE_PT_SVE_PREGS_SIZE(vq)))
+			pass = false;
+
+		if (!compare_buffer("initial SSVE FFR",
+				    iov.iov_base + SVE_PT_SVE_FFR_OFFSET(vq),
+				    ffr_in, SVE_PT_SVE_PREG_SIZE(vq)))
+			pass = false;
+	} else {
+		fpsimd = iov.iov_base + SVE_PT_FPSIMD_OFFSET;
+		if (!compare_buffer("initial V via SSVE",
+				    &fpsimd->vregs[0], v_in, sizeof(v_in)))
+			pass = false;
+	}
+
+out:
+	free(iov.iov_base);
+	return pass;
+}
+
+static bool check_ptrace_values_za(pid_t child, struct test_config *config)
+{
+	struct user_za_header *za;
+	struct iovec iov;
+	int ret, vq;
+	bool pass = true;
+
+	if (!sme_supported())
+		return true;
+
+	vq = __sve_vq_from_vl(config->sme_vl_in);
+
+	iov.iov_len = ZA_SIG_CONTEXT_SIZE(vq);
+	iov.iov_base = malloc(iov.iov_len);
+	if (!iov.iov_base) {
+		ksft_print_msg("OOM allocating %lu byte ZA buffer\n",
+			       iov.iov_len);
+		return false;
+	}
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_ZA, &iov);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read initial ZA: %s (%d)\n",
+			       strerror(errno), errno);
+		pass = false;
+		goto out;
+	}
+
+	za = iov.iov_base;
+
+	if (za->vl != config->sme_vl_in) {
+		ksft_print_msg("Mismatch in initial SME VL: %d != %d\n",
+			       za->vl, config->sme_vl_in);
+		pass = false;
+	}
+
+	/* If PSTATE.ZA is not set we should just read the header */
+	if (config->svcr_in & SVCR_ZA) {
+		if (za->size != ZA_PT_SIZE(vq)) {
+			ksft_print_msg("Unexpected ZA ptrace read size: %d != %lu\n",
+				       za->size, ZA_PT_SIZE(vq));
+			pass = false;
+		}
+
+		if (!compare_buffer("initial ZA",
+				    iov.iov_base + ZA_PT_ZA_OFFSET,
+				    za_in, ZA_PT_ZA_SIZE(vq)))
+			pass = false;
+	} else {
+		if (za->size != sizeof(*za)) {
+			ksft_print_msg("Unexpected ZA ptrace read size: %d != %lu\n",
+				       za->size, sizeof(*za));
+			pass = false;
+		}
+	}
+
+out:
+	free(iov.iov_base);
+	return pass;
+}
+
+static bool check_ptrace_values_zt(pid_t child, struct test_config *config)
+{
+	uint8_t buf[512];
+	struct iovec iov;
+	int ret;
+
+	if (!sme2_supported())
+		return true;
+
+	iov.iov_base = &buf;
+	iov.iov_len = ZT_SIG_REG_BYTES;
+	ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_ZT, &iov);
+	if (ret != 0) {
+		ksft_print_msg("Failed to read initial ZT: %s (%d)\n",
+			       strerror(errno), errno);
+		return false;
+	}
+
+	return compare_buffer("initial ZT", buf, zt_in, ZT_SIG_REG_BYTES);
+}
+
+
+static bool check_ptrace_values(pid_t child, struct test_config *config)
+{
+	bool pass = true;
+	struct user_fpsimd_state fpsimd;
+	struct iovec iov;
+	int ret;
+
+	iov.iov_base = &fpsimd;
+	iov.iov_len = sizeof(fpsimd);
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PRFPREG, &iov);
+	if (ret == 0) {
+		if (!compare_buffer("initial V", &fpsimd.vregs, v_in,
+				    sizeof(v_in))) {
+			pass = false;
+		}
+	} else {
+		ksft_print_msg("Failed to read initial V: %s (%d)\n",
+			       strerror(errno), errno);
+		pass = false;
+	}
+
+	if (!check_ptrace_values_sve(child, config))
+		pass = false;
+
+	if (!check_ptrace_values_ssve(child, config))
+		pass = false;
+
+	if (!check_ptrace_values_za(child, config))
+		pass = false;
+
+	if (!check_ptrace_values_zt(child, config))
+		pass = false;
+
+	return pass;
+}
+
+static bool run_parent(pid_t child, struct test_definition *test,
+		       struct test_config *config)
+{
+	int wait_status, ret;
+	pid_t pid;
+	bool pass;
+
+	/* Initial attach */
+	while (1) {
+		pid = waitpid(child, &wait_status, 0);
+		if (pid < 0) {
+			if (errno == EINTR)
+				continue;
+			ksft_exit_fail_msg("waitpid() failed: %s (%d)\n",
+					   strerror(errno), errno);
+		}
+
+		if (pid == child)
+			break;
+	}
+
+	if (WIFEXITED(wait_status)) {
+		ksft_print_msg("Child exited loading values with status %d\n",
+			       WEXITSTATUS(wait_status));
+		pass = false;
+		goto out;
+	}
+
+	if (WIFSIGNALED(wait_status)) {
+		ksft_print_msg("Child died from signal %d loading values\n",
+			       WTERMSIG(wait_status));
+		pass = false;
+		goto out;
+	}
+
+	/* Read initial values via ptrace */
+	pass = check_ptrace_values(child, config);
+
+	/* Do whatever writes we want to do */
+	if (test->modify_values)
+		test->modify_values(child, config);
+
+	if (!continue_breakpoint(child, PTRACE_CONT))
+		goto cleanup;
+
+	while (1) {
+		pid = waitpid(child, &wait_status, 0);
+		if (pid < 0) {
+			if (errno == EINTR)
+				continue;
+			ksft_exit_fail_msg("waitpid() failed: %s (%d)\n",
+					   strerror(errno), errno);
+		}
+
+		if (pid == child)
+			break;
+	}
+
+	if (WIFEXITED(wait_status)) {
+		ksft_print_msg("Child exited saving values with status %d\n",
+			       WEXITSTATUS(wait_status));
+		pass = false;
+		goto out;
+	}
+
+	if (WIFSIGNALED(wait_status)) {
+		ksft_print_msg("Child died from signal %d saving values\n",
+			       WTERMSIG(wait_status));
+		pass = false;
+		goto out;
+	}
+
+	/* See what happened as a result */
+	read_child_regs(child);
+
+	if (!continue_breakpoint(child, PTRACE_DETACH))
+		goto cleanup;
+
+	/* The child should exit cleanly */
+	got_alarm = false;
+	alarm(1);
+	while (1) {
+		if (got_alarm) {
+			ksft_print_msg("Wait for child timed out\n");
+			goto cleanup;
+		}
+
+		pid = waitpid(child, &wait_status, 0);
+		if (pid < 0) {
+			if (errno == EINTR)
+				continue;
+			ksft_exit_fail_msg("waitpid() failed: %s (%d)\n",
+					   strerror(errno), errno);
+		}
+
+		if (pid == child)
+			break;
+	}
+	alarm(0);
+
+	if (got_alarm) {
+		ksft_print_msg("Timed out waiting for child\n");
+		pass = false;
+		goto cleanup;
+	}
+
+	if (pid == child && WIFSIGNALED(wait_status)) {
+		ksft_print_msg("Child died from signal %d cleaning up\n",
+			       WTERMSIG(wait_status));
+		pass = false;
+		goto out;
+	}
+
+	if (pid == child && WIFEXITED(wait_status)) {
+		if (WEXITSTATUS(wait_status) != 0) {
+			ksft_print_msg("Child exited with error %d\n",
+				       WEXITSTATUS(wait_status));
+			pass = false;
+		}
+	} else {
+		ksft_print_msg("Child did not exit cleanly\n");
+		pass = false;
+		goto cleanup;
+	}
+
+	goto out;
+
+cleanup:
+	ret = kill(child, SIGKILL);
+	if (ret != 0) {
+		ksft_print_msg("kill() failed: %s (%d)\n",
+			       strerror(errno), errno);
+		return false;
+	}
+
+	while (1) {
+		pid = waitpid(child, &wait_status, 0);
+		if (pid < 0) {
+			if (errno == EINTR)
+				continue;
+			ksft_exit_fail_msg("waitpid() failed: %s (%d)\n",
+					   strerror(errno), errno);
+		}
+
+		if (pid == child)
+			break;
+	}
+
+out:
+	return pass;
+}
+
+static void fill_random(void *buf, size_t size)
+{
+	int i;
+	uint32_t *lbuf = buf;
+
+	/* random() returns a 32 bit number regardless of the size of long */
+	for (i = 0; i < size / sizeof(uint32_t); i++)
+		lbuf[i] = random();
+}
+
+static void fill_random_ffr(void *buf, size_t vq)
+{
+	uint8_t *lbuf = buf;
+	int bits, i;
+
+	/*
+	 * Only values with a continuous set of 0..n bits set are
+	 * valid for FFR, set all bits then clear a random number of
+	 * high bits.
+	 */
+	memset(buf, 0, __SVE_FFR_SIZE(vq));
+
+	bits = random() % (__SVE_FFR_SIZE(vq) * 8);
+	for (i = 0; i < bits / 8; i++)
+		lbuf[i] = 0xff;
+	if (bits / 8 != __SVE_FFR_SIZE(vq))
+		lbuf[i] = (1 << (bits % 8)) - 1;
+}
+
+static void fpsimd_to_sve(__uint128_t *v, char *z, int vl)
+{
+	int vq = __sve_vq_from_vl(vl);
+	int i;
+	__uint128_t *p;
+
+	if (!vl)
+		return;
+
+	for (i = 0; i < __SVE_NUM_ZREGS; i++) {
+		p = (__uint128_t *)&z[__SVE_ZREG_OFFSET(vq, i)];
+		*p = arm64_cpu_to_le128(v[i]);
+	}
+}
+
+static void set_initial_values(struct test_config *config)
+{
+	int vq = __sve_vq_from_vl(vl_in(config));
+	int sme_vq = __sve_vq_from_vl(config->sme_vl_in);
+
+	svcr_in = config->svcr_in;
+	svcr_expected = config->svcr_expected;
+	svcr_out = 0;
+
+	fill_random(&v_in, sizeof(v_in));
+	memcpy(v_expected, v_in, sizeof(v_in));
+	memset(v_out, 0, sizeof(v_out));
+
+	/* Changes will be handled in the test case */
+	if (sve_supported() || (config->svcr_in & SVCR_SM)) {
+		/* The low 128 bits of Z are shared with the V registers */
+		fill_random(&z_in, __SVE_ZREGS_SIZE(vq));
+		fpsimd_to_sve(v_in, z_in, vl_in(config));
+		memcpy(z_expected, z_in, __SVE_ZREGS_SIZE(vq));
+		memset(z_out, 0, sizeof(z_out));
+
+		fill_random(&p_in, __SVE_PREGS_SIZE(vq));
+		memcpy(p_expected, p_in, __SVE_PREGS_SIZE(vq));
+		memset(p_out, 0, sizeof(p_out));
+
+		if ((config->svcr_in & SVCR_SM) && !fa64_supported())
+			memset(ffr_in, 0, __SVE_PREG_SIZE(vq));
+		else
+			fill_random_ffr(&ffr_in, vq);
+		memcpy(ffr_expected, ffr_in, __SVE_PREG_SIZE(vq));
+		memset(ffr_out, 0, __SVE_PREG_SIZE(vq));
+	}
+
+	if (config->svcr_in & SVCR_ZA)
+		fill_random(za_in, ZA_SIG_REGS_SIZE(sme_vq));
+	else
+		memset(za_in, 0, ZA_SIG_REGS_SIZE(sme_vq));
+	if (config->svcr_expected & SVCR_ZA)
+		memcpy(za_expected, za_in, ZA_SIG_REGS_SIZE(sme_vq));
+	else
+		memset(za_expected, 0, ZA_SIG_REGS_SIZE(sme_vq));
+	if (sme_supported())
+		memset(za_out, 0, sizeof(za_out));
+
+	if (sme2_supported()) {
+		if (config->svcr_in & SVCR_ZA)
+			fill_random(zt_in, ZT_SIG_REG_BYTES);
+		else
+			memset(zt_in, 0, ZT_SIG_REG_BYTES);
+		if (config->svcr_expected & SVCR_ZA)
+			memcpy(zt_expected, zt_in, ZT_SIG_REG_BYTES);
+		else
+			memset(zt_expected, 0, ZT_SIG_REG_BYTES);
+		memset(zt_out, 0, sizeof(zt_out));
+	}
+}
+
+static bool check_memory_values(struct test_config *config)
+{
+	bool pass = true;
+	int vq, sme_vq;
+
+	if (!compare_buffer("saved V", v_out, v_expected, sizeof(v_out)))
+		pass = false;
+
+	vq = __sve_vq_from_vl(vl_expected(config));
+	sme_vq = __sve_vq_from_vl(config->sme_vl_expected);
+
+	if (svcr_out != svcr_expected) {
+		ksft_print_msg("Mismatch in saved SVCR %lx != %lx\n",
+			       svcr_out, svcr_expected);
+		pass = false;
+	}
+
+	if (sve_vl_out != config->sve_vl_expected) {
+		ksft_print_msg("Mismatch in SVE VL: %ld != %d\n",
+			       sve_vl_out, config->sve_vl_expected);
+		pass = false;
+	}
+
+	if (sme_vl_out != config->sme_vl_expected) {
+		ksft_print_msg("Mismatch in SME VL: %ld != %d\n",
+			       sme_vl_out, config->sme_vl_expected);
+		pass = false;
+	}
+
+	if (!compare_buffer("saved Z", z_out, z_expected,
+			    __SVE_ZREGS_SIZE(vq)))
+		pass = false;
+
+	if (!compare_buffer("saved P", p_out, p_expected,
+			    __SVE_PREGS_SIZE(vq)))
+		pass = false;
+
+	if (!compare_buffer("saved FFR", ffr_out, ffr_expected,
+			    __SVE_PREG_SIZE(vq)))
+		pass = false;
+
+	if (!compare_buffer("saved ZA", za_out, za_expected,
+			    ZA_PT_ZA_SIZE(sme_vq)))
+		pass = false;
+
+	if (!compare_buffer("saved ZT", zt_out, zt_expected, ZT_SIG_REG_BYTES))
+		pass = false;
+
+	return pass;
+}
+
+static bool sve_sme_same(struct test_config *config)
+{
+	if (config->sve_vl_in != config->sve_vl_expected)
+		return false;
+
+	if (config->sme_vl_in != config->sme_vl_expected)
+		return false;
+
+	if (config->svcr_in != config->svcr_expected)
+		return false;
+
+	return true;
+}
+
+static bool sve_write_supported(struct test_config *config)
+{
+	if (!sve_supported() && !sme_supported())
+		return false;
+
+	if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA))
+		return false;
+
+	if (config->svcr_expected & SVCR_SM) {
+		if (config->sve_vl_in != config->sve_vl_expected) {
+			return false;
+		}
+
+		/* Changing the SME VL disables ZA */
+		if ((config->svcr_expected & SVCR_ZA) &&
+		    (config->sme_vl_in != config->sme_vl_expected)) {
+			return false;
+		}
+	} else {
+		if (config->sme_vl_in != config->sme_vl_expected) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static void fpsimd_write_expected(struct test_config *config)
+{
+	int vl;
+
+	fill_random(&v_expected, sizeof(v_expected));
+
+	/* The SVE registers are flushed by a FPSIMD write */
+	vl = vl_expected(config);
+
+	memset(z_expected, 0, __SVE_ZREGS_SIZE(__sve_vq_from_vl(vl)));
+	memset(p_expected, 0, __SVE_PREGS_SIZE(__sve_vq_from_vl(vl)));
+	memset(ffr_expected, 0, __SVE_PREG_SIZE(__sve_vq_from_vl(vl)));
+
+	fpsimd_to_sve(v_expected, z_expected, vl);
+}
+
+static void fpsimd_write(pid_t child, struct test_config *test_config)
+{
+	struct user_fpsimd_state fpsimd;
+	struct iovec iov;
+	int ret;
+
+	memset(&fpsimd, 0, sizeof(fpsimd));
+	memcpy(&fpsimd.vregs, v_expected, sizeof(v_expected));
+
+	iov.iov_base = &fpsimd;
+	iov.iov_len = sizeof(fpsimd);
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PRFPREG, &iov);
+	if (ret == -1)
+		ksft_print_msg("FPSIMD set failed: (%s) %d\n",
+			       strerror(errno), errno);
+}
+
+static void sve_write_expected(struct test_config *config)
+{
+	int vl = vl_expected(config);
+	int sme_vq = __sve_vq_from_vl(config->sme_vl_expected);
+
+	fill_random(z_expected, __SVE_ZREGS_SIZE(__sve_vq_from_vl(vl)));
+	fill_random(p_expected, __SVE_PREGS_SIZE(__sve_vq_from_vl(vl)));
+
+	if ((svcr_expected & SVCR_SM) && !fa64_supported())
+		memset(ffr_expected, 0, __SVE_PREG_SIZE(sme_vq));
+	else
+		fill_random_ffr(ffr_expected, __sve_vq_from_vl(vl));
+
+	/* Share the low bits of Z with V */
+	fill_random(&v_expected, sizeof(v_expected));
+	fpsimd_to_sve(v_expected, z_expected, vl);
+
+	if (config->sme_vl_in != config->sme_vl_expected) {
+		memset(za_expected, 0, ZA_PT_ZA_SIZE(sme_vq));
+		memset(zt_expected, 0, sizeof(zt_expected));
+	}
+}
+
+static void sve_write(pid_t child, struct test_config *config)
+{
+	struct user_sve_header *sve;
+	struct iovec iov;
+	int ret, vl, vq, regset;
+
+	vl = vl_expected(config);
+	vq = __sve_vq_from_vl(vl);
+
+	iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
+	iov.iov_base = malloc(iov.iov_len);
+	if (!iov.iov_base) {
+		ksft_print_msg("Failed allocating %lu byte SVE write buffer\n",
+			       iov.iov_len);
+		return;
+	}
+	memset(iov.iov_base, 0, iov.iov_len);
+
+	sve = iov.iov_base;
+	sve->size = iov.iov_len;
+	sve->flags = SVE_PT_REGS_SVE;
+	sve->vl = vl;
+
+	memcpy(iov.iov_base + SVE_PT_SVE_ZREG_OFFSET(vq, 0),
+	       z_expected, SVE_PT_SVE_ZREGS_SIZE(vq));
+	memcpy(iov.iov_base + SVE_PT_SVE_PREG_OFFSET(vq, 0),
+	       p_expected, SVE_PT_SVE_PREGS_SIZE(vq));
+	memcpy(iov.iov_base + SVE_PT_SVE_FFR_OFFSET(vq),
+	       ffr_expected, SVE_PT_SVE_PREG_SIZE(vq));
+
+	if (svcr_expected & SVCR_SM)
+		regset = NT_ARM_SSVE;
+	else
+		regset = NT_ARM_SVE;
+
+	ret = ptrace(PTRACE_SETREGSET, child, regset, &iov);
+	if (ret != 0)
+		ksft_print_msg("Failed to write SVE: %s (%d)\n",
+			       strerror(errno), errno);
+
+	free(iov.iov_base);
+}
+
+static bool za_write_supported(struct test_config *config)
+{
+	if (config->svcr_expected & SVCR_SM) {
+		if (!(config->svcr_in & SVCR_SM))
+			return false;
+
+		/* Changing the SME VL exits streaming mode */
+		if (config->sme_vl_in != config->sme_vl_expected) {
+			return false;
+		}
+	}
+
+	/* Can't disable SM outside a VL change */
+	if ((config->svcr_in & SVCR_SM) &&
+	    !(config->svcr_expected & SVCR_SM))
+		return false;
+
+	return true;
+}
+
+static void za_write_expected(struct test_config *config)
+{
+	int sme_vq, sve_vq;
+
+	sme_vq = __sve_vq_from_vl(config->sme_vl_expected);
+
+	if (config->svcr_expected & SVCR_ZA) {
+		fill_random(za_expected, ZA_PT_ZA_SIZE(sme_vq));
+	} else {
+		memset(za_expected, 0, ZA_PT_ZA_SIZE(sme_vq));
+		memset(zt_expected, 0, sizeof(zt_expected));
+	}
+
+	/* Changing the SME VL flushes ZT, SVE state and exits SM */
+	if (config->sme_vl_in != config->sme_vl_expected) {
+		svcr_expected &= ~SVCR_SM;
+
+		sve_vq = __sve_vq_from_vl(vl_expected(config));
+		memset(z_expected, 0, __SVE_ZREGS_SIZE(sve_vq));
+		memset(p_expected, 0, __SVE_PREGS_SIZE(sve_vq));
+		memset(ffr_expected, 0, __SVE_PREG_SIZE(sve_vq));
+		memset(zt_expected, 0, sizeof(zt_expected));
+
+		fpsimd_to_sve(v_expected, z_expected, vl_expected(config));
+	}
+}
+
+static void za_write(pid_t child, struct test_config *config)
+{
+	struct user_za_header *za;
+	struct iovec iov;
+	int ret, vq;
+
+	vq = __sve_vq_from_vl(config->sme_vl_expected);
+
+	if (config->svcr_expected & SVCR_ZA)
+		iov.iov_len = ZA_PT_SIZE(vq);
+	else
+		iov.iov_len = sizeof(*za);
+	iov.iov_base = malloc(iov.iov_len);
+	if (!iov.iov_base) {
+		ksft_print_msg("Failed allocating %lu byte ZA write buffer\n",
+			       iov.iov_len);
+		return;
+	}
+	memset(iov.iov_base, 0, iov.iov_len);
+
+	za = iov.iov_base;
+	za->size = iov.iov_len;
+	za->vl = config->sme_vl_expected;
+	if (config->svcr_expected & SVCR_ZA)
+		memcpy(iov.iov_base + ZA_PT_ZA_OFFSET, za_expected,
+		       ZA_PT_ZA_SIZE(vq));
+
+	ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_ZA, &iov);
+	if (ret != 0)
+		ksft_print_msg("Failed to write ZA: %s (%d)\n",
+			       strerror(errno), errno);
+
+	free(iov.iov_base);
+}
+
+static bool zt_write_supported(struct test_config *config)
+{
+	if (!sme2_supported())
+		return false;
+	if (config->sme_vl_in != config->sme_vl_expected)
+		return false;
+	if (!(config->svcr_expected & SVCR_ZA))
+		return false;
+	if ((config->svcr_in & SVCR_SM) != (config->svcr_expected & SVCR_SM))
+		return false;
+
+	return true;
+}
+
+static void zt_write_expected(struct test_config *config)
+{
+	int sme_vq;
+
+	sme_vq = __sve_vq_from_vl(config->sme_vl_expected);
+
+	if (config->svcr_expected & SVCR_ZA) {
+		fill_random(zt_expected, sizeof(zt_expected));
+	} else {
+		memset(za_expected, 0, ZA_PT_ZA_SIZE(sme_vq));
+		memset(zt_expected, 0, sizeof(zt_expected));
+	}
+}
+
+static void zt_write(pid_t child, struct test_config *config)
+{
+	struct iovec iov;
+	int ret;
+
+	iov.iov_len = ZT_SIG_REG_BYTES;
+	iov.iov_base = zt_expected;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_ZT, &iov);
+	if (ret != 0)
+		ksft_print_msg("Failed to write ZT: %s (%d)\n",
+			       strerror(errno), errno);
+}
+
+/* Actually run a test */
+static void run_test(struct test_definition *test, struct test_config *config)
+{
+	pid_t child;
+	char name[1024];
+	bool pass;
+
+	if (sve_supported() && sme_supported())
+		snprintf(name, sizeof(name), "%s, SVE %d->%d, SME %d/%x->%d/%x",
+			 test->name,
+			 config->sve_vl_in, config->sve_vl_expected,
+			 config->sme_vl_in, config->svcr_in,
+			 config->sme_vl_expected, config->svcr_expected);
+	else if (sve_supported())
+		snprintf(name, sizeof(name), "%s, SVE %d->%d", test->name,
+			 config->sve_vl_in, config->sve_vl_expected);
+	else if (sme_supported())
+		snprintf(name, sizeof(name), "%s, SME %d/%x->%d/%x",
+			 test->name,
+			 config->sme_vl_in, config->svcr_in,
+			 config->sme_vl_expected, config->svcr_expected);
+	else
+		snprintf(name, sizeof(name), "%s", test->name);
+
+	if (test->supported && !test->supported(config)) {
+		ksft_test_result_skip("%s\n", name);
+		return;
+	}
+
+	set_initial_values(config);
+
+	if (test->set_expected_values)
+		test->set_expected_values(config);
+
+	child = fork();
+	if (child < 0)
+		ksft_exit_fail_msg("fork() failed: %s (%d)\n",
+				   strerror(errno), errno);
+	/* run_child() never returns */
+	if (child == 0)
+		run_child(config);
+
+	pass = run_parent(child, test, config);
+	if (!check_memory_values(config))
+		pass = false;
+
+	ksft_test_result(pass, "%s\n", name);
+}
+
+static void run_tests(struct test_definition defs[], int count,
+		      struct test_config *config)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		run_test(&defs[i], config);
+}
+
+static struct test_definition base_test_defs[] = {
+	{
+		.name = "No writes",
+		.supported = sve_sme_same,
+	},
+	{
+		.name = "FPSIMD write",
+		.supported = sve_sme_same,
+		.set_expected_values = fpsimd_write_expected,
+		.modify_values = fpsimd_write,
+	},
+};
+
+static struct test_definition sve_test_defs[] = {
+	{
+		.name = "SVE write",
+		.supported = sve_write_supported,
+		.set_expected_values = sve_write_expected,
+		.modify_values = sve_write,
+	},
+};
+
+static struct test_definition za_test_defs[] = {
+	{
+		.name = "ZA write",
+		.supported = za_write_supported,
+		.set_expected_values = za_write_expected,
+		.modify_values = za_write,
+	},
+};
+
+static struct test_definition zt_test_defs[] = {
+	{
+		.name = "ZT write",
+		.supported = zt_write_supported,
+		.set_expected_values = zt_write_expected,
+		.modify_values = zt_write,
+	},
+};
+
+static int sve_vls[MAX_NUM_VLS], sme_vls[MAX_NUM_VLS];
+static int sve_vl_count, sme_vl_count;
+
+static void probe_vls(const char *name, int vls[], int *vl_count, int set_vl)
+{
+	unsigned int vq;
+	int vl;
+
+	*vl_count = 0;
+
+	for (vq = ARCH_VQ_MAX; vq > 0; vq /= 2) {
+		vl = prctl(set_vl, vq * 16);
+		if (vl == -1)
+			ksft_exit_fail_msg("SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		vl &= PR_SVE_VL_LEN_MASK;
+
+		if (*vl_count && (vl == vls[*vl_count - 1]))
+			break;
+
+		vq = sve_vq_from_vl(vl);
+
+		vls[*vl_count] = vl;
+		*vl_count += 1;
+	}
+
+	if (*vl_count > 2) {
+		/* Just use the minimum and maximum */
+		vls[1] = vls[*vl_count - 1];
+		ksft_print_msg("%d %s VLs, using %d and %d\n",
+			       *vl_count, name, vls[0], vls[1]);
+		*vl_count = 2;
+	} else {
+		ksft_print_msg("%d %s VLs\n", *vl_count, name);
+	}
+}
+
+static struct {
+	int svcr_in, svcr_expected;
+} svcr_combinations[] = {
+	{ .svcr_in = 0, .svcr_expected = 0, },
+	{ .svcr_in = 0, .svcr_expected = SVCR_SM, },
+	{ .svcr_in = 0, .svcr_expected = SVCR_ZA, },
+	/* Can't enable both SM and ZA with a single ptrace write */
+
+	{ .svcr_in = SVCR_SM, .svcr_expected = 0, },
+	{ .svcr_in = SVCR_SM, .svcr_expected = SVCR_SM, },
+	{ .svcr_in = SVCR_SM, .svcr_expected = SVCR_ZA, },
+	{ .svcr_in = SVCR_SM, .svcr_expected = SVCR_SM | SVCR_ZA, },
+
+	{ .svcr_in = SVCR_ZA, .svcr_expected = 0, },
+	{ .svcr_in = SVCR_ZA, .svcr_expected = SVCR_SM, },
+	{ .svcr_in = SVCR_ZA, .svcr_expected = SVCR_ZA, },
+	{ .svcr_in = SVCR_ZA, .svcr_expected = SVCR_SM | SVCR_ZA, },
+
+	{ .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = 0, },
+	{ .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = SVCR_SM, },
+	{ .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = SVCR_ZA, },
+	{ .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = SVCR_SM | SVCR_ZA, },
+};
+
+static void run_sve_tests(void)
+{
+	struct test_config test_config;
+	int i, j;
+
+	if (!sve_supported())
+		return;
+
+	test_config.sme_vl_in = sme_vls[0];
+	test_config.sme_vl_expected = sme_vls[0];
+	test_config.svcr_in = 0;
+	test_config.svcr_expected = 0;
+
+	for (i = 0; i < sve_vl_count; i++) {
+		test_config.sve_vl_in = sve_vls[i];
+
+		for (j = 0; j < sve_vl_count; j++) {
+			test_config.sve_vl_expected = sve_vls[j];
+
+			run_tests(base_test_defs,
+				  ARRAY_SIZE(base_test_defs),
+				  &test_config);
+			if (sve_supported())
+				run_tests(sve_test_defs,
+					  ARRAY_SIZE(sve_test_defs),
+					  &test_config);
+		}
+	}
+
+}
+
+static void run_sme_tests(void)
+{
+	struct test_config test_config;
+	int i, j, k;
+
+	if (!sme_supported())
+		return;
+
+	test_config.sve_vl_in = sve_vls[0];
+	test_config.sve_vl_expected = sve_vls[0];
+
+	/*
+	 * Every SME VL/SVCR combination
+	 */
+	for (i = 0; i < sme_vl_count; i++) {
+		test_config.sme_vl_in = sme_vls[i];
+
+		for (j = 0; j < sme_vl_count; j++) {
+			test_config.sme_vl_expected = sme_vls[j];
+
+			for (k = 0; k < ARRAY_SIZE(svcr_combinations); k++) {
+				test_config.svcr_in = svcr_combinations[k].svcr_in;
+				test_config.svcr_expected = svcr_combinations[k].svcr_expected;
+
+				run_tests(base_test_defs,
+					  ARRAY_SIZE(base_test_defs),
+					  &test_config);
+				run_tests(sve_test_defs,
+					  ARRAY_SIZE(sve_test_defs),
+					  &test_config);
+				run_tests(za_test_defs,
+					  ARRAY_SIZE(za_test_defs),
+					  &test_config);
+
+				if (sme2_supported())
+					run_tests(zt_test_defs,
+						  ARRAY_SIZE(zt_test_defs),
+						  &test_config);
+			}
+		}
+	}
+}
+
+int main(void)
+{
+	struct test_config test_config;
+	struct sigaction sa;
+	int tests, ret, tmp;
+
+	srandom(getpid());
+
+	ksft_print_header();
+
+	if (sve_supported()) {
+		probe_vls("SVE", sve_vls, &sve_vl_count, PR_SVE_SET_VL);
+
+		tests = ARRAY_SIZE(base_test_defs) +
+			ARRAY_SIZE(sve_test_defs);
+		tests *= sve_vl_count * sve_vl_count;
+	} else {
+		/* Only run the FPSIMD tests */
+		sve_vl_count = 1;
+		tests = ARRAY_SIZE(base_test_defs);
+	}
+
+	if (sme_supported()) {
+		probe_vls("SME", sme_vls, &sme_vl_count, PR_SME_SET_VL);
+
+		tmp = ARRAY_SIZE(base_test_defs) + ARRAY_SIZE(sve_test_defs)
+			+ ARRAY_SIZE(za_test_defs);
+
+		if (sme2_supported())
+			tmp += ARRAY_SIZE(zt_test_defs);
+
+		tmp *= sme_vl_count * sme_vl_count;
+		tmp *= ARRAY_SIZE(svcr_combinations);
+		tests += tmp;
+	} else {
+		sme_vl_count = 1;
+	}
+
+	if (sme2_supported())
+		ksft_print_msg("SME2 supported\n");
+
+	if (fa64_supported())
+		ksft_print_msg("FA64 supported\n");
+
+	ksft_set_plan(tests);
+
+	/* Get signal handers ready before we start any children */
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handle_alarm;
+	sa.sa_flags = SA_RESTART | SA_SIGINFO;
+	sigemptyset(&sa.sa_mask);
+	ret = sigaction(SIGALRM, &sa, NULL);
+	if (ret < 0)
+		ksft_print_msg("Failed to install SIGALRM handler: %s (%d)\n",
+			       strerror(errno), errno);
+
+	/*
+	 * Run the test set if there is no SVE or SME, with those we
+	 * have to pick a VL for each run.
+	 */
+	if (!sve_supported()) {
+		test_config.sve_vl_in = 0;
+		test_config.sve_vl_expected = 0;
+		test_config.sme_vl_in = 0;
+		test_config.sme_vl_expected = 0;
+		test_config.svcr_in = 0;
+		test_config.svcr_expected = 0;
+
+		run_tests(base_test_defs, ARRAY_SIZE(base_test_defs),
+			  &test_config);
+	}
+
+	run_sve_tests();
+	run_sme_tests();
+
+	ksft_finished();
+}
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h
new file mode 100644
index 000000000000..db4f2c4d750c
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021-3 ARM Limited.
+
+#ifndef FP_PTRACE_H
+#define FP_PTRACE_H
+
+#define SVCR_SM_SHIFT 0
+#define SVCR_ZA_SHIFT 1
+
+#define SVCR_SM (1 << SVCR_SM_SHIFT)
+#define SVCR_ZA (1 << SVCR_ZA_SHIFT)
+
+#endif
diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore
index 839e3a252629..1ce5b5eac386 100644
--- a/tools/testing/selftests/arm64/signal/.gitignore
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 mangle_*
 fake_sigreturn_*
+fpmr_*
 sme_*
 ssve_*
 sve_*
diff --git a/tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c
new file mode 100644
index 000000000000..e9d24685e741
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ *
+ * Verify that the FPMR register context in signal frames is set up as
+ * expected.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+#include <asm/sigcontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+	ucontext_t uc;
+	char buf[1024 * 128];
+} context;
+
+#define SYS_FPMR "S3_3_C4_C4_2"
+
+static uint64_t get_fpmr(void)
+{
+	uint64_t val;
+
+	asm volatile (
+		"mrs	%0, " SYS_FPMR "\n"
+		: "=r"(val)
+		:
+		: "cc");
+
+	return val;
+}
+
+int fpmr_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+	struct fpmr_context *fpmr_ctx;
+	size_t offset;
+	bool in_sigframe;
+	bool have_fpmr;
+	__u64 orig_fpmr;
+
+	have_fpmr = getauxval(AT_HWCAP2) & HWCAP2_FPMR;
+	if (have_fpmr)
+		orig_fpmr = get_fpmr();
+
+	if (!get_current_context(td, &context.uc, sizeof(context)))
+		return 1;
+
+	fpmr_ctx = (struct fpmr_context *)
+		get_header(head, FPMR_MAGIC, td->live_sz, &offset);
+
+	in_sigframe = fpmr_ctx != NULL;
+
+	fprintf(stderr, "FPMR sigframe %s on system %s FPMR\n",
+		in_sigframe ? "present" : "absent",
+		have_fpmr ? "with" : "without");
+
+	td->pass = (in_sigframe == have_fpmr);
+
+	if (have_fpmr && fpmr_ctx) {
+		if (fpmr_ctx->fpmr != orig_fpmr) {
+			fprintf(stderr, "FPMR in frame is %llx, was %llx\n",
+				fpmr_ctx->fpmr, orig_fpmr);
+			td->pass = false;
+		}
+	}
+
+	return 0;
+}
+
+struct tdescr tde = {
+	.name = "FPMR",
+	.descr = "Validate that FPMR is present as expected",
+	.timeout = 3,
+	.run = fpmr_present,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c
index 9f580b55b388..674b88cc8c39 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.c
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -209,6 +209,14 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 			zt = (struct zt_context *)head;
 			new_flags |= ZT_CTX;
 			break;
+		case FPMR_MAGIC:
+			if (flags & FPMR_CTX)
+				*err = "Multiple FPMR_MAGIC";
+			else if (head->size !=
+				 sizeof(struct fpmr_context))
+				*err = "Bad size for fpmr_context";
+			new_flags |= FPMR_CTX;
+			break;
 		case EXTRA_MAGIC:
 			if (flags & EXTRA_CTX)
 				*err = "Multiple EXTRA_MAGIC";
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h
index a08ab0d6207a..7727126347e0 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.h
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -19,6 +19,7 @@
 #define ZA_CTX		(1 << 2)
 #define EXTRA_CTX	(1 << 3)
 #define ZT_CTX		(1 << 4)
+#define FPMR_CTX	(1 << 5)
 
 #define KSFT_BAD_MAGIC	0xdeadbeef
 
diff --git a/tools/testing/selftests/arm64/tags/tags_test.c b/tools/testing/selftests/arm64/tags/tags_test.c
index 5701163460ef..955f87c1170d 100644
--- a/tools/testing/selftests/arm64/tags/tags_test.c
+++ b/tools/testing/selftests/arm64/tags/tags_test.c
@@ -6,6 +6,7 @@
 #include <stdint.h>
 #include <sys/prctl.h>
 #include <sys/utsname.h>
+#include "../../kselftest.h"
 
 #define SHIFT_TAG(tag)		((uint64_t)(tag) << 56)
 #define SET_TAG(ptr, tag)	(((uint64_t)(ptr) & ~SHIFT_TAG(0xff)) | \
@@ -21,6 +22,9 @@ int main(void)
 	if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0) == 0)
 		tbi_enabled = 1;
 	ptr = (struct utsname *)malloc(sizeof(*ptr));
+	if (!ptr)
+		ksft_exit_fail_msg("Failed to allocate utsname buffer\n");
+
 	if (tbi_enabled)
 		tag = 0x42;
 	ptr = (struct utsname *)SET_TAG(ptr, tag);
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index f1aebabfb017..5025401323af 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -17,7 +17,6 @@ test_dev_cgroup
 test_verifier_log
 feature
 test_sock
-test_sock_addr
 urandom_read
 test_sockmap
 test_lirc_mode2_user
diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index d8ade15e2789..0445ac38bc07 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -10,5 +10,3 @@ fill_link_info/kprobe_multi_link_info            # bpf_program__attach_kprobe_mu
 fill_link_info/kretprobe_multi_link_info         # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 fill_link_info/kprobe_multi_invalid_ubuff        # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 missed/kprobe_recursion                          # missed_kprobe_recursion__attach unexpected error: -95 (errno 95)
-verifier_arena                                   # JIT does not support arena
-arena_htab                                       # JIT does not support arena
diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index f4a2f66a683d..c34adf39eeb2 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -6,3 +6,4 @@ stacktrace_build_id                      # compare_map_keys stackid_hmap vs. sta
 verifier_iterating_callbacks
 verifier_arena                           # JIT does not support arena
 arena_htab                               # JIT does not support arena
+arena_atomics
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3b9eb40d6343..e0b3887b3d2d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -53,6 +53,7 @@ progs/syscall.c-CFLAGS := -fno-strict-aliasing
 progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing
 progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing
 progs/timer_crash.c-CFLAGS := -fno-strict-aliasing
+progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing
 
 ifneq ($(LLVM),)
 # Silence some warnings when compiled with clang
@@ -81,11 +82,24 @@ TEST_INST_SUBDIRS += bpf_gcc
 # The following tests contain C code that, although technically legal,
 # triggers GCC warnings that cannot be disabled: declaration of
 # anonymous struct types in function parameter lists.
-progs/btf_dump_test_case_bitfields.c-CFLAGS := -Wno-error
-progs/btf_dump_test_case_namespacing.c-CFLAGS := -Wno-error
-progs/btf_dump_test_case_packing.c-CFLAGS := -Wno-error
-progs/btf_dump_test_case_padding.c-CFLAGS := -Wno-error
-progs/btf_dump_test_case_syntax.c-CFLAGS := -Wno-error
+progs/btf_dump_test_case_bitfields.c-bpf_gcc-CFLAGS := -Wno-error
+progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error
+progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error
+progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error
+progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error
+
+# The following tests do type-punning, via the __imm_insn macro, from
+# `struct bpf_insn' to long and then uses the value.  This triggers an
+# "is used uninitialized" warning in GCC due to strict-aliasing
+# rules.
+progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
+progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing
 endif
 
 ifneq ($(CLANG_CPUV4),)
@@ -102,8 +116,6 @@ TEST_PROGS := test_kmod.sh \
 	test_xdp_redirect_multi.sh \
 	test_xdp_meta.sh \
 	test_xdp_veth.sh \
-	test_offload.py \
-	test_sock_addr.sh \
 	test_tunnel.sh \
 	test_lwt_seg6local.sh \
 	test_lirc_mode2.sh \
@@ -128,7 +140,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 	test_xdp_vlan.sh test_bpftool.py
 
 # Compile but not part of 'make run_tests'
-TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
+TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
 	xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \
@@ -136,18 +148,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 
 TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi
 
-# Emit succinct information message describing current building step
-# $1 - generic step name (e.g., CC, LINK, etc);
-# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor];
-# $3 - target (assumed to be file); only file name will be emitted;
-# $4 - optional extra arg, emitted as-is, if provided.
-ifeq ($(V),1)
-Q =
-msg =
-else
-Q = @
-msg = @printf '  %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))";
-MAKEFLAGS += --no-print-directory
+ifneq ($(V),1)
 submake_extras := feature_display=0
 endif
 
@@ -274,7 +275,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 	$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower	       \
 		    OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF)     \
 		    BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/		       \
-		    BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf			       \
+		    BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/			       \
 		    BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR)		       \
 		    EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)'	       \
 		    EXTRA_LDFLAGS='$(SAN_LDFLAGS)' &&			       \
@@ -290,11 +291,11 @@ UNPRIV_HELPERS  := $(OUTPUT)/unpriv_helpers.o
 TRACE_HELPERS	:= $(OUTPUT)/trace_helpers.o
 JSON_WRITER		:= $(OUTPUT)/json_writer.o
 CAP_HELPERS	:= $(OUTPUT)/cap_helpers.o
+NETWORK_HELPERS := $(OUTPUT)/network_helpers.o
 
 $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS)
-$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS)
 $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS)
@@ -308,6 +309,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS)
 $(OUTPUT)/test_maps: $(TESTING_HELPERS)
 $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS)
 $(OUTPUT)/xsk.o: $(BPFOBJ)
+$(OUTPUT)/test_tcp_check_syncookie_user: $(NETWORK_HELPERS)
 
 BPFTOOL ?= $(DEFAULT_BPFTOOL)
 $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)    \
@@ -442,7 +444,7 @@ endef
 # Build BPF object using GCC
 define GCC_BPF_BUILD_RULE
 	$(call msg,GCC-BPF,$(TRUNNER_BINARY),$2)
-	$(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2
+	$(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2
 endef
 
 SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
@@ -455,7 +457,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c 		\
 	trace_printk.c trace_vprintk.c map_ptr_kern.c 			\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
-	test_ringbuf_map_key.c
+	test_ringbuf_n.c test_ringbuf_map_key.c
 
 # Generate both light skeleton and libbpf skeleton for these
 LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \
@@ -481,7 +483,7 @@ LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(ske
 # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES.
 # Parameters:
 # $1 - test runner base binary name (e.g., test_progs)
-# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc)
+# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc)
 define DEFINE_TEST_RUNNER
 
 TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2
@@ -509,7 +511,7 @@ endef
 # Using TRUNNER_XXX variables, provided by callers of DEFINE_TEST_RUNNER and
 # set up by DEFINE_TEST_RUNNER itself, create test runner build rules with:
 # $1 - test runner base binary name (e.g., test_progs)
-# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc)
+# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc)
 define DEFINE_TEST_RUNNER_RULES
 
 ifeq ($($(TRUNNER_OUTPUT)-dir),)
@@ -532,7 +534,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o:				\
 		     | $(TRUNNER_OUTPUT) $$(BPFOBJ)
 	$$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@,			\
 					  $(TRUNNER_BPF_CFLAGS)         \
-					  $$($$<-CFLAGS))
+					  $$($$<-CFLAGS)		\
+					  $$($$<-$2-CFLAGS))
 
 $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
 	$$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
@@ -658,7 +661,7 @@ $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32))
 # Define test_progs-cpuv4 test runner.
 ifneq ($(CLANG_CPUV4),)
 TRUNNER_BPF_BUILD_RULE := CLANG_CPUV4_BPF_BUILD_RULE
-TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS)
+TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS
 $(eval $(call DEFINE_TEST_RUNNER,test_progs,cpuv4))
 endif
 
@@ -695,7 +698,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT)
 
 # Include find_bit.c to compile xskxceiver.
 EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c
-$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT)
+$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT)
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
 
@@ -729,6 +732,7 @@ $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tas
 $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.skel.h
 $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
 $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h
+$(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -748,6 +752,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bpf_hashmap_lookup.o \
 		 $(OUTPUT)/bench_local_storage_create.o \
 		 $(OUTPUT)/bench_htab_mem.o \
+		 $(OUTPUT)/bench_bpf_crypto.o \
 		 #
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
@@ -759,7 +764,7 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o
 
 $(OUTPUT)/uprobe_multi: uprobe_multi.c
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@
+	$(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@
 
 EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)			\
 	prog_tests/tests.h map_tests/tests.h verifier/tests.h		\
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index b2b4c391eb0a..627b74ae041b 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -280,6 +280,8 @@ extern struct argp bench_strncmp_argp;
 extern struct argp bench_hashmap_lookup_argp;
 extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
+extern struct argp bench_trigger_batch_argp;
+extern struct argp bench_crypto_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -292,6 +294,8 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
 	{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
+	{ &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 },
+	{ &bench_crypto_argp, 0, "bpf crypto benchmark", 0 },
 	{},
 };
 
@@ -491,24 +495,31 @@ extern const struct bench bench_rename_kretprobe;
 extern const struct bench bench_rename_rawtp;
 extern const struct bench bench_rename_fentry;
 extern const struct bench bench_rename_fexit;
-extern const struct bench bench_trig_base;
-extern const struct bench bench_trig_tp;
-extern const struct bench bench_trig_rawtp;
+
+/* pure counting benchmarks to establish theoretical lmits */
+extern const struct bench bench_trig_usermode_count;
+extern const struct bench bench_trig_syscall_count;
+extern const struct bench bench_trig_kernel_count;
+
+/* batched, staying mostly in-kernel benchmarks */
 extern const struct bench bench_trig_kprobe;
 extern const struct bench bench_trig_kretprobe;
 extern const struct bench bench_trig_kprobe_multi;
 extern const struct bench bench_trig_kretprobe_multi;
 extern const struct bench bench_trig_fentry;
 extern const struct bench bench_trig_fexit;
-extern const struct bench bench_trig_fentry_sleep;
 extern const struct bench bench_trig_fmodret;
-extern const struct bench bench_trig_uprobe_base;
+extern const struct bench bench_trig_tp;
+extern const struct bench bench_trig_rawtp;
+
+/* uprobe/uretprobe benchmarks */
 extern const struct bench bench_trig_uprobe_nop;
 extern const struct bench bench_trig_uretprobe_nop;
 extern const struct bench bench_trig_uprobe_push;
 extern const struct bench bench_trig_uretprobe_push;
 extern const struct bench bench_trig_uprobe_ret;
 extern const struct bench bench_trig_uretprobe_ret;
+
 extern const struct bench bench_rb_libbpf;
 extern const struct bench bench_rb_custom;
 extern const struct bench bench_pb_libbpf;
@@ -529,6 +540,8 @@ extern const struct bench bench_local_storage_tasks_trace;
 extern const struct bench bench_bpf_hashmap_lookup;
 extern const struct bench bench_local_storage_create;
 extern const struct bench bench_htab_mem;
+extern const struct bench bench_crypto_encrypt;
+extern const struct bench bench_crypto_decrypt;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -539,24 +552,28 @@ static const struct bench *benchs[] = {
 	&bench_rename_rawtp,
 	&bench_rename_fentry,
 	&bench_rename_fexit,
-	&bench_trig_base,
-	&bench_trig_tp,
-	&bench_trig_rawtp,
+	/* pure counting benchmarks for establishing theoretical limits */
+	&bench_trig_usermode_count,
+	&bench_trig_kernel_count,
+	&bench_trig_syscall_count,
+	/* batched, staying mostly in-kernel triggers */
 	&bench_trig_kprobe,
 	&bench_trig_kretprobe,
 	&bench_trig_kprobe_multi,
 	&bench_trig_kretprobe_multi,
 	&bench_trig_fentry,
 	&bench_trig_fexit,
-	&bench_trig_fentry_sleep,
 	&bench_trig_fmodret,
-	&bench_trig_uprobe_base,
+	&bench_trig_tp,
+	&bench_trig_rawtp,
+	/* uprobes */
 	&bench_trig_uprobe_nop,
 	&bench_trig_uretprobe_nop,
 	&bench_trig_uprobe_push,
 	&bench_trig_uretprobe_push,
 	&bench_trig_uprobe_ret,
 	&bench_trig_uretprobe_ret,
+	/* ringbuf/perfbuf benchmarks */
 	&bench_rb_libbpf,
 	&bench_rb_custom,
 	&bench_pb_libbpf,
@@ -577,6 +594,8 @@ static const struct bench *benchs[] = {
 	&bench_bpf_hashmap_lookup,
 	&bench_local_storage_create,
 	&bench_htab_mem,
+	&bench_crypto_encrypt,
+	&bench_crypto_decrypt,
 };
 
 static void find_benchmark(void)
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c
new file mode 100644
index 000000000000..2845edaba8db
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_crypto.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <argp.h>
+#include "bench.h"
+#include "crypto_bench.skel.h"
+
+#define MAX_CIPHER_LEN 32
+static char *input;
+static struct crypto_ctx {
+	struct crypto_bench *skel;
+	int pfd;
+} ctx;
+
+static struct crypto_args {
+	u32 crypto_len;
+	char *crypto_cipher;
+} args = {
+	.crypto_len = 16,
+	.crypto_cipher = "ecb(aes)",
+};
+
+enum {
+	ARG_CRYPTO_LEN = 5000,
+	ARG_CRYPTO_CIPHER = 5001,
+};
+
+static const struct argp_option opts[] = {
+	{ "crypto-len", ARG_CRYPTO_LEN, "CRYPTO_LEN", 0,
+	  "Set the length of crypto buffer" },
+	{ "crypto-cipher", ARG_CRYPTO_CIPHER, "CRYPTO_CIPHER", 0,
+	  "Set the cipher to use (default:ecb(aes))" },
+	{},
+};
+
+static error_t crypto_parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_CRYPTO_LEN:
+		args.crypto_len = strtoul(arg, NULL, 10);
+		if (!args.crypto_len ||
+		    args.crypto_len > sizeof(ctx.skel->bss->dst)) {
+			fprintf(stderr, "Invalid crypto buffer len (limit %zu)\n",
+				sizeof(ctx.skel->bss->dst));
+			argp_usage(state);
+		}
+		break;
+	case ARG_CRYPTO_CIPHER:
+		args.crypto_cipher = strdup(arg);
+		if (!strlen(args.crypto_cipher) ||
+		    strlen(args.crypto_cipher) > MAX_CIPHER_LEN) {
+			fprintf(stderr, "Invalid crypto cipher len (limit %d)\n",
+				MAX_CIPHER_LEN);
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_crypto_argp = {
+	.options = opts,
+	.parser = crypto_parse_arg,
+};
+
+static void crypto_validate(void)
+{
+	if (env.consumer_cnt != 0) {
+		fprintf(stderr, "bpf crypto benchmark doesn't support consumer!\n");
+		exit(1);
+	}
+}
+
+static void crypto_setup(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+
+	int err, pfd;
+	size_t i, sz;
+
+	sz = args.crypto_len;
+	if (!sz || sz > sizeof(ctx.skel->bss->dst)) {
+		fprintf(stderr, "invalid encrypt buffer size (source %zu, target %zu)\n",
+			sz, sizeof(ctx.skel->bss->dst));
+		exit(1);
+	}
+
+	setup_libbpf();
+
+	ctx.skel = crypto_bench__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	snprintf(ctx.skel->bss->cipher, 128, "%s", args.crypto_cipher);
+	memcpy(ctx.skel->bss->key, "12345678testtest", 16);
+	ctx.skel->bss->key_len = 16;
+	ctx.skel->bss->authsize = 0;
+
+	srandom(time(NULL));
+	input = malloc(sz);
+	for (i = 0; i < sz - 1; i++)
+		input[i] = '1' + random() % 9;
+	input[sz - 1] = '\0';
+
+	ctx.skel->rodata->len = args.crypto_len;
+
+	err = crypto_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		crypto_bench__destroy(ctx.skel);
+		exit(1);
+	}
+
+	pfd = bpf_program__fd(ctx.skel->progs.crypto_setup);
+	if (pfd < 0) {
+		fprintf(stderr, "failed to get fd for setup prog\n");
+		crypto_bench__destroy(ctx.skel);
+		exit(1);
+	}
+
+	err = bpf_prog_test_run_opts(pfd, &opts);
+	if (err || ctx.skel->bss->status) {
+		fprintf(stderr, "failed to run setup prog: err %d, status %d\n",
+			err, ctx.skel->bss->status);
+		crypto_bench__destroy(ctx.skel);
+		exit(1);
+	}
+}
+
+static void crypto_encrypt_setup(void)
+{
+	crypto_setup();
+	ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_encrypt);
+}
+
+static void crypto_decrypt_setup(void)
+{
+	crypto_setup();
+	ctx.pfd = bpf_program__fd(ctx.skel->progs.crypto_decrypt);
+}
+
+static void crypto_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+static void *crypto_producer(void *unused)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.repeat = 64,
+		.data_in = input,
+		.data_size_in = args.crypto_len,
+	);
+
+	while (true)
+		(void)bpf_prog_test_run_opts(ctx.pfd, &opts);
+	return NULL;
+}
+
+const struct bench bench_crypto_encrypt = {
+	.name = "crypto-encrypt",
+	.argp = &bench_crypto_argp,
+	.validate = crypto_validate,
+	.setup = crypto_encrypt_setup,
+	.producer_thread = crypto_producer,
+	.measure = crypto_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
+
+const struct bench bench_crypto_decrypt = {
+	.name = "crypto-decrypt",
+	.argp = &bench_crypto_argp,
+	.validate = crypto_validate,
+	.setup = crypto_decrypt_setup,
+	.producer_thread = crypto_producer,
+	.measure = crypto_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
index b36de42ee4d9..e2ff8ea1cb79 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
@@ -186,7 +186,7 @@ static void *task_producer(void *input)
 
 		for (i = 0; i < batch_sz; i++) {
 			if (!pthd_results[i])
-				pthread_join(pthds[i], NULL);;
+				pthread_join(pthds[i], NULL);
 		}
 	}
 
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index ace0d1011a8e..4b05539f167d 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -1,15 +1,95 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <argp.h>
+#include <unistd.h>
+#include <stdint.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"
 
+#define MAX_TRIG_BATCH_ITERS 1000
+
+static struct {
+	__u32 batch_iters;
+} args = {
+	.batch_iters = 100,
+};
+
+enum {
+	ARG_TRIG_BATCH_ITERS = 7000,
+};
+
+static const struct argp_option opts[] = {
+	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
+		"Number of in-kernel iterations per one driver test run"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_TRIG_BATCH_ITERS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > MAX_TRIG_BATCH_ITERS) {
+			fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n",
+				1, MAX_TRIG_BATCH_ITERS);
+			argp_usage(state);
+		}
+		args.batch_iters = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_trigger_batch_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+/* adjust slot shift in inc_hits() if changing */
+#define MAX_BUCKETS 256
+
+#pragma GCC diagnostic ignored "-Wattributes"
+
 /* BPF triggering benchmarks */
 static struct trigger_ctx {
 	struct trigger_bench *skel;
+	bool usermode_counters;
+	int driver_prog_fd;
 } ctx;
 
-static struct counter base_hits;
+static struct counter base_hits[MAX_BUCKETS];
+
+static __always_inline void inc_counter(struct counter *counters)
+{
+	static __thread int tid = 0;
+	unsigned slot;
+
+	if (unlikely(tid == 0))
+		tid = syscall(SYS_gettid);
+
+	/* multiplicative hashing, it's fast */
+	slot = 2654435769U * tid;
+	slot >>= 24;
+
+	atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
+}
+
+static long sum_and_reset_counters(struct counter *counters)
+{
+	int i;
+	long sum = 0;
+
+	for (i = 0; i < MAX_BUCKETS; i++)
+		sum += atomic_swap(&counters[i].value, 0);
+	return sum;
+}
 
 static void trigger_validate(void)
 {
@@ -19,41 +99,63 @@ static void trigger_validate(void)
 	}
 }
 
-static void *trigger_base_producer(void *input)
+static void *trigger_producer(void *input)
 {
-	while (true) {
-		(void)syscall(__NR_getpgid);
-		atomic_inc(&base_hits.value);
+	if (ctx.usermode_counters) {
+		while (true) {
+			(void)syscall(__NR_getpgid);
+			inc_counter(base_hits);
+		}
+	} else {
+		while (true)
+			(void)syscall(__NR_getpgid);
 	}
 	return NULL;
 }
 
-static void trigger_base_measure(struct bench_res *res)
+static void *trigger_producer_batch(void *input)
 {
-	res->hits = atomic_swap(&base_hits.value, 0);
-}
+	int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver);
 
-static void *trigger_producer(void *input)
-{
 	while (true)
-		(void)syscall(__NR_getpgid);
+		bpf_prog_test_run_opts(fd, NULL);
+
 	return NULL;
 }
 
 static void trigger_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+	if (ctx.usermode_counters)
+		res->hits = sum_and_reset_counters(base_hits);
+	else
+		res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
 }
 
 static void setup_ctx(void)
 {
 	setup_libbpf();
 
-	ctx.skel = trigger_bench__open_and_load();
+	ctx.skel = trigger_bench__open();
 	if (!ctx.skel) {
 		fprintf(stderr, "failed to open skeleton\n");
 		exit(1);
 	}
+
+	/* default "driver" BPF program */
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true);
+
+	ctx.skel->rodata->batch_iters = args.batch_iters;
+}
+
+static void load_ctx(void)
+{
+	int err;
+
+	err = trigger_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
 }
 
 static void attach_bpf(struct bpf_program *prog)
@@ -67,64 +169,104 @@ static void attach_bpf(struct bpf_program *prog)
 	}
 }
 
-static void trigger_tp_setup(void)
+static void trigger_syscall_count_setup(void)
 {
-	setup_ctx();
-	attach_bpf(ctx.skel->progs.bench_trigger_tp);
+	ctx.usermode_counters = true;
 }
 
-static void trigger_rawtp_setup(void)
+/* Batched, staying mostly in-kernel triggering setups */
+static void trigger_kernel_count_setup(void)
 {
 	setup_ctx();
-	attach_bpf(ctx.skel->progs.bench_trigger_raw_tp);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_count, true);
+	load_ctx();
+	/* override driver program */
+	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count);
 }
 
 static void trigger_kprobe_setup(void)
 {
 	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true);
+	load_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
 }
 
 static void trigger_kretprobe_setup(void)
 {
 	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true);
+	load_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe);
 }
 
 static void trigger_kprobe_multi_setup(void)
 {
 	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true);
+	load_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi);
 }
 
 static void trigger_kretprobe_multi_setup(void)
 {
 	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true);
+	load_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi);
 }
 
 static void trigger_fentry_setup(void)
 {
 	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true);
+	load_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_fentry);
 }
 
 static void trigger_fexit_setup(void)
 {
 	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true);
+	load_ctx();
 	attach_bpf(ctx.skel->progs.bench_trigger_fexit);
 }
 
-static void trigger_fentry_sleep_setup(void)
+static void trigger_fmodret_setup(void)
 {
 	setup_ctx();
-	attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true);
+	load_ctx();
+	/* override driver program */
+	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
+	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
 }
 
-static void trigger_fmodret_setup(void)
+static void trigger_tp_setup(void)
 {
 	setup_ctx();
-	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true);
+	load_ctx();
+	/* override driver program */
+	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
+	attach_bpf(ctx.skel->progs.bench_trigger_tp);
+}
+
+static void trigger_rawtp_setup(void)
+{
+	setup_ctx();
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
+	bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true);
+	load_ctx();
+	/* override driver program */
+	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
+	attach_bpf(ctx.skel->progs.bench_trigger_rawtp);
 }
 
 /* make sure call is not inlined and not avoided by compiler, so __weak and
@@ -137,7 +279,7 @@ static void trigger_fmodret_setup(void)
  * GCC doesn't generate stack setup preample for these functions due to them
  * having no input arguments and doing nothing in the body.
  */
-__weak void uprobe_target_nop(void)
+__nocf_check __weak void uprobe_target_nop(void)
 {
 	asm volatile ("nop");
 }
@@ -146,7 +288,7 @@ __weak void opaque_noop_func(void)
 {
 }
 
-__weak int uprobe_target_push(void)
+__nocf_check __weak int uprobe_target_push(void)
 {
 	/* overhead of function call is negligible compared to uprobe
 	 * triggering, so this shouldn't affect benchmark results much
@@ -155,16 +297,16 @@ __weak int uprobe_target_push(void)
 	return 1;
 }
 
-__weak void uprobe_target_ret(void)
+__nocf_check __weak void uprobe_target_ret(void)
 {
 	asm volatile ("");
 }
 
-static void *uprobe_base_producer(void *input)
+static void *uprobe_producer_count(void *input)
 {
 	while (true) {
 		uprobe_target_nop();
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }
@@ -194,15 +336,24 @@ static void usetup(bool use_retprobe, void *target_addr)
 {
 	size_t uprobe_offset;
 	struct bpf_link *link;
+	int err;
 
 	setup_libbpf();
 
-	ctx.skel = trigger_bench__open_and_load();
+	ctx.skel = trigger_bench__open();
 	if (!ctx.skel) {
 		fprintf(stderr, "failed to open skeleton\n");
 		exit(1);
 	}
 
+	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true);
+
+	err = trigger_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		exit(1);
+	}
+
 	uprobe_offset = get_uprobe_offset(target_addr);
 	link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
 					  use_retprobe,
@@ -216,204 +367,90 @@ static void usetup(bool use_retprobe, void *target_addr)
 	ctx.skel->links.bench_trigger_uprobe = link;
 }
 
-static void uprobe_setup_nop(void)
+static void usermode_count_setup(void)
+{
+	ctx.usermode_counters = true;
+}
+
+static void uprobe_nop_setup(void)
 {
 	usetup(false, &uprobe_target_nop);
 }
 
-static void uretprobe_setup_nop(void)
+static void uretprobe_nop_setup(void)
 {
 	usetup(true, &uprobe_target_nop);
 }
 
-static void uprobe_setup_push(void)
+static void uprobe_push_setup(void)
 {
 	usetup(false, &uprobe_target_push);
 }
 
-static void uretprobe_setup_push(void)
+static void uretprobe_push_setup(void)
 {
 	usetup(true, &uprobe_target_push);
 }
 
-static void uprobe_setup_ret(void)
+static void uprobe_ret_setup(void)
 {
 	usetup(false, &uprobe_target_ret);
 }
 
-static void uretprobe_setup_ret(void)
+static void uretprobe_ret_setup(void)
 {
 	usetup(true, &uprobe_target_ret);
 }
 
-const struct bench bench_trig_base = {
-	.name = "trig-base",
+const struct bench bench_trig_syscall_count = {
+	.name = "trig-syscall-count",
 	.validate = trigger_validate,
-	.producer_thread = trigger_base_producer,
-	.measure = trigger_base_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_tp = {
-	.name = "trig-tp",
-	.validate = trigger_validate,
-	.setup = trigger_tp_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_rawtp = {
-	.name = "trig-rawtp",
-	.validate = trigger_validate,
-	.setup = trigger_rawtp_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_kprobe = {
-	.name = "trig-kprobe",
-	.validate = trigger_validate,
-	.setup = trigger_kprobe_setup,
+	.setup = trigger_syscall_count_setup,
 	.producer_thread = trigger_producer,
 	.measure = trigger_measure,
 	.report_progress = hits_drops_report_progress,
 	.report_final = hits_drops_report_final,
 };
 
-const struct bench bench_trig_kretprobe = {
-	.name = "trig-kretprobe",
-	.validate = trigger_validate,
-	.setup = trigger_kretprobe_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_kprobe_multi = {
-	.name = "trig-kprobe-multi",
-	.validate = trigger_validate,
-	.setup = trigger_kprobe_multi_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_kretprobe_multi = {
-	.name = "trig-kretprobe-multi",
-	.validate = trigger_validate,
-	.setup = trigger_kretprobe_multi_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_fentry = {
-	.name = "trig-fentry",
-	.validate = trigger_validate,
-	.setup = trigger_fentry_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_fexit = {
-	.name = "trig-fexit",
-	.validate = trigger_validate,
-	.setup = trigger_fexit_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_fentry_sleep = {
-	.name = "trig-fentry-sleep",
-	.validate = trigger_validate,
-	.setup = trigger_fentry_sleep_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_fmodret = {
-	.name = "trig-fmodret",
-	.validate = trigger_validate,
-	.setup = trigger_fmodret_setup,
-	.producer_thread = trigger_producer,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uprobe_base = {
-	.name = "trig-uprobe-base",
-	.setup = NULL, /* no uprobe/uretprobe is attached */
-	.producer_thread = uprobe_base_producer,
-	.measure = trigger_base_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uprobe_nop = {
-	.name = "trig-uprobe-nop",
-	.setup = uprobe_setup_nop,
-	.producer_thread = uprobe_producer_nop,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uretprobe_nop = {
-	.name = "trig-uretprobe-nop",
-	.setup = uretprobe_setup_nop,
-	.producer_thread = uprobe_producer_nop,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uprobe_push = {
-	.name = "trig-uprobe-push",
-	.setup = uprobe_setup_push,
-	.producer_thread = uprobe_producer_push,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uretprobe_push = {
-	.name = "trig-uretprobe-push",
-	.setup = uretprobe_setup_push,
-	.producer_thread = uprobe_producer_push,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uprobe_ret = {
-	.name = "trig-uprobe-ret",
-	.setup = uprobe_setup_ret,
-	.producer_thread = uprobe_producer_ret,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
-
-const struct bench bench_trig_uretprobe_ret = {
-	.name = "trig-uretprobe-ret",
-	.setup = uretprobe_setup_ret,
-	.producer_thread = uprobe_producer_ret,
-	.measure = trigger_measure,
-	.report_progress = hits_drops_report_progress,
-	.report_final = hits_drops_report_final,
-};
+/* batched (staying mostly in kernel) kprobe/fentry benchmarks */
+#define BENCH_TRIG_KERNEL(KIND, NAME)					\
+const struct bench bench_trig_##KIND = {				\
+	.name = "trig-" NAME,						\
+	.setup = trigger_##KIND##_setup,				\
+	.producer_thread = trigger_producer_batch,			\
+	.measure = trigger_measure,					\
+	.report_progress = hits_drops_report_progress,			\
+	.report_final = hits_drops_report_final,			\
+	.argp = &bench_trigger_batch_argp,				\
+}
+
+BENCH_TRIG_KERNEL(kernel_count, "kernel-count");
+BENCH_TRIG_KERNEL(kprobe, "kprobe");
+BENCH_TRIG_KERNEL(kretprobe, "kretprobe");
+BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi");
+BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi");
+BENCH_TRIG_KERNEL(fentry, "fentry");
+BENCH_TRIG_KERNEL(fexit, "fexit");
+BENCH_TRIG_KERNEL(fmodret, "fmodret");
+BENCH_TRIG_KERNEL(tp, "tp");
+BENCH_TRIG_KERNEL(rawtp, "rawtp");
+
+/* uprobe benchmarks */
+#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME)			\
+const struct bench bench_trig_##KIND = {				\
+	.name = "trig-" NAME,						\
+	.validate = trigger_validate,					\
+	.setup = KIND##_setup,						\
+	.producer_thread = uprobe_producer_##PRODUCER,			\
+	.measure = trigger_measure,					\
+	.report_progress = hits_drops_report_progress,			\
+	.report_final = hits_drops_report_final,			\
+}
+
+BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count");
+BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop");
+BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push");
+BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret");
+BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop");
+BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push");
+BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret");
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index 78e83f243294..a690f5a68b6b 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -2,8 +2,22 @@
 
 set -eufo pipefail
 
-for i in base tp rawtp kprobe fentry fmodret
-do
-	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
-	printf "%-10s: %s\n" $i "$summary"
+def_tests=( \
+	usermode-count kernel-count syscall-count \
+	fentry fexit fmodret \
+	rawtp tp \
+	kprobe kprobe-multi \
+	kretprobe kretprobe-multi \
+)
+
+tests=("$@")
+if [ ${#tests[@]} -eq 0 ]; then
+	tests=("${def_tests[@]}")
+fi
+
+p=${PROD_CNT:-1}
+
+for t in "${tests[@]}"; do
+	summary=$(sudo ./bench -w2 -d5 -a -p$p trig-$t | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
+	printf "%-15s: %s\n" $t "$summary"
 done
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
index 9bdcc74e03a4..af169f831f2f 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
@@ -2,7 +2,7 @@
 
 set -eufo pipefail
 
-for i in base {uprobe,uretprobe}-{nop,push,ret}
+for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret}
 do
 	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
 	printf "%-15s: %s\n" $i "$summary"
diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h
index bcf195c64a45..567491f3e1b5 100644
--- a/tools/testing/selftests/bpf/bpf_arena_common.h
+++ b/tools/testing/selftests/bpf/bpf_arena_common.h
@@ -32,7 +32,7 @@
  */
 #endif
 
-#if defined(__BPF_FEATURE_ARENA_CAST) && !defined(BPF_ARENA_FORCE_ASM)
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
 #define __arena __attribute__((address_space(1)))
 #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
 #define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h
index b99b9f408eff..85dbc3ea4da5 100644
--- a/tools/testing/selftests/bpf/bpf_arena_list.h
+++ b/tools/testing/selftests/bpf/bpf_arena_list.h
@@ -29,6 +29,7 @@ static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { re
 static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {}
 static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
 #define cond_break ({})
+#define can_loop true
 #endif
 
 /* Safely walk link list elements. Deletion of elements is allowed. */
@@ -36,8 +37,7 @@ static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
 	for (void * ___tmp = (pos = list_entry_safe((head)->first,		\
 						    typeof(*(pos)), member),	\
 			      (void *)0);					\
-	     pos && ({ ___tmp = (void *)pos->member.next; 1; });		\
-	     cond_break,							\
+	     pos && ({ ___tmp = (void *)pos->member.next; 1; }) && can_loop;    \
 	     pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))
 
 static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index a5b9df38c162..3d9e4b8c6b81 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -326,9 +326,48 @@ l_true:												\
        })
 #endif
 
+/*
+ * Note that cond_break can only be portably used in the body of a breakable
+ * construct, whereas can_loop can be used anywhere.
+ */
+#ifdef __BPF_FEATURE_MAY_GOTO
+#define can_loop					\
+	({ __label__ l_break, l_continue;		\
+	bool ret = true;				\
+	asm volatile goto("may_goto %l[l_break]"	\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: ret = false;				\
+	l_continue:;					\
+	ret;						\
+	})
+
+#define cond_break					\
+	({ __label__ l_break, l_continue;		\
+	asm volatile goto("may_goto %l[l_break]"	\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: break;					\
+	l_continue:;					\
+	})
+#else
+#define can_loop					\
+	({ __label__ l_break, l_continue;		\
+	bool ret = true;				\
+	asm volatile goto("1:.byte 0xe5;		\
+		      .byte 0;				\
+		      .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;	\
+		      .short 0"				\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: ret = false;				\
+	l_continue:;					\
+	ret;						\
+	})
+
 #define cond_break					\
 	({ __label__ l_break, l_continue;		\
-	 asm volatile goto("1:.byte 0xe5;			\
+	asm volatile goto("1:.byte 0xe5;		\
 		      .byte 0;				\
 		      .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;	\
 		      .short 0"				\
@@ -337,6 +376,7 @@ l_true:												\
 	l_break: break;					\
 	l_continue:;					\
 	})
+#endif
 
 #ifndef bpf_nop_mov
 #define bpf_nop_mov(var) \
@@ -386,6 +426,28 @@ l_true:												\
 		     , [as]"i"((dst_as << 16) | src_as));
 #endif
 
+void bpf_preempt_disable(void) __weak __ksym;
+void bpf_preempt_enable(void) __weak __ksym;
+
+typedef struct {
+} __bpf_preempt_t;
+
+static inline __bpf_preempt_t __bpf_preempt_constructor(void)
+{
+	__bpf_preempt_t ret = {};
+
+	bpf_preempt_disable();
+	return ret;
+}
+static inline void __bpf_preempt_destructor(__bpf_preempt_t *t)
+{
+	bpf_preempt_enable();
+}
+#define bpf_guard_preempt() \
+	__bpf_preempt_t ___bpf_apply(preempt, __COUNTER__)			\
+	__attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) =	\
+	__bpf_preempt_constructor()
+
 /* Description
  *	Assert that a conditional expression is true.
  * Returns
@@ -459,4 +521,11 @@ extern int bpf_iter_css_new(struct bpf_iter_css *it,
 extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
 extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
 
+extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
+extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
+extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+		int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
+		unsigned int flags__k, void *aux__ign) __ksym;
+#define bpf_wq_set_callback(timer, cb, flags) \
+	bpf_wq_set_callback_impl(timer, cb, flags, NULL)
 #endif
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 14ebe7d9e1a3..be91a6919315 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -75,4 +75,7 @@ extern void bpf_key_put(struct bpf_key *key) __ksym;
 extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
 				      struct bpf_dynptr *sig_ptr,
 				      struct bpf_key *trusted_keyring) __ksym;
+
+extern bool bpf_session_is_return(void) __ksym __weak;
+extern long *bpf_session_cookie(void) __ksym __weak;
 #endif
diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
deleted file mode 100644
index 82a7c9de95f9..000000000000
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __BPF_TCP_HELPERS_H
-#define __BPF_TCP_HELPERS_H
-
-#include <stdbool.h>
-#include <linux/types.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_core_read.h>
-#include <bpf/bpf_tracing.h>
-
-#define BPF_STRUCT_OPS(name, args...) \
-SEC("struct_ops/"#name) \
-BPF_PROG(name, args)
-
-#ifndef SOL_TCP
-#define SOL_TCP 6
-#endif
-
-#ifndef TCP_CA_NAME_MAX
-#define TCP_CA_NAME_MAX	16
-#endif
-
-#define tcp_jiffies32 ((__u32)bpf_jiffies64())
-
-struct sock_common {
-	unsigned char	skc_state;
-	__u16		skc_num;
-} __attribute__((preserve_access_index));
-
-enum sk_pacing {
-	SK_PACING_NONE		= 0,
-	SK_PACING_NEEDED	= 1,
-	SK_PACING_FQ		= 2,
-};
-
-struct sock {
-	struct sock_common	__sk_common;
-#define sk_state		__sk_common.skc_state
-	unsigned long		sk_pacing_rate;
-	__u32			sk_pacing_status; /* see enum sk_pacing */
-} __attribute__((preserve_access_index));
-
-struct inet_sock {
-	struct sock		sk;
-} __attribute__((preserve_access_index));
-
-struct inet_connection_sock {
-	struct inet_sock	  icsk_inet;
-	__u8			  icsk_ca_state:6,
-				  icsk_ca_setsockopt:1,
-				  icsk_ca_dst_locked:1;
-	struct {
-		__u8		  pending;
-	} icsk_ack;
-	__u64			  icsk_ca_priv[104 / sizeof(__u64)];
-} __attribute__((preserve_access_index));
-
-struct request_sock {
-	struct sock_common		__req_common;
-} __attribute__((preserve_access_index));
-
-struct tcp_sock {
-	struct inet_connection_sock	inet_conn;
-
-	__u32	rcv_nxt;
-	__u32	snd_nxt;
-	__u32	snd_una;
-	__u32	window_clamp;
-	__u8	ecn_flags;
-	__u32	delivered;
-	__u32	delivered_ce;
-	__u32	snd_cwnd;
-	__u32	snd_cwnd_cnt;
-	__u32	snd_cwnd_clamp;
-	__u32	snd_ssthresh;
-	__u8	syn_data:1,	/* SYN includes data */
-		syn_fastopen:1,	/* SYN includes Fast Open option */
-		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
-		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
-		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
-		save_syn:1,	/* Save headers of SYN packet */
-		is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
-		syn_smc:1;	/* SYN includes SMC */
-	__u32	max_packets_out;
-	__u32	lsndtime;
-	__u32	prior_cwnd;
-	__u64	tcp_mstamp;	/* most recent packet received/sent */
-	bool	is_mptcp;
-} __attribute__((preserve_access_index));
-
-static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
-{
-	return (struct inet_connection_sock *)sk;
-}
-
-static __always_inline void *inet_csk_ca(const struct sock *sk)
-{
-	return (void *)inet_csk(sk)->icsk_ca_priv;
-}
-
-static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
-{
-	return (struct tcp_sock *)sk;
-}
-
-static __always_inline bool before(__u32 seq1, __u32 seq2)
-{
-	return (__s32)(seq1-seq2) < 0;
-}
-#define after(seq2, seq1) 	before(seq1, seq2)
-
-#define	TCP_ECN_OK		1
-#define	TCP_ECN_QUEUE_CWR	2
-#define	TCP_ECN_DEMAND_CWR	4
-#define	TCP_ECN_SEEN		8
-
-enum inet_csk_ack_state_t {
-	ICSK_ACK_SCHED	= 1,
-	ICSK_ACK_TIMER  = 2,
-	ICSK_ACK_PUSHED = 4,
-	ICSK_ACK_PUSHED2 = 8,
-	ICSK_ACK_NOW = 16	/* Send the next ACK immediately (once) */
-};
-
-enum tcp_ca_event {
-	CA_EVENT_TX_START = 0,
-	CA_EVENT_CWND_RESTART = 1,
-	CA_EVENT_COMPLETE_CWR = 2,
-	CA_EVENT_LOSS = 3,
-	CA_EVENT_ECN_NO_CE = 4,
-	CA_EVENT_ECN_IS_CE = 5,
-};
-
-struct ack_sample {
-	__u32 pkts_acked;
-	__s32 rtt_us;
-	__u32 in_flight;
-} __attribute__((preserve_access_index));
-
-struct rate_sample {
-	__u64  prior_mstamp; /* starting timestamp for interval */
-	__u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
-	__s32  delivered;		/* number of packets delivered over interval */
-	long interval_us;	/* time for tp->delivered to incr "delivered" */
-	__u32 snd_interval_us;	/* snd interval for delivered packets */
-	__u32 rcv_interval_us;	/* rcv interval for delivered packets */
-	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
-	int  losses;		/* number of packets marked lost upon ACK */
-	__u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
-	__u32  prior_in_flight;	/* in flight before this ACK */
-	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
-	bool is_retrans;	/* is sample from retransmission? */
-	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
-} __attribute__((preserve_access_index));
-
-#define TCP_CA_NAME_MAX		16
-#define TCP_CONG_NEEDS_ECN	0x2
-
-struct tcp_congestion_ops {
-	char name[TCP_CA_NAME_MAX];
-	__u32 flags;
-
-	/* initialize private data (optional) */
-	void (*init)(struct sock *sk);
-	/* cleanup private data  (optional) */
-	void (*release)(struct sock *sk);
-
-	/* return slow start threshold (required) */
-	__u32 (*ssthresh)(struct sock *sk);
-	/* do new cwnd calculation (required) */
-	void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
-	/* call before changing ca_state (optional) */
-	void (*set_state)(struct sock *sk, __u8 new_state);
-	/* call when cwnd event occurs (optional) */
-	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
-	/* call when ack arrives (optional) */
-	void (*in_ack_event)(struct sock *sk, __u32 flags);
-	/* new value of cwnd after loss (required) */
-	__u32  (*undo_cwnd)(struct sock *sk);
-	/* hook for packet ack accounting (optional) */
-	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
-	/* override sysctl_tcp_min_tso_segs */
-	__u32 (*min_tso_segs)(struct sock *sk);
-	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
-	__u32 (*sndbuf_expand)(struct sock *sk);
-	/* call when packets are delivered to update cwnd and pacing rate,
-	 * after all the ca_state processing. (optional)
-	 */
-	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
-	void *owner;
-};
-
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#define min_not_zero(x, y) ({			\
-	typeof(x) __x = (x);			\
-	typeof(y) __y = (y);			\
-	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
-
-static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
-{
-	return tp->snd_cwnd < tp->snd_ssthresh;
-}
-
-static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
-	if (tcp_in_slow_start(tp))
-		return tp->snd_cwnd < 2 * tp->max_packets_out;
-
-	return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
-}
-
-static __always_inline bool tcp_cc_eq(const char *a, const char *b)
-{
-	int i;
-
-	for (i = 0; i < TCP_CA_NAME_MAX; i++) {
-		if (a[i] != b[i])
-			return false;
-		if (!a[i])
-			break;
-	}
-
-	return true;
-}
-
-extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
-extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
-
-struct mptcp_sock {
-	struct inet_connection_sock	sk;
-
-	__u32		token;
-	struct sock	*first;
-	char		ca_name[TCP_CA_NAME_MAX];
-} __attribute__((preserve_access_index));
-
-#endif
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 39ad96a18123..2a18bd320e92 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -10,18 +10,30 @@
 #include <linux/percpu-defs.h>
 #include <linux/sysfs.h>
 #include <linux/tracepoint.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/nsproxy.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/un.h>
+#include <net/sock.h>
 #include "bpf_testmod.h"
 #include "bpf_testmod_kfunc.h"
 
 #define CREATE_TRACE_POINTS
 #include "bpf_testmod-events.h"
 
+#define CONNECT_TIMEOUT_SEC 1
+
 typedef int (*func_proto_typedef)(long);
 typedef int (*func_proto_typedef_nested1)(func_proto_typedef);
 typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1);
 
 DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123;
 long bpf_testmod_test_struct_arg_result;
+static DEFINE_MUTEX(sock_lock);
+static struct socket *sock;
 
 struct bpf_testmod_struct_arg_1 {
 	int a;
@@ -205,6 +217,9 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg)
 	case 5: return (void *)~(1ull << 30);	/* trigger extable */
 	case 6: return &f;			/* valid addr */
 	case 7: return (void *)((long)&f | 1);	/* kernel tricks */
+#ifdef CONFIG_X86_64
+	case 8: return (void *)VSYSCALL_ADDR;   /* vsyscall page address */
+#endif
 	default: return NULL;
 	}
 }
@@ -494,6 +509,241 @@ __bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused
 	return arg;
 }
 
+__bpf_kfunc void bpf_kfunc_call_test_sleepable(void)
+{
+}
+
+__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args)
+{
+	int proto;
+	int err;
+
+	mutex_lock(&sock_lock);
+
+	if (sock) {
+		pr_err("%s called without releasing old sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	switch (args->af) {
+	case AF_INET:
+	case AF_INET6:
+		proto = args->type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP;
+		break;
+	case AF_UNIX:
+		proto = PF_UNIX;
+		break;
+	default:
+		pr_err("invalid address family %d\n", args->af);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type,
+			       proto, &sock);
+
+	if (!err)
+		/* Set timeout for call to kernel_connect() to prevent it from hanging,
+		 * and consider the connection attempt failed if it returns
+		 * -EINPROGRESS.
+		 */
+		sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ;
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc void bpf_kfunc_close_sock(void)
+{
+	mutex_lock(&sock_lock);
+
+	if (sock) {
+		sock_release(sock);
+		sock = NULL;
+	}
+
+	mutex_unlock(&sock_lock);
+}
+
+__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args)
+{
+	int err;
+
+	if (args->addrlen > sizeof(args->addr))
+		return -EINVAL;
+
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kernel_connect(sock, (struct sockaddr *)&args->addr,
+			     args->addrlen, 0);
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args)
+{
+	int err;
+
+	if (args->addrlen > sizeof(args->addr))
+		return -EINVAL;
+
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen);
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc int bpf_kfunc_call_kernel_listen(void)
+{
+	int err;
+
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kernel_listen(sock, 128);
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args)
+{
+	struct msghdr msg = {
+		.msg_name	= &args->addr.addr,
+		.msg_namelen	= args->addr.addrlen,
+	};
+	struct kvec iov;
+	int err;
+
+	if (args->addr.addrlen > sizeof(args->addr.addr) ||
+	    args->msglen > sizeof(args->msg))
+		return -EINVAL;
+
+	iov.iov_base = args->msg;
+	iov.iov_len  = args->msglen;
+
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen);
+	args->addr.addrlen = msg.msg_namelen;
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args)
+{
+	struct msghdr msg = {
+		.msg_name	= &args->addr.addr,
+		.msg_namelen	= args->addr.addrlen,
+	};
+	struct kvec iov;
+	int err;
+
+	if (args->addr.addrlen > sizeof(args->addr.addr) ||
+	    args->msglen > sizeof(args->msg))
+		return -EINVAL;
+
+	iov.iov_base = args->msg;
+	iov.iov_len  = args->msglen;
+
+	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen);
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = sock_sendmsg(sock, &msg);
+	args->addr.addrlen = msg.msg_namelen;
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args)
+{
+	int err;
+
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kernel_getsockname(sock, (struct sockaddr *)&args->addr);
+	if (err < 0)
+		goto out;
+
+	args->addrlen = err;
+	err = 0;
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
+__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args)
+{
+	int err;
+
+	mutex_lock(&sock_lock);
+
+	if (!sock) {
+		pr_err("%s called without initializing sock", __func__);
+		err = -EPERM;
+		goto out;
+	}
+
+	err = kernel_getpeername(sock, (struct sockaddr *)&args->addr);
+	if (err < 0)
+		goto out;
+
+	args->addrlen = err;
+	err = 0;
+out:
+	mutex_unlock(&sock_lock);
+
+	return err;
+}
+
 BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
@@ -520,6 +770,16 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE)
 BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
 
 static int bpf_testmod_ops_init(struct btf *btf)
@@ -650,6 +910,8 @@ static int bpf_testmod_init(void)
 		return ret;
 	if (bpf_fentry_test1(0) < 0)
 		return -EINVAL;
+	sock = NULL;
+	mutex_init(&sock_lock);
 	return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
 }
 
@@ -663,6 +925,7 @@ static void bpf_testmod_exit(void)
 	while (refcount_read(&prog_test_struct.cnt) > 1)
 		msleep(20);
 
+	bpf_kfunc_close_sock();
 	sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
 }
 
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h
index 7c664dd61059..b0d586a6751f 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h
@@ -64,6 +64,22 @@ struct prog_test_fail3 {
 	char arr2[];
 };
 
+struct init_sock_args {
+	int af;
+	int type;
+};
+
+struct addr_args {
+	char addr[sizeof(struct __kernel_sockaddr_storage)];
+	int addrlen;
+};
+
+struct sendmsg_args {
+	struct addr_args addr;
+	char msg[10];
+	int msglen;
+};
+
 struct prog_test_ref_kfunc *
 bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym;
 void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
@@ -96,6 +112,7 @@ void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym;
 void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
 
 void bpf_kfunc_call_test_destructive(void) __ksym;
+void bpf_kfunc_call_test_sleepable(void) __ksym;
 
 void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p);
 struct prog_test_member *bpf_kfunc_call_memb_acquire(void);
@@ -106,4 +123,15 @@ void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p);
 void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len);
 
 void bpf_kfunc_common_test(void) __ksym;
+
+int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym;
+void bpf_kfunc_close_sock(void) __ksym;
+int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym;
+int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym;
+int bpf_kfunc_call_kernel_listen(void) __ksym;
+int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym;
+int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym;
+int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym;
+int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym;
+
 #endif /* _BPF_TESTMOD_KFUNC_H */
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index 19be9c63d5e8..23bb9a9e6a7d 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -429,7 +429,7 @@ int create_and_get_cgroup(const char *relative_path)
  * which is an invalid cgroup id.
  * If there is a failure, it prints the error to stderr.
  */
-unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir)
+static unsigned long long get_cgroup_id_from_path(const char *cgroup_workdir)
 {
 	int dirfd, err, flags, mount_id, fhsize;
 	union {
@@ -508,6 +508,9 @@ int cgroup_setup_and_join(const char *path) {
 /**
  * setup_classid_environment() - Setup the cgroupv1 net_cls environment
  *
+ * This function should only be called in a custom mount namespace, e.g.
+ * created by running setup_cgroup_environment.
+ *
  * After calling this function, cleanup_classid_environment should be called
  * once testing is complete.
  *
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 01f241ea2c67..eeabd798bc3a 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -13,7 +13,12 @@ CONFIG_BPF_SYSCALL=y
 CONFIG_CGROUP_BPF=y
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_SHA256=y
+CONFIG_CRYPTO_USER_API=y
 CONFIG_CRYPTO_USER_API_HASH=y
+CONFIG_CRYPTO_USER_API_SKCIPHER=y
+CONFIG_CRYPTO_SKCIPHER=y
+CONFIG_CRYPTO_ECB=y
+CONFIG_CRYPTO_AES=y
 CONFIG_DEBUG_INFO=y
 CONFIG_DEBUG_INFO_BTF=y
 CONFIG_DEBUG_INFO_DWARF4=y
@@ -88,3 +93,5 @@ CONFIG_VSOCKETS=y
 CONFIG_VXLAN=y
 CONFIG_XDP_SOCKETS=y
 CONFIG_XFRM_INTERFACE=y
+CONFIG_TCP_CONG_DCTCP=y
+CONFIG_TCP_CONG_BBR=y
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 6db27a9088e9..35250e6cde7f 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -52,6 +52,8 @@ struct ipv6_packet pkt_v6 = {
 	.tcp.doff = 5,
 };
 
+static const struct network_helper_opts default_opts;
+
 int settimeo(int fd, int timeout_ms)
 {
 	struct timeval timeout = { .tv_sec = 3 };
@@ -78,24 +80,22 @@ int settimeo(int fd, int timeout_ms)
 
 #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; })
 
-static int __start_server(int type, int protocol, const struct sockaddr *addr,
-			  socklen_t addrlen, int timeout_ms, bool reuseport)
+static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen,
+			  const struct network_helper_opts *opts)
 {
-	int on = 1;
 	int fd;
 
-	fd = socket(addr->sa_family, type, protocol);
+	fd = socket(addr->sa_family, type, opts->proto);
 	if (fd < 0) {
 		log_err("Failed to create server socket");
 		return -1;
 	}
 
-	if (settimeo(fd, timeout_ms))
+	if (settimeo(fd, opts->timeout_ms))
 		goto error_close;
 
-	if (reuseport &&
-	    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) {
-		log_err("Failed to set SO_REUSEPORT");
+	if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) {
+		log_err("Failed to call post_socket_cb");
 		goto error_close;
 	}
 
@@ -118,35 +118,35 @@ error_close:
 	return -1;
 }
 
-static int start_server_proto(int family, int type, int protocol,
-			      const char *addr_str, __u16 port, int timeout_ms)
+int start_server(int family, int type, const char *addr_str, __u16 port,
+		 int timeout_ms)
 {
+	struct network_helper_opts opts = {
+		.timeout_ms	= timeout_ms,
+	};
 	struct sockaddr_storage addr;
 	socklen_t addrlen;
 
 	if (make_sockaddr(family, addr_str, port, &addr, &addrlen))
 		return -1;
 
-	return __start_server(type, protocol, (struct sockaddr *)&addr,
-			      addrlen, timeout_ms, false);
+	return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts);
 }
 
-int start_server(int family, int type, const char *addr_str, __u16 port,
-		 int timeout_ms)
+static int reuseport_cb(int fd, const struct post_socket_opts *opts)
 {
-	return start_server_proto(family, type, 0, addr_str, port, timeout_ms);
-}
+	int on = 1;
 
-int start_mptcp_server(int family, const char *addr_str, __u16 port,
-		       int timeout_ms)
-{
-	return start_server_proto(family, SOCK_STREAM, IPPROTO_MPTCP, addr_str,
-				  port, timeout_ms);
+	return setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
 }
 
 int *start_reuseport_server(int family, int type, const char *addr_str,
 			    __u16 port, int timeout_ms, unsigned int nr_listens)
 {
+	struct network_helper_opts opts = {
+		.timeout_ms = timeout_ms,
+		.post_socket_cb = reuseport_cb,
+	};
 	struct sockaddr_storage addr;
 	unsigned int nr_fds = 0;
 	socklen_t addrlen;
@@ -162,8 +162,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str,
 	if (!fds)
 		return NULL;
 
-	fds[0] = __start_server(type, 0, (struct sockaddr *)&addr, addrlen,
-				timeout_ms, true);
+	fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts);
 	if (fds[0] == -1)
 		goto close_fds;
 	nr_fds = 1;
@@ -172,8 +171,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str,
 		goto close_fds;
 
 	for (; nr_fds < nr_listens; nr_fds++) {
-		fds[nr_fds] = __start_server(type, 0, (struct sockaddr *)&addr,
-					     addrlen, timeout_ms, true);
+		fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts);
 		if (fds[nr_fds] == -1)
 			goto close_fds;
 	}
@@ -185,6 +183,15 @@ close_fds:
 	return NULL;
 }
 
+int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len,
+		      const struct network_helper_opts *opts)
+{
+	if (!opts)
+		opts = &default_opts;
+
+	return __start_server(type, (struct sockaddr *)addr, len, opts);
+}
+
 void free_fds(int *fds, unsigned int nr_close_fds)
 {
 	if (fds) {
@@ -258,17 +265,24 @@ static int connect_fd_to_addr(int fd,
 	return 0;
 }
 
-int connect_to_addr(const struct sockaddr_storage *addr, socklen_t addrlen, int type)
+int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen,
+		    const struct network_helper_opts *opts)
 {
 	int fd;
 
-	fd = socket(addr->ss_family, type, 0);
+	if (!opts)
+		opts = &default_opts;
+
+	fd = socket(addr->ss_family, type, opts->proto);
 	if (fd < 0) {
 		log_err("Failed to create client socket");
 		return -1;
 	}
 
-	if (connect_fd_to_addr(fd, addr, addrlen, false))
+	if (settimeo(fd, opts->timeout_ms))
+		goto error_close;
+
+	if (connect_fd_to_addr(fd, addr, addrlen, opts->must_fail))
 		goto error_close;
 
 	return fd;
@@ -278,8 +292,6 @@ error_close:
 	return -1;
 }
 
-static const struct network_helper_opts default_opts;
-
 int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts)
 {
 	struct sockaddr_storage addr;
@@ -442,25 +454,35 @@ struct nstoken *open_netns(const char *name)
 	struct nstoken *token;
 
 	token = calloc(1, sizeof(struct nstoken));
-	if (!ASSERT_OK_PTR(token, "malloc token"))
+	if (!token) {
+		log_err("Failed to malloc token");
 		return NULL;
+	}
 
 	token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY);
-	if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net"))
+	if (token->orig_netns_fd == -1) {
+		log_err("Failed to open(/proc/self/ns/net)");
 		goto fail;
+	}
 
 	snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name);
 	nsfd = open(nspath, O_RDONLY | O_CLOEXEC);
-	if (!ASSERT_GE(nsfd, 0, "open netns fd"))
+	if (nsfd == -1) {
+		log_err("Failed to open(%s)", nspath);
 		goto fail;
+	}
 
 	err = setns(nsfd, CLONE_NEWNET);
 	close(nsfd);
-	if (!ASSERT_OK(err, "setns"))
+	if (err) {
+		log_err("Failed to setns(nsfd)");
 		goto fail;
+	}
 
 	return token;
 fail:
+	if (token->orig_netns_fd != -1)
+		close(token->orig_netns_fd);
 	free(token);
 	return NULL;
 }
@@ -470,7 +492,8 @@ void close_netns(struct nstoken *token)
 	if (!token)
 		return;
 
-	ASSERT_OK(setns(token->orig_netns_fd, CLONE_NEWNET), "setns");
+	if (setns(token->orig_netns_fd, CLONE_NEWNET))
+		log_err("Failed to setns(orig_netns_fd)");
 	close(token->orig_netns_fd);
 	free(token);
 }
@@ -497,3 +520,153 @@ int get_socket_local_port(int sock_fd)
 
 	return -1;
 }
+
+int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param)
+{
+	struct ifreq ifr = {0};
+	int sockfd, err;
+
+	sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sockfd < 0)
+		return -errno;
+
+	memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+
+	ring_param->cmd = ETHTOOL_GRINGPARAM;
+	ifr.ifr_data = (char *)ring_param;
+
+	if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) {
+		err = errno;
+		close(sockfd);
+		return -err;
+	}
+
+	close(sockfd);
+	return 0;
+}
+
+int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param)
+{
+	struct ifreq ifr = {0};
+	int sockfd, err;
+
+	sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sockfd < 0)
+		return -errno;
+
+	memcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+
+	ring_param->cmd = ETHTOOL_SRINGPARAM;
+	ifr.ifr_data = (char *)ring_param;
+
+	if (ioctl(sockfd, SIOCETHTOOL, &ifr) < 0) {
+		err = errno;
+		close(sockfd);
+		return -err;
+	}
+
+	close(sockfd);
+	return 0;
+}
+
+struct send_recv_arg {
+	int		fd;
+	uint32_t	bytes;
+	int		stop;
+};
+
+static void *send_recv_server(void *arg)
+{
+	struct send_recv_arg *a = (struct send_recv_arg *)arg;
+	ssize_t nr_sent = 0, bytes = 0;
+	char batch[1500];
+	int err = 0, fd;
+
+	fd = accept(a->fd, NULL, NULL);
+	while (fd == -1) {
+		if (errno == EINTR)
+			continue;
+		err = -errno;
+		goto done;
+	}
+
+	if (settimeo(fd, 0)) {
+		err = -errno;
+		goto done;
+	}
+
+	while (bytes < a->bytes && !READ_ONCE(a->stop)) {
+		nr_sent = send(fd, &batch,
+			       MIN(a->bytes - bytes, sizeof(batch)), 0);
+		if (nr_sent == -1 && errno == EINTR)
+			continue;
+		if (nr_sent == -1) {
+			err = -errno;
+			break;
+		}
+		bytes += nr_sent;
+	}
+
+	if (bytes != a->bytes) {
+		log_err("send %zd expected %u", bytes, a->bytes);
+		if (!err)
+			err = bytes > a->bytes ? -E2BIG : -EINTR;
+	}
+
+done:
+	if (fd >= 0)
+		close(fd);
+	if (err) {
+		WRITE_ONCE(a->stop, 1);
+		return ERR_PTR(err);
+	}
+	return NULL;
+}
+
+int send_recv_data(int lfd, int fd, uint32_t total_bytes)
+{
+	ssize_t nr_recv = 0, bytes = 0;
+	struct send_recv_arg arg = {
+		.fd	= lfd,
+		.bytes	= total_bytes,
+		.stop	= 0,
+	};
+	pthread_t srv_thread;
+	void *thread_ret;
+	char batch[1500];
+	int err = 0;
+
+	err = pthread_create(&srv_thread, NULL, send_recv_server, (void *)&arg);
+	if (err) {
+		log_err("Failed to pthread_create");
+		return err;
+	}
+
+	/* recv total_bytes */
+	while (bytes < total_bytes && !READ_ONCE(arg.stop)) {
+		nr_recv = recv(fd, &batch,
+			       MIN(total_bytes - bytes, sizeof(batch)), 0);
+		if (nr_recv == -1 && errno == EINTR)
+			continue;
+		if (nr_recv == -1) {
+			err = -errno;
+			break;
+		}
+		bytes += nr_recv;
+	}
+
+	if (bytes != total_bytes) {
+		log_err("recv %zd expected %u", bytes, total_bytes);
+		if (!err)
+			err = bytes > total_bytes ? -E2BIG : -EINTR;
+	}
+
+	WRITE_ONCE(arg.stop, 1);
+	pthread_join(srv_thread, &thread_ret);
+	if (IS_ERR(thread_ret)) {
+		log_err("Failed in thread_ret %ld", PTR_ERR(thread_ret));
+		err = err ? : PTR_ERR(thread_ret);
+	}
+
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 94b9be24e39b..883c7ea9d8d5 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -9,14 +9,20 @@ typedef __u16 __sum16;
 #include <linux/if_packet.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <linux/err.h>
 #include <netinet/tcp.h>
 #include <bpf/bpf_endian.h>
+#include <net/if.h>
 
 #define MAGIC_VAL 0x1234
 #define NUM_ITER 100000
 #define VIP_NUM 5
 #define MAGIC_BYTES 123
 
+struct post_socket_opts {};
+
 struct network_helper_opts {
 	const char *cc;
 	int timeout_ms;
@@ -24,6 +30,7 @@ struct network_helper_opts {
 	bool noconnect;
 	int type;
 	int proto;
+	int (*post_socket_cb)(int fd, const struct post_socket_opts *opts);
 };
 
 /* ipv4 test vector */
@@ -45,13 +52,14 @@ extern struct ipv6_packet pkt_v6;
 int settimeo(int fd, int timeout_ms);
 int start_server(int family, int type, const char *addr, __u16 port,
 		 int timeout_ms);
-int start_mptcp_server(int family, const char *addr, __u16 port,
-		       int timeout_ms);
 int *start_reuseport_server(int family, int type, const char *addr_str,
 			    __u16 port, int timeout_ms,
 			    unsigned int nr_listens);
+int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t len,
+		      const struct network_helper_opts *opts);
 void free_fds(int *fds, unsigned int nr_close_fds);
-int connect_to_addr(const struct sockaddr_storage *addr, socklen_t len, int type);
+int connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t len,
+		    const struct network_helper_opts *opts);
 int connect_to_fd(int server_fd, int timeout_ms);
 int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts);
 int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms);
@@ -61,6 +69,8 @@ int make_sockaddr(int family, const char *addr_str, __u16 port,
 		  struct sockaddr_storage *addr, socklen_t *len);
 char *ping_command(int family);
 int get_socket_local_port(int sock_fd);
+int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param);
+int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param);
 
 struct nstoken;
 /**
@@ -71,6 +81,7 @@ struct nstoken;
  */
 struct nstoken *open_netns(const char *name);
 void close_netns(struct nstoken *token);
+int send_recv_data(int lfd, int fd, uint32_t total_bytes);
 
 static __u16 csum_fold(__u32 csum)
 {
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c
new file mode 100644
index 000000000000..0807a48a58ee
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include "arena_atomics.skel.h"
+
+static void test_add(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.add);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->add64_value, 3, "add64_value");
+	ASSERT_EQ(skel->arena->add64_result, 1, "add64_result");
+
+	ASSERT_EQ(skel->arena->add32_value, 3, "add32_value");
+	ASSERT_EQ(skel->arena->add32_result, 1, "add32_result");
+
+	ASSERT_EQ(skel->arena->add_stack_value_copy, 3, "add_stack_value");
+	ASSERT_EQ(skel->arena->add_stack_result, 1, "add_stack_result");
+
+	ASSERT_EQ(skel->arena->add_noreturn_value, 3, "add_noreturn_value");
+}
+
+static void test_sub(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.sub);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->sub64_value, -1, "sub64_value");
+	ASSERT_EQ(skel->arena->sub64_result, 1, "sub64_result");
+
+	ASSERT_EQ(skel->arena->sub32_value, -1, "sub32_value");
+	ASSERT_EQ(skel->arena->sub32_result, 1, "sub32_result");
+
+	ASSERT_EQ(skel->arena->sub_stack_value_copy, -1, "sub_stack_value");
+	ASSERT_EQ(skel->arena->sub_stack_result, 1, "sub_stack_result");
+
+	ASSERT_EQ(skel->arena->sub_noreturn_value, -1, "sub_noreturn_value");
+}
+
+static void test_and(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.and);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->and64_value, 0x010ull << 32, "and64_value");
+	ASSERT_EQ(skel->arena->and32_value, 0x010, "and32_value");
+}
+
+static void test_or(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.or);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->or64_value, 0x111ull << 32, "or64_value");
+	ASSERT_EQ(skel->arena->or32_value, 0x111, "or32_value");
+}
+
+static void test_xor(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.xor);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->xor64_value, 0x101ull << 32, "xor64_value");
+	ASSERT_EQ(skel->arena->xor32_value, 0x101, "xor32_value");
+}
+
+static void test_cmpxchg(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.cmpxchg);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->cmpxchg64_value, 2, "cmpxchg64_value");
+	ASSERT_EQ(skel->arena->cmpxchg64_result_fail, 1, "cmpxchg_result_fail");
+	ASSERT_EQ(skel->arena->cmpxchg64_result_succeed, 1, "cmpxchg_result_succeed");
+
+	ASSERT_EQ(skel->arena->cmpxchg32_value, 2, "lcmpxchg32_value");
+	ASSERT_EQ(skel->arena->cmpxchg32_result_fail, 1, "cmpxchg_result_fail");
+	ASSERT_EQ(skel->arena->cmpxchg32_result_succeed, 1, "cmpxchg_result_succeed");
+}
+
+static void test_xchg(struct arena_atomics *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* No need to attach it, just run it directly */
+	prog_fd = bpf_program__fd(skel->progs.xchg);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return;
+
+	ASSERT_EQ(skel->arena->xchg64_value, 2, "xchg64_value");
+	ASSERT_EQ(skel->arena->xchg64_result, 1, "xchg64_result");
+
+	ASSERT_EQ(skel->arena->xchg32_value, 2, "xchg32_value");
+	ASSERT_EQ(skel->arena->xchg32_result, 1, "xchg32_result");
+}
+
+void test_arena_atomics(void)
+{
+	struct arena_atomics *skel;
+	int err;
+
+	skel = arena_atomics__open();
+	if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open"))
+		return;
+
+	if (skel->data->skip_tests) {
+		printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang",
+		       __func__);
+		test__skip();
+		goto cleanup;
+	}
+	err = arena_atomics__load(skel);
+	if (!ASSERT_OK(err, "arena atomics skeleton load"))
+		return;
+	skel->bss->pid = getpid();
+
+	if (test__start_subtest("add"))
+		test_add(skel);
+	if (test__start_subtest("sub"))
+		test_sub(skel);
+	if (test__start_subtest("and"))
+		test_and(skel);
+	if (test__start_subtest("or"))
+		test_or(skel);
+	if (test__start_subtest("xor"))
+		test_xor(skel);
+	if (test__start_subtest("cmpxchg"))
+		test_cmpxchg(skel);
+	if (test__start_subtest("xchg"))
+		test_xchg(skel);
+
+cleanup:
+	arena_atomics__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_htab.c b/tools/testing/selftests/bpf/prog_tests/arena_htab.c
index 0766702de846..d69fd2465f53 100644
--- a/tools/testing/selftests/bpf/prog_tests/arena_htab.c
+++ b/tools/testing/selftests/bpf/prog_tests/arena_htab.c
@@ -3,12 +3,14 @@
 #include <test_progs.h>
 #include <sys/mman.h>
 #include <network_helpers.h>
-
+#include <sys/user.h>
+#ifndef PAGE_SIZE /* on some archs it comes in sys/user.h */
+#include <unistd.h>
+#define PAGE_SIZE getpagesize()
+#endif
 #include "arena_htab_asm.skel.h"
 #include "arena_htab.skel.h"
 
-#define PAGE_SIZE 4096
-
 #include "bpf_arena_htab.h"
 
 static void test_arena_htab_common(struct htab *htab)
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c
index e61886debab1..d15867cddde0 100644
--- a/tools/testing/selftests/bpf/prog_tests/arena_list.c
+++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c
@@ -3,8 +3,11 @@
 #include <test_progs.h>
 #include <sys/mman.h>
 #include <network_helpers.h>
-
-#define PAGE_SIZE 4096
+#include <sys/user.h>
+#ifndef PAGE_SIZE /* on some archs it comes in sys/user.h */
+#include <unistd.h>
+#define PAGE_SIZE getpagesize()
+#endif
 
 #include "bpf_arena_list.h"
 #include "arena_list.skel.h"
diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c
index 053f4d6da77a..cc184e4420f6 100644
--- a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2021 Facebook */
 
 #include <sys/syscall.h>
+#include <limits.h>
 #include <test_progs.h>
 #include "bloom_filter_map.skel.h"
 
@@ -21,6 +22,11 @@ static void test_fail_cases(void)
 	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value size 0"))
 		close(fd);
 
+	/* Invalid value size: too big */
+	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, INT32_MAX, 100, NULL);
+	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value too large"))
+		close(fd);
+
 	/* Invalid max entries size */
 	fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 0, NULL);
 	if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid max entries size"))
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
index 1454cebc262b..4407ea428e77 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
@@ -573,6 +573,115 @@ cleanup:
 		close(lsm_fd);
 }
 
+static void tp_btf_subtest(struct test_bpf_cookie *skel)
+{
+	__u64 cookie;
+	int prog_fd, link_fd = -1;
+	struct bpf_link *link = NULL;
+	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
+	LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts);
+	LIBBPF_OPTS(bpf_trace_opts, trace_opts);
+
+	/* There are three different ways to attach tp_btf (BTF-aware raw
+	 * tracepoint) programs. Let's test all of them.
+	 */
+	prog_fd = bpf_program__fd(skel->progs.handle_tp_btf);
+
+	/* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */
+	skel->bss->tp_btf_res = 0;
+
+	raw_tp_opts.cookie = cookie = 0x11000000000000L;
+	link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts);
+	if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts"))
+		goto cleanup;
+
+	usleep(1); /* trigger */
+	close(link_fd); /* detach */
+	link_fd = -1;
+
+	ASSERT_EQ(skel->bss->tp_btf_res, cookie, "raw_tp_open_res");
+
+	/* low-level generic bpf_link_create() API */
+	skel->bss->tp_btf_res = 0;
+
+	link_opts.tracing.cookie = cookie = 0x22000000000000L;
+	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_RAW_TP, &link_opts);
+	if (!ASSERT_GE(link_fd, 0, "bpf_link_create"))
+		goto cleanup;
+
+	usleep(1); /* trigger */
+	close(link_fd); /* detach */
+	link_fd = -1;
+
+	ASSERT_EQ(skel->bss->tp_btf_res, cookie, "link_create_res");
+
+	/* high-level bpf_link-based bpf_program__attach_trace_opts() API */
+	skel->bss->tp_btf_res = 0;
+
+	trace_opts.cookie = cookie = 0x33000000000000L;
+	link = bpf_program__attach_trace_opts(skel->progs.handle_tp_btf, &trace_opts);
+	if (!ASSERT_OK_PTR(link, "attach_trace_opts"))
+		goto cleanup;
+
+	usleep(1); /* trigger */
+	bpf_link__destroy(link); /* detach */
+	link = NULL;
+
+	ASSERT_EQ(skel->bss->tp_btf_res, cookie, "attach_trace_opts_res");
+
+cleanup:
+	if (link_fd >= 0)
+		close(link_fd);
+	bpf_link__destroy(link);
+}
+
+static void raw_tp_subtest(struct test_bpf_cookie *skel)
+{
+	__u64 cookie;
+	int prog_fd, link_fd = -1;
+	struct bpf_link *link = NULL;
+	LIBBPF_OPTS(bpf_raw_tp_opts, raw_tp_opts);
+	LIBBPF_OPTS(bpf_raw_tracepoint_opts, opts);
+
+	/* There are two different ways to attach raw_tp programs */
+	prog_fd = bpf_program__fd(skel->progs.handle_raw_tp);
+
+	/* low-level BPF_RAW_TRACEPOINT_OPEN command wrapper */
+	skel->bss->raw_tp_res = 0;
+
+	raw_tp_opts.tp_name = "sys_enter";
+	raw_tp_opts.cookie = cookie = 0x55000000000000L;
+	link_fd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_tp_opts);
+	if (!ASSERT_GE(link_fd, 0, "bpf_raw_tracepoint_open_opts"))
+		goto cleanup;
+
+	usleep(1); /* trigger */
+	close(link_fd); /* detach */
+	link_fd = -1;
+
+	ASSERT_EQ(skel->bss->raw_tp_res, cookie, "raw_tp_open_res");
+
+	/* high-level bpf_link-based bpf_program__attach_raw_tracepoint_opts() API */
+	skel->bss->raw_tp_res = 0;
+
+	opts.cookie = cookie = 0x66000000000000L;
+	link = bpf_program__attach_raw_tracepoint_opts(skel->progs.handle_raw_tp,
+						       "sys_enter", &opts);
+	if (!ASSERT_OK_PTR(link, "attach_raw_tp_opts"))
+		goto cleanup;
+
+	usleep(1); /* trigger */
+	bpf_link__destroy(link); /* detach */
+	link = NULL;
+
+	ASSERT_EQ(skel->bss->raw_tp_res, cookie, "attach_raw_tp_opts_res");
+
+cleanup:
+	if (link_fd >= 0)
+		close(link_fd);
+	bpf_link__destroy(link);
+}
+
 void test_bpf_cookie(void)
 {
 	struct test_bpf_cookie *skel;
@@ -601,6 +710,9 @@ void test_bpf_cookie(void)
 		tracing_subtest(skel);
 	if (test__start_subtest("lsm"))
 		lsm_subtest(skel);
-
+	if (test__start_subtest("tp_btf"))
+		tp_btf_subtest(skel);
+	if (test__start_subtest("raw_tp"))
+		raw_tp_subtest(skel);
 	test_bpf_cookie__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index a88e6e07e4f5..0aca02532794 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -13,6 +13,8 @@
 #include "tcp_ca_write_sk_pacing.skel.h"
 #include "tcp_ca_incompl_cong_ops.skel.h"
 #include "tcp_ca_unsupp_cong_op.skel.h"
+#include "tcp_ca_kfunc.skel.h"
+#include "bpf_cc_cubic.skel.h"
 
 #ifndef ENOTSUPP
 #define ENOTSUPP 524
@@ -20,7 +22,6 @@
 
 static const unsigned int total_bytes = 10 * 1024 * 1024;
 static int expected_stg = 0xeB9F;
-static int stop;
 
 static int settcpca(int fd, const char *tcp_ca)
 {
@@ -33,63 +34,12 @@ static int settcpca(int fd, const char *tcp_ca)
 	return 0;
 }
 
-static void *server(void *arg)
-{
-	int lfd = (int)(long)arg, err = 0, fd;
-	ssize_t nr_sent = 0, bytes = 0;
-	char batch[1500];
-
-	fd = accept(lfd, NULL, NULL);
-	while (fd == -1) {
-		if (errno == EINTR)
-			continue;
-		err = -errno;
-		goto done;
-	}
-
-	if (settimeo(fd, 0)) {
-		err = -errno;
-		goto done;
-	}
-
-	while (bytes < total_bytes && !READ_ONCE(stop)) {
-		nr_sent = send(fd, &batch,
-			       MIN(total_bytes - bytes, sizeof(batch)), 0);
-		if (nr_sent == -1 && errno == EINTR)
-			continue;
-		if (nr_sent == -1) {
-			err = -errno;
-			break;
-		}
-		bytes += nr_sent;
-	}
-
-	ASSERT_EQ(bytes, total_bytes, "send");
-
-done:
-	if (fd >= 0)
-		close(fd);
-	if (err) {
-		WRITE_ONCE(stop, 1);
-		return ERR_PTR(err);
-	}
-	return NULL;
-}
-
 static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
 {
-	struct sockaddr_in6 sa6 = {};
-	ssize_t nr_recv = 0, bytes = 0;
 	int lfd = -1, fd = -1;
-	pthread_t srv_thread;
-	socklen_t addrlen = sizeof(sa6);
-	void *thread_ret;
-	char batch[1500];
 	int err;
 
-	WRITE_ONCE(stop, 0);
-
-	lfd = socket(AF_INET6, SOCK_STREAM, 0);
+	lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
 	if (!ASSERT_NEQ(lfd, -1, "socket"))
 		return;
 
@@ -99,23 +49,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
 		return;
 	}
 
-	if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
-	    settimeo(lfd, 0) || settimeo(fd, 0))
-		goto done;
-
-	/* bind, listen and start server thread to accept */
-	sa6.sin6_family = AF_INET6;
-	sa6.sin6_addr = in6addr_loopback;
-	err = bind(lfd, (struct sockaddr *)&sa6, addrlen);
-	if (!ASSERT_NEQ(err, -1, "bind"))
-		goto done;
-
-	err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen);
-	if (!ASSERT_NEQ(err, -1, "getsockname"))
-		goto done;
-
-	err = listen(lfd, 1);
-	if (!ASSERT_NEQ(err, -1, "listen"))
+	if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca))
 		goto done;
 
 	if (sk_stg_map) {
@@ -126,7 +60,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
 	}
 
 	/* connect to server */
-	err = connect(fd, (struct sockaddr *)&sa6, addrlen);
+	err = connect_fd_to_fd(fd, lfd, 0);
 	if (!ASSERT_NEQ(err, -1, "connect"))
 		goto done;
 
@@ -140,26 +74,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
 			goto done;
 	}
 
-	err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd);
-	if (!ASSERT_OK(err, "pthread_create"))
-		goto done;
-
-	/* recv total_bytes */
-	while (bytes < total_bytes && !READ_ONCE(stop)) {
-		nr_recv = recv(fd, &batch,
-			       MIN(total_bytes - bytes, sizeof(batch)), 0);
-		if (nr_recv == -1 && errno == EINTR)
-			continue;
-		if (nr_recv == -1)
-			break;
-		bytes += nr_recv;
-	}
-
-	ASSERT_EQ(bytes, total_bytes, "recv");
-
-	WRITE_ONCE(stop, 1);
-	pthread_join(srv_thread, &thread_ret);
-	ASSERT_OK(IS_ERR(thread_ret), "thread_ret");
+	ASSERT_OK(send_recv_data(lfd, fd, total_bytes), "send_recv_data");
 
 done:
 	close(lfd);
@@ -315,7 +230,7 @@ static void test_rel_setsockopt(void)
 	struct bpf_dctcp_release *rel_skel;
 	libbpf_print_fn_t old_print_fn;
 
-	err_str = "unknown func bpf_setsockopt";
+	err_str = "program of this type cannot use helper bpf_setsockopt";
 	found = false;
 
 	old_print_fn = libbpf_set_print(libbpf_debug_print);
@@ -529,6 +444,36 @@ static void test_link_replace(void)
 	tcp_ca_update__destroy(skel);
 }
 
+static void test_tcp_ca_kfunc(void)
+{
+	struct tcp_ca_kfunc *skel;
+
+	skel = tcp_ca_kfunc__open_and_load();
+	ASSERT_OK_PTR(skel, "tcp_ca_kfunc__open_and_load");
+	tcp_ca_kfunc__destroy(skel);
+}
+
+static void test_cc_cubic(void)
+{
+	struct bpf_cc_cubic *cc_cubic_skel;
+	struct bpf_link *link;
+
+	cc_cubic_skel = bpf_cc_cubic__open_and_load();
+	if (!ASSERT_OK_PTR(cc_cubic_skel, "bpf_cc_cubic__open_and_load"))
+		return;
+
+	link = bpf_map__attach_struct_ops(cc_cubic_skel->maps.cc_cubic);
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
+		bpf_cc_cubic__destroy(cc_cubic_skel);
+		return;
+	}
+
+	do_test("bpf_cc_cubic", NULL);
+
+	bpf_link__destroy(link);
+	bpf_cc_cubic__destroy(cc_cubic_skel);
+}
+
 void test_bpf_tcp_ca(void)
 {
 	if (test__start_subtest("dctcp"))
@@ -557,4 +502,8 @@ void test_bpf_tcp_ca(void)
 		test_multi_links();
 	if (test__start_subtest("link_replace"))
 		test_link_replace();
+	if (test__start_subtest("tcp_ca_kfunc"))
+		test_tcp_ca_kfunc();
+	if (test__start_subtest("cc_cubic"))
+		test_cc_cubic();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
index a8b53b8736f0..f66ceccd7029 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
@@ -25,7 +25,7 @@ static void test_lookup_update(void)
 	int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id;
 	int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd;
 	struct test_btf_map_in_map *skel;
-	int err, key = 0, val, i, fd;
+	int err, key = 0, val, i;
 
 	skel = test_btf_map_in_map__open_and_load();
 	if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
@@ -102,30 +102,6 @@ static void test_lookup_update(void)
 	CHECK(map1_id == 0, "map1_id", "failed to get ID 1\n");
 	CHECK(map2_id == 0, "map2_id", "failed to get ID 2\n");
 
-	test_btf_map_in_map__destroy(skel);
-	skel = NULL;
-
-	/* we need to either wait for or force synchronize_rcu(), before
-	 * checking for "still exists" condition, otherwise map could still be
-	 * resolvable by ID, causing false positives.
-	 *
-	 * Older kernels (5.8 and earlier) freed map only after two
-	 * synchronize_rcu()s, so trigger two, to be entirely sure.
-	 */
-	CHECK(kern_sync_rcu(), "sync_rcu", "failed\n");
-	CHECK(kern_sync_rcu(), "sync_rcu", "failed\n");
-
-	fd = bpf_map_get_fd_by_id(map1_id);
-	if (CHECK(fd >= 0, "map1_leak", "inner_map1 leaked!\n")) {
-		close(fd);
-		goto cleanup;
-	}
-	fd = bpf_map_get_fd_by_id(map2_id);
-	if (CHECK(fd >= 0, "map2_leak", "inner_map2 leaked!\n")) {
-		close(fd);
-		goto cleanup;
-	}
-
 cleanup:
 	test_btf_map_in_map__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c
index 74d6d7546f40..25332e596750 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c
@@ -87,9 +87,12 @@ void test_cgroup1_hierarchy(void)
 		goto destroy;
 
 	/* Setup cgroup1 hierarchy */
+	err = setup_cgroup_environment();
+	if (!ASSERT_OK(err, "setup_cgroup_environment"))
+		goto destroy;
 	err = setup_classid_environment();
 	if (!ASSERT_OK(err, "setup_classid_environment"))
-		goto destroy;
+		goto cleanup_cgroup;
 
 	err = join_classid();
 	if (!ASSERT_OK(err, "join_cgroup1"))
@@ -153,6 +156,8 @@ void test_cgroup1_hierarchy(void)
 
 cleanup:
 	cleanup_classid_environment();
+cleanup_cgroup:
+	cleanup_cgroup_environment();
 destroy:
 	test_cgroup1_hierarchy__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
index 2a55f717fc07..34b59f6baca1 100644
--- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
@@ -10,6 +10,7 @@
 #include <netinet/tcp.h>
 
 #include <test_progs.h>
+#include "network_helpers.h"
 
 #include "progs/test_cls_redirect.h"
 #include "test_cls_redirect.skel.h"
@@ -35,39 +36,6 @@ struct tuple {
 	struct addr_port dst;
 };
 
-static int start_server(const struct sockaddr *addr, socklen_t len, int type)
-{
-	int fd = socket(addr->sa_family, type, 0);
-	if (CHECK_FAIL(fd == -1))
-		return -1;
-	if (CHECK_FAIL(bind(fd, addr, len) == -1))
-		goto err;
-	if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1))
-		goto err;
-
-	return fd;
-
-err:
-	close(fd);
-	return -1;
-}
-
-static int connect_to_server(const struct sockaddr *addr, socklen_t len,
-			     int type)
-{
-	int fd = socket(addr->sa_family, type, 0);
-	if (CHECK_FAIL(fd == -1))
-		return -1;
-	if (CHECK_FAIL(connect(fd, addr, len)))
-		goto err;
-
-	return fd;
-
-err:
-	close(fd);
-	return -1;
-}
-
 static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
 {
 	const struct sockaddr_in6 *in6;
@@ -98,14 +66,14 @@ static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
 	socklen_t slen = sizeof(ss);
 	struct sockaddr *sa = (struct sockaddr *)&ss;
 
-	*server = start_server(addr, len, type);
+	*server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL);
 	if (*server < 0)
 		return false;
 
 	if (CHECK_FAIL(getsockname(*server, sa, &slen)))
 		goto close_server;
 
-	*conn = connect_to_server(sa, slen, type);
+	*conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL);
 	if (*conn < 0)
 		goto close_server;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c
new file mode 100644
index 000000000000..b1a3a49a822a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/crypto_sanity.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <net/if.h>
+#include <linux/in6.h>
+#include <linux/if_alg.h>
+
+#include "test_progs.h"
+#include "network_helpers.h"
+#include "crypto_sanity.skel.h"
+#include "crypto_basic.skel.h"
+
+#define NS_TEST "crypto_sanity_ns"
+#define IPV6_IFACE_ADDR "face::1"
+static const unsigned char crypto_key[] = "testtest12345678";
+static const char plain_text[] = "stringtoencrypt0";
+static int opfd = -1, tfmfd = -1;
+static const char algo[] = "ecb(aes)";
+static int init_afalg(void)
+{
+	struct sockaddr_alg sa = {
+		.salg_family = AF_ALG,
+		.salg_type = "skcipher",
+		.salg_name = "ecb(aes)"
+	};
+
+	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
+	if (tfmfd == -1)
+		return errno;
+	if (bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) == -1)
+		return errno;
+	if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, crypto_key, 16) == -1)
+		return errno;
+	opfd = accept(tfmfd, NULL, 0);
+	if (opfd == -1)
+		return errno;
+	return 0;
+}
+
+static void deinit_afalg(void)
+{
+	if (tfmfd != -1)
+		close(tfmfd);
+	if (opfd != -1)
+		close(opfd);
+}
+
+static void do_crypt_afalg(const void *src, void *dst, int size, bool encrypt)
+{
+	struct msghdr msg = {};
+	struct cmsghdr *cmsg;
+	char cbuf[CMSG_SPACE(4)] = {0};
+	struct iovec iov;
+
+	msg.msg_control = cbuf;
+	msg.msg_controllen = sizeof(cbuf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_ALG;
+	cmsg->cmsg_type = ALG_SET_OP;
+	cmsg->cmsg_len = CMSG_LEN(4);
+	*(__u32 *)CMSG_DATA(cmsg) = encrypt ? ALG_OP_ENCRYPT : ALG_OP_DECRYPT;
+
+	iov.iov_base = (char *)src;
+	iov.iov_len = size;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	sendmsg(opfd, &msg, 0);
+	read(opfd, dst, size);
+}
+
+void test_crypto_basic(void)
+{
+	RUN_TESTS(crypto_basic);
+}
+
+void test_crypto_sanity(void)
+{
+	LIBBPF_OPTS(bpf_tc_hook, qdisc_hook, .attach_point = BPF_TC_EGRESS);
+	LIBBPF_OPTS(bpf_tc_opts, tc_attach_enc);
+	LIBBPF_OPTS(bpf_tc_opts, tc_attach_dec);
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct nstoken *nstoken = NULL;
+	struct crypto_sanity *skel;
+	char afalg_plain[16] = {0};
+	char afalg_dst[16] = {0};
+	struct sockaddr_in6 addr;
+	int sockfd, err, pfd;
+	socklen_t addrlen;
+	u16 udp_test_port;
+
+	skel = crypto_sanity__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel open"))
+		return;
+
+	SYS(fail, "ip netns add %s", NS_TEST);
+	SYS(fail, "ip -net %s -6 addr add %s/128 dev lo nodad", NS_TEST, IPV6_IFACE_ADDR);
+	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
+
+	nstoken = open_netns(NS_TEST);
+	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+		goto fail;
+
+	err = init_afalg();
+	if (!ASSERT_OK(err, "AF_ALG init fail"))
+		goto fail;
+
+	qdisc_hook.ifindex = if_nametoindex("lo");
+	if (!ASSERT_GT(qdisc_hook.ifindex, 0, "if_nametoindex lo"))
+		goto fail;
+
+	skel->bss->key_len = 16;
+	skel->bss->authsize = 0;
+	udp_test_port = skel->data->udp_test_port;
+	memcpy(skel->bss->key, crypto_key, sizeof(crypto_key));
+	snprintf(skel->bss->algo, 128, "%s", algo);
+	pfd = bpf_program__fd(skel->progs.skb_crypto_setup);
+	if (!ASSERT_GT(pfd, 0, "skb_crypto_setup fd"))
+		goto fail;
+
+	err = bpf_prog_test_run_opts(pfd, &opts);
+	if (!ASSERT_OK(err, "skb_crypto_setup") ||
+	    !ASSERT_OK(opts.retval, "skb_crypto_setup retval"))
+		goto fail;
+
+	if (!ASSERT_OK(skel->bss->status, "skb_crypto_setup status"))
+		goto fail;
+
+	err = bpf_tc_hook_create(&qdisc_hook);
+	if (!ASSERT_OK(err, "create qdisc hook"))
+		goto fail;
+
+	addrlen = sizeof(addr);
+	err = make_sockaddr(AF_INET6, IPV6_IFACE_ADDR, udp_test_port,
+			    (void *)&addr, &addrlen);
+	if (!ASSERT_OK(err, "make_sockaddr"))
+		goto fail;
+
+	tc_attach_enc.prog_fd = bpf_program__fd(skel->progs.encrypt_sanity);
+	err = bpf_tc_attach(&qdisc_hook, &tc_attach_enc);
+	if (!ASSERT_OK(err, "attach encrypt filter"))
+		goto fail;
+
+	sockfd = socket(AF_INET6, SOCK_DGRAM, 0);
+	if (!ASSERT_NEQ(sockfd, -1, "encrypt socket"))
+		goto fail;
+	err = sendto(sockfd, plain_text, sizeof(plain_text), 0, (void *)&addr, addrlen);
+	close(sockfd);
+	if (!ASSERT_EQ(err, sizeof(plain_text), "encrypt send"))
+		goto fail;
+
+	do_crypt_afalg(plain_text, afalg_dst, sizeof(afalg_dst), true);
+
+	if (!ASSERT_OK(skel->bss->status, "encrypt status"))
+		goto fail;
+	if (!ASSERT_STRNEQ(skel->bss->dst, afalg_dst, sizeof(afalg_dst), "encrypt AF_ALG"))
+		goto fail;
+
+	tc_attach_enc.flags = tc_attach_enc.prog_fd = tc_attach_enc.prog_id = 0;
+	err = bpf_tc_detach(&qdisc_hook, &tc_attach_enc);
+	if (!ASSERT_OK(err, "bpf_tc_detach encrypt"))
+		goto fail;
+
+	tc_attach_dec.prog_fd = bpf_program__fd(skel->progs.decrypt_sanity);
+	err = bpf_tc_attach(&qdisc_hook, &tc_attach_dec);
+	if (!ASSERT_OK(err, "attach decrypt filter"))
+		goto fail;
+
+	sockfd = socket(AF_INET6, SOCK_DGRAM, 0);
+	if (!ASSERT_NEQ(sockfd, -1, "decrypt socket"))
+		goto fail;
+	err = sendto(sockfd, afalg_dst, sizeof(afalg_dst), 0, (void *)&addr, addrlen);
+	close(sockfd);
+	if (!ASSERT_EQ(err, sizeof(afalg_dst), "decrypt send"))
+		goto fail;
+
+	do_crypt_afalg(afalg_dst, afalg_plain, sizeof(afalg_plain), false);
+
+	if (!ASSERT_OK(skel->bss->status, "decrypt status"))
+		goto fail;
+	if (!ASSERT_STRNEQ(skel->bss->dst, afalg_plain, sizeof(afalg_plain), "decrypt AF_ALG"))
+		goto fail;
+
+	tc_attach_dec.flags = tc_attach_dec.prog_fd = tc_attach_dec.prog_id = 0;
+	err = bpf_tc_detach(&qdisc_hook, &tc_attach_dec);
+	ASSERT_OK(err, "bpf_tc_detach decrypt");
+
+fail:
+	close_netns(nstoken);
+	deinit_afalg();
+	SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null");
+	crypto_sanity__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
index f43fcb13d2c4..d3d94596ab79 100644
--- a/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
+++ b/tools/testing/selftests/bpf/prog_tests/dummy_st_ops.c
@@ -98,7 +98,8 @@ done:
 
 static void test_dummy_multiple_args(void)
 {
-	__u64 args[5] = {0, -100, 0x8a5f, 'c', 0x1234567887654321ULL};
+	struct bpf_dummy_ops_state st = { 7 };
+	__u64 args[5] = {(__u64)&st, -100, 0x8a5f, 'c', 0x1234567887654321ULL};
 	LIBBPF_OPTS(bpf_test_run_opts, attr,
 		.ctx_in = args,
 		.ctx_size_in = sizeof(args),
@@ -115,6 +116,7 @@ static void test_dummy_multiple_args(void)
 	fd = bpf_program__fd(skel->progs.test_2);
 	err = bpf_prog_test_run_opts(fd, &attr);
 	ASSERT_OK(err, "test_run");
+	args[0] = 7;
 	for (i = 0; i < ARRAY_SIZE(args); i++) {
 		snprintf(name, sizeof(name), "arg %zu", i);
 		ASSERT_EQ(skel->bss->test_2_args[i], args[i], name);
@@ -125,7 +127,8 @@ static void test_dummy_multiple_args(void)
 
 static void test_dummy_sleepable(void)
 {
-	__u64 args[1] = {0};
+	struct bpf_dummy_ops_state st;
+	__u64 args[1] = {(__u64)&st};
 	LIBBPF_OPTS(bpf_test_run_opts, attr,
 		.ctx_in = args,
 		.ctx_size_in = sizeof(args),
@@ -144,6 +147,31 @@ static void test_dummy_sleepable(void)
 	dummy_st_ops_success__destroy(skel);
 }
 
+/* dummy_st_ops.test_sleepable() parameter is not marked as nullable,
+ * thus bpf_prog_test_run_opts() below should be rejected as it tries
+ * to pass NULL for this parameter.
+ */
+static void test_dummy_sleepable_reject_null(void)
+{
+	__u64 args[1] = {0};
+	LIBBPF_OPTS(bpf_test_run_opts, attr,
+		.ctx_in = args,
+		.ctx_size_in = sizeof(args),
+	);
+	struct dummy_st_ops_success *skel;
+	int fd, err;
+
+	skel = dummy_st_ops_success__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "dummy_st_ops_load"))
+		return;
+
+	fd = bpf_program__fd(skel->progs.test_sleepable);
+	err = bpf_prog_test_run_opts(fd, &attr);
+	ASSERT_EQ(err, -EINVAL, "test_run");
+
+	dummy_st_ops_success__destroy(skel);
+}
+
 void test_dummy_st_ops(void)
 {
 	if (test__start_subtest("dummy_st_ops_attach"))
@@ -156,6 +184,8 @@ void test_dummy_st_ops(void)
 		test_dummy_multiple_args();
 	if (test__start_subtest("dummy_sleepable"))
 		test_dummy_sleepable();
+	if (test__start_subtest("dummy_sleepable_reject_null"))
+		test_dummy_sleepable_reject_null();
 
 	RUN_TESTS(dummy_st_ops_fail);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/empty_skb.c b/tools/testing/selftests/bpf/prog_tests/empty_skb.c
index 261228eb68e8..438583e1f2d1 100644
--- a/tools/testing/selftests/bpf/prog_tests/empty_skb.c
+++ b/tools/testing/selftests/bpf/prog_tests/empty_skb.c
@@ -94,6 +94,8 @@ void test_empty_skb(void)
 
 	SYS(out, "ip netns add empty_skb");
 	tok = open_netns("empty_skb");
+	if (!ASSERT_OK_PTR(tok, "setns"))
+		goto out;
 	SYS(out, "ip link add veth0 type veth peer veth1");
 	SYS(out, "ip link set dev veth0 up");
 	SYS(out, "ip link set dev veth1 up");
diff --git a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c
index 3379df2d4cf2..bd7658958004 100644
--- a/tools/testing/selftests/bpf/prog_tests/fib_lookup.c
+++ b/tools/testing/selftests/bpf/prog_tests/fib_lookup.c
@@ -26,6 +26,17 @@
 #define IPV6_TBID_ADDR		"fd00::FFFF"
 #define IPV6_TBID_NET		"fd00::"
 #define IPV6_TBID_DST		"fd00::2"
+#define MARK_NO_POLICY		33
+#define MARK			42
+#define MARK_TABLE		"200"
+#define IPV4_REMOTE_DST		"1.2.3.4"
+#define IPV4_LOCAL		"10.4.0.3"
+#define IPV4_GW1		"10.4.0.1"
+#define IPV4_GW2		"10.4.0.2"
+#define IPV6_REMOTE_DST		"be:ef::b0:10"
+#define IPV6_LOCAL		"fd01::3"
+#define IPV6_GW1		"fd01::1"
+#define IPV6_GW2		"fd01::2"
 #define DMAC			"11:11:11:11:11:11"
 #define DMAC_INIT { 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, }
 #define DMAC2			"01:01:01:01:01:01"
@@ -36,9 +47,11 @@ struct fib_lookup_test {
 	const char *daddr;
 	int expected_ret;
 	const char *expected_src;
+	const char *expected_dst;
 	int lookup_flags;
 	__u32 tbid;
 	__u8 dmac[6];
+	__u32 mark;
 };
 
 static const struct fib_lookup_test tests[] = {
@@ -90,10 +103,47 @@ static const struct fib_lookup_test tests[] = {
 	  .daddr = IPV6_ADDR_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
 	  .expected_src = IPV6_IFACE_ADDR_SEC,
 	  .lookup_flags = BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH, },
+	/* policy routing */
+	{ .desc = "IPv4 policy routing, default",
+	  .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV4_GW1,
+	  .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, },
+	{ .desc = "IPv4 policy routing, mark doesn't point to a policy",
+	  .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV4_GW1,
+	  .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH,
+	  .mark = MARK_NO_POLICY, },
+	{ .desc = "IPv4 policy routing, mark points to a policy",
+	  .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV4_GW2,
+	  .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH,
+	  .mark = MARK, },
+	{ .desc = "IPv4 policy routing, mark points to a policy, but no flag",
+	  .daddr = IPV4_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV4_GW1,
+	  .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH,
+	  .mark = MARK, },
+	{ .desc = "IPv6 policy routing, default",
+	  .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV6_GW1,
+	  .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH, },
+	{ .desc = "IPv6 policy routing, mark doesn't point to a policy",
+	  .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV6_GW1,
+	  .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH,
+	  .mark = MARK_NO_POLICY, },
+	{ .desc = "IPv6 policy routing, mark points to a policy",
+	  .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV6_GW2,
+	  .lookup_flags = BPF_FIB_LOOKUP_MARK | BPF_FIB_LOOKUP_SKIP_NEIGH,
+	  .mark = MARK, },
+	{ .desc = "IPv6 policy routing, mark points to a policy, but no flag",
+	  .daddr = IPV6_REMOTE_DST, .expected_ret = BPF_FIB_LKUP_RET_SUCCESS,
+	  .expected_dst = IPV6_GW1,
+	  .lookup_flags = BPF_FIB_LOOKUP_SKIP_NEIGH,
+	  .mark = MARK, },
 };
 
-static int ifindex;
-
 static int setup_netns(void)
 {
 	int err;
@@ -144,12 +194,24 @@ static int setup_netns(void)
 	if (!ASSERT_OK(err, "write_sysctl(net.ipv6.conf.veth1.forwarding)"))
 		goto fail;
 
+	/* Setup for policy routing tests */
+	SYS(fail, "ip addr add %s/24 dev veth1", IPV4_LOCAL);
+	SYS(fail, "ip addr add %s/64 dev veth1 nodad", IPV6_LOCAL);
+	SYS(fail, "ip route add %s/32 via %s", IPV4_REMOTE_DST, IPV4_GW1);
+	SYS(fail, "ip route add %s/32 via %s table %s", IPV4_REMOTE_DST, IPV4_GW2, MARK_TABLE);
+	SYS(fail, "ip -6 route add %s/128 via %s", IPV6_REMOTE_DST, IPV6_GW1);
+	SYS(fail, "ip -6 route add %s/128 via %s table %s", IPV6_REMOTE_DST, IPV6_GW2, MARK_TABLE);
+	SYS(fail, "ip rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE);
+	SYS(fail, "ip -6 rule add prio 2 fwmark %d lookup %s", MARK, MARK_TABLE);
+
 	return 0;
 fail:
 	return -1;
 }
 
-static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_lookup_test *test)
+static int set_lookup_params(struct bpf_fib_lookup *params,
+			     const struct fib_lookup_test *test,
+			     int ifindex)
 {
 	int ret;
 
@@ -158,6 +220,7 @@ static int set_lookup_params(struct bpf_fib_lookup *params, const struct fib_loo
 	params->l4_protocol = IPPROTO_TCP;
 	params->ifindex = ifindex;
 	params->tbid = test->tbid;
+	params->mark = test->mark;
 
 	if (inet_pton(AF_INET6, test->daddr, params->ipv6_dst) == 1) {
 		params->family = AF_INET6;
@@ -190,40 +253,45 @@ static void mac_str(char *b, const __u8 *mac)
 		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
 }
 
-static void assert_src_ip(struct bpf_fib_lookup *fib_params, const char *expected_src)
+static void assert_ip_address(int family, void *addr, const char *expected_str)
 {
+	char str[INET6_ADDRSTRLEN];
+	u8 expected_addr[16];
+	int addr_len = 0;
 	int ret;
-	__u32 src6[4];
-	__be32 src4;
 
-	switch (fib_params->family) {
+	switch (family) {
 	case AF_INET6:
-		ret = inet_pton(AF_INET6, expected_src, src6);
-		ASSERT_EQ(ret, 1, "inet_pton(expected_src)");
-
-		ret = memcmp(src6, fib_params->ipv6_src, sizeof(fib_params->ipv6_src));
-		if (!ASSERT_EQ(ret, 0, "fib_lookup ipv6 src")) {
-			char str_src6[64];
-
-			inet_ntop(AF_INET6, fib_params->ipv6_src, str_src6,
-				  sizeof(str_src6));
-			printf("ipv6 expected %s actual %s ", expected_src,
-			       str_src6);
-		}
-
+		ret = inet_pton(AF_INET6, expected_str, expected_addr);
+		ASSERT_EQ(ret, 1, "inet_pton(AF_INET6, expected_str)");
+		addr_len = 16;
 		break;
 	case AF_INET:
-		ret = inet_pton(AF_INET, expected_src, &src4);
-		ASSERT_EQ(ret, 1, "inet_pton(expected_src)");
-
-		ASSERT_EQ(fib_params->ipv4_src, src4, "fib_lookup ipv4 src");
-
+		ret = inet_pton(AF_INET, expected_str, expected_addr);
+		ASSERT_EQ(ret, 1, "inet_pton(AF_INET, expected_str)");
+		addr_len = 4;
 		break;
 	default:
-		PRINT_FAIL("invalid addr family: %d", fib_params->family);
+		PRINT_FAIL("invalid address family: %d", family);
+		break;
+	}
+
+	if (memcmp(addr, expected_addr, addr_len)) {
+		inet_ntop(family, addr, str, sizeof(str));
+		PRINT_FAIL("expected %s actual %s ", expected_str, str);
 	}
 }
 
+static void assert_src_ip(struct bpf_fib_lookup *params, const char *expected)
+{
+	assert_ip_address(params->family, params->ipv6_src, expected);
+}
+
+static void assert_dst_ip(struct bpf_fib_lookup *params, const char *expected)
+{
+	assert_ip_address(params->family, params->ipv6_dst, expected);
+}
+
 void test_fib_lookup(void)
 {
 	struct bpf_fib_lookup *fib_params;
@@ -256,15 +324,18 @@ void test_fib_lookup(void)
 	if (setup_netns())
 		goto fail;
 
-	ifindex = if_nametoindex("veth1");
-	skb.ifindex = ifindex;
+	skb.ifindex = if_nametoindex("veth1");
+	if (!ASSERT_NEQ(skb.ifindex, 0, "if_nametoindex(veth1)"))
+		goto fail;
+
 	fib_params = &skel->bss->fib_params;
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		printf("Testing %s ", tests[i].desc);
 
-		if (set_lookup_params(fib_params, &tests[i]))
+		if (set_lookup_params(fib_params, &tests[i], skb.ifindex))
 			continue;
+
 		skel->bss->fib_lookup_ret = -1;
 		skel->bss->lookup_flags = tests[i].lookup_flags;
 
@@ -278,6 +349,9 @@ void test_fib_lookup(void)
 		if (tests[i].expected_src)
 			assert_src_ip(fib_params, tests[i].expected_src);
 
+		if (tests[i].expected_dst)
+			assert_dst_ip(fib_params, tests[i].expected_dst);
+
 		ret = memcmp(tests[i].dmac, fib_params->dmac, sizeof(tests[i].dmac));
 		if (!ASSERT_EQ(ret, 0, "dmac not match")) {
 			char expected[18], actual[18];
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index c4773173a4e4..9e5f38739104 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -2,7 +2,6 @@
 #include <test_progs.h>
 #include <network_helpers.h>
 #include <error.h>
-#include <linux/if.h>
 #include <linux/if_tun.h>
 #include <sys/uio.h>
 
diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c
index 8963f8a549f2..09f6487f58b9 100644
--- a/tools/testing/selftests/bpf/prog_tests/for_each.c
+++ b/tools/testing/selftests/bpf/prog_tests/for_each.c
@@ -5,6 +5,7 @@
 #include "for_each_hash_map_elem.skel.h"
 #include "for_each_array_map_elem.skel.h"
 #include "for_each_map_elem_write_key.skel.h"
+#include "for_each_multi_maps.skel.h"
 
 static unsigned int duration;
 
@@ -143,6 +144,65 @@ static void test_write_map_key(void)
 		for_each_map_elem_write_key__destroy(skel);
 }
 
+static void test_multi_maps(void)
+{
+	struct for_each_multi_maps *skel;
+	__u64 val, array_total, hash_total;
+	__u32 key, max_entries;
+	int i, err;
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in = &pkt_v4,
+		.data_size_in = sizeof(pkt_v4),
+		.repeat = 1,
+	);
+
+	skel = for_each_multi_maps__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "for_each_multi_maps__open_and_load"))
+		return;
+
+	array_total = 0;
+	max_entries = bpf_map__max_entries(skel->maps.arraymap);
+	for (i = 0; i < max_entries; i++) {
+		key = i;
+		val = i + 1;
+		array_total += val;
+		err = bpf_map__update_elem(skel->maps.arraymap, &key, sizeof(key),
+					   &val, sizeof(val), BPF_ANY);
+		if (!ASSERT_OK(err, "array_map_update"))
+			goto out;
+	}
+
+	hash_total = 0;
+	max_entries = bpf_map__max_entries(skel->maps.hashmap);
+	for (i = 0; i < max_entries; i++) {
+		key = i + 100;
+		val = i + 1;
+		hash_total += val;
+		err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key),
+					   &val, sizeof(val), BPF_ANY);
+		if (!ASSERT_OK(err, "hash_map_update"))
+			goto out;
+	}
+
+	skel->bss->data_output = 0;
+	skel->bss->use_array = 1;
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+	ASSERT_OK(topts.retval, "retval");
+	ASSERT_EQ(skel->bss->data_output, array_total, "array output");
+
+	skel->bss->data_output = 0;
+	skel->bss->use_array = 0;
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+	ASSERT_OK(topts.retval, "retval");
+	ASSERT_EQ(skel->bss->data_output, hash_total, "hash output");
+
+out:
+	for_each_multi_maps__destroy(skel);
+}
+
 void test_for_each(void)
 {
 	if (test__start_subtest("hash_map"))
@@ -151,4 +211,6 @@ void test_for_each(void)
 		test_array_map();
 	if (test__start_subtest("write_map_key"))
 		test_write_map_key();
+	if (test__start_subtest("multi_maps"))
+		test_multi_maps();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c
index 8dd2af9081f4..284764e7179f 100644
--- a/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c
+++ b/tools/testing/selftests/bpf/prog_tests/ip_check_defrag.c
@@ -88,6 +88,8 @@ static int attach(struct ip_check_defrag *skel, bool ipv6)
 	int err = -1;
 
 	nstoken = open_netns(NS1);
+	if (!ASSERT_OK_PTR(nstoken, "setns"))
+		goto out;
 
 	skel->links.defrag = bpf_program__attach_netfilter(skel->progs.defrag, &opts);
 	if (!ASSERT_OK_PTR(skel->links.defrag, "program attach"))
diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index 05000810e28e..960c9323d1e0 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -4,6 +4,8 @@
 #include "trace_helpers.h"
 #include "kprobe_multi_empty.skel.h"
 #include "kprobe_multi_override.skel.h"
+#include "kprobe_multi_session.skel.h"
+#include "kprobe_multi_session_cookie.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "bpf/hashmap.h"
 
@@ -326,6 +328,74 @@ cleanup:
 	kprobe_multi__destroy(skel);
 }
 
+static void test_session_skel_api(void)
+{
+	struct kprobe_multi_session *skel = NULL;
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_link *link = NULL;
+	int i, err, prog_fd;
+
+	skel = kprobe_multi_session__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kprobe_multi_session__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = kprobe_multi_session__attach(skel);
+	if (!ASSERT_OK(err, " kprobe_multi_session__attach"))
+		goto cleanup;
+
+	prog_fd = bpf_program__fd(skel->progs.trigger);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(topts.retval, 0, "test_run");
+
+	/* bpf_fentry_test1-4 trigger return probe, result is 2 */
+	for (i = 0; i < 4; i++)
+		ASSERT_EQ(skel->bss->kprobe_session_result[i], 2, "kprobe_session_result");
+
+	/* bpf_fentry_test5-8 trigger only entry probe, result is 1 */
+	for (i = 4; i < 8; i++)
+		ASSERT_EQ(skel->bss->kprobe_session_result[i], 1, "kprobe_session_result");
+
+cleanup:
+	bpf_link__destroy(link);
+	kprobe_multi_session__destroy(skel);
+}
+
+static void test_session_cookie_skel_api(void)
+{
+	struct kprobe_multi_session_cookie *skel = NULL;
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_link *link = NULL;
+	int err, prog_fd;
+
+	skel = kprobe_multi_session_cookie__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = kprobe_multi_session_cookie__attach(skel);
+	if (!ASSERT_OK(err, " kprobe_multi_wrapper__attach"))
+		goto cleanup;
+
+	prog_fd = bpf_program__fd(skel->progs.trigger);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(topts.retval, 0, "test_run");
+
+	ASSERT_EQ(skel->bss->test_kprobe_1_result, 1, "test_kprobe_1_result");
+	ASSERT_EQ(skel->bss->test_kprobe_2_result, 2, "test_kprobe_2_result");
+	ASSERT_EQ(skel->bss->test_kprobe_3_result, 3, "test_kprobe_3_result");
+
+cleanup:
+	bpf_link__destroy(link);
+	kprobe_multi_session_cookie__destroy(skel);
+}
+
 static size_t symbol_hash(long key, void *ctx __maybe_unused)
 {
 	return str_hash((const char *) key);
@@ -336,15 +406,80 @@ static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
 	return strcmp((const char *) key1, (const char *) key2) == 0;
 }
 
+static bool is_invalid_entry(char *buf, bool kernel)
+{
+	if (kernel && strchr(buf, '['))
+		return true;
+	if (!kernel && !strchr(buf, '['))
+		return true;
+	return false;
+}
+
+static bool skip_entry(char *name)
+{
+	/*
+	 * We attach to almost all kernel functions and some of them
+	 * will cause 'suspicious RCU usage' when fprobe is attached
+	 * to them. Filter out the current culprits - arch_cpu_idle
+	 * default_idle and rcu_* functions.
+	 */
+	if (!strcmp(name, "arch_cpu_idle"))
+		return true;
+	if (!strcmp(name, "default_idle"))
+		return true;
+	if (!strncmp(name, "rcu_", 4))
+		return true;
+	if (!strcmp(name, "bpf_dispatcher_xdp_func"))
+		return true;
+	if (!strncmp(name, "__ftrace_invalid_address__",
+		     sizeof("__ftrace_invalid_address__") - 1))
+		return true;
+	return false;
+}
+
+/* Do comparision by ignoring '.llvm.<hash>' suffixes. */
+static int compare_name(const char *name1, const char *name2)
+{
+	const char *res1, *res2;
+	int len1, len2;
+
+	res1 = strstr(name1, ".llvm.");
+	res2 = strstr(name2, ".llvm.");
+	len1 = res1 ? res1 - name1 : strlen(name1);
+	len2 = res2 ? res2 - name2 : strlen(name2);
+
+	if (len1 == len2)
+		return strncmp(name1, name2, len1);
+	if (len1 < len2)
+		return strncmp(name1, name2, len1) <= 0 ? -1 : 1;
+	return strncmp(name1, name2, len2) >= 0 ? 1 : -1;
+}
+
+static int load_kallsyms_compare(const void *p1, const void *p2)
+{
+	return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
+}
+
+static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
+{
+	return compare_name(p1, p2->name);
+}
+
 static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 {
-	size_t cap = 0, cnt = 0, i;
-	char *name = NULL, **syms = NULL;
+	size_t cap = 0, cnt = 0;
+	char *name = NULL, *ksym_name, **syms = NULL;
 	struct hashmap *map;
+	struct ksyms *ksyms;
+	struct ksym *ks;
 	char buf[256];
 	FILE *f;
 	int err = 0;
 
+	ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
+	if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local"))
+		return -EINVAL;
+
 	/*
 	 * The available_filter_functions contains many duplicates,
 	 * but other than that all symbols are usable in kprobe multi
@@ -368,33 +503,23 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 	}
 
 	while (fgets(buf, sizeof(buf), f)) {
-		if (kernel && strchr(buf, '['))
-			continue;
-		if (!kernel && !strchr(buf, '['))
+		if (is_invalid_entry(buf, kernel))
 			continue;
 
 		free(name);
 		if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
 			continue;
-		/*
-		 * We attach to almost all kernel functions and some of them
-		 * will cause 'suspicious RCU usage' when fprobe is attached
-		 * to them. Filter out the current culprits - arch_cpu_idle
-		 * default_idle and rcu_* functions.
-		 */
-		if (!strcmp(name, "arch_cpu_idle"))
-			continue;
-		if (!strcmp(name, "default_idle"))
-			continue;
-		if (!strncmp(name, "rcu_", 4))
-			continue;
-		if (!strcmp(name, "bpf_dispatcher_xdp_func"))
-			continue;
-		if (!strncmp(name, "__ftrace_invalid_address__",
-			     sizeof("__ftrace_invalid_address__") - 1))
+		if (skip_entry(name))
 			continue;
 
-		err = hashmap__add(map, name, 0);
+		ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
+		if (!ks) {
+			err = -EINVAL;
+			goto error;
+		}
+
+		ksym_name = ks->name;
+		err = hashmap__add(map, ksym_name, 0);
 		if (err == -EEXIST) {
 			err = 0;
 			continue;
@@ -407,8 +532,7 @@ static int get_syms(char ***symsp, size_t *cntp, bool kernel)
 		if (err)
 			goto error;
 
-		syms[cnt++] = name;
-		name = NULL;
+		syms[cnt++] = ksym_name;
 	}
 
 	*symsp = syms;
@@ -418,42 +542,88 @@ error:
 	free(name);
 	fclose(f);
 	hashmap__free(map);
-	if (err) {
-		for (i = 0; i < cnt; i++)
-			free(syms[i]);
+	if (err)
 		free(syms);
+	return err;
+}
+
+static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
+{
+	unsigned long *addr, *addrs, *tmp_addrs;
+	int err = 0, max_cnt, inc_cnt;
+	char *name = NULL;
+	size_t cnt = 0;
+	char buf[256];
+	FILE *f;
+
+	if (access("/sys/kernel/tracing/trace", F_OK) == 0)
+		f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
+	else
+		f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");
+
+	if (!f)
+		return -ENOENT;
+
+	/* In my local setup, the number of entries is 50k+ so Let us initially
+	 * allocate space to hold 64k entries. If 64k is not enough, incrementally
+	 * increase 1k each time.
+	 */
+	max_cnt = 65536;
+	inc_cnt = 1024;
+	addrs = malloc(max_cnt * sizeof(long));
+	if (addrs == NULL) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		if (is_invalid_entry(buf, kernel))
+			continue;
+
+		free(name);
+		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
+			continue;
+		if (skip_entry(name))
+			continue;
+
+		if (cnt == max_cnt) {
+			max_cnt += inc_cnt;
+			tmp_addrs = realloc(addrs, max_cnt);
+			if (!tmp_addrs) {
+				err = -ENOMEM;
+				goto error;
+			}
+			addrs = tmp_addrs;
+		}
+
+		addrs[cnt++] = (unsigned long)addr;
 	}
+
+	*addrsp = addrs;
+	*cntp = cnt;
+
+error:
+	free(name);
+	fclose(f);
+	if (err)
+		free(addrs);
 	return err;
 }
 
-static void test_kprobe_multi_bench_attach(bool kernel)
+static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts)
 {
-	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
-	struct kprobe_multi_empty *skel = NULL;
 	long attach_start_ns, attach_end_ns;
 	long detach_start_ns, detach_end_ns;
 	double attach_delta, detach_delta;
 	struct bpf_link *link = NULL;
-	char **syms = NULL;
-	size_t cnt = 0, i;
-
-	if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
-		return;
-
-	skel = kprobe_multi_empty__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
-		goto cleanup;
-
-	opts.syms = (const char **) syms;
-	opts.cnt = cnt;
 
 	attach_start_ns = get_time_ns();
 	link = bpf_program__attach_kprobe_multi_opts(skel->progs.test_kprobe_empty,
-						     NULL, &opts);
+						     NULL, opts);
 	attach_end_ns = get_time_ns();
 
 	if (!ASSERT_OK_PTR(link, "bpf_program__attach_kprobe_multi_opts"))
-		goto cleanup;
+		return;
 
 	detach_start_ns = get_time_ns();
 	bpf_link__destroy(link);
@@ -462,17 +632,65 @@ static void test_kprobe_multi_bench_attach(bool kernel)
 	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
 	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
 
-	printf("%s: found %lu functions\n", __func__, cnt);
+	printf("%s: found %lu functions\n", __func__, opts->cnt);
 	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
 	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
+}
+
+static void test_kprobe_multi_bench_attach(bool kernel)
+{
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	struct kprobe_multi_empty *skel = NULL;
+	char **syms = NULL;
+	size_t cnt = 0;
+
+	if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
+		return;
+
+	skel = kprobe_multi_empty__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
+		goto cleanup;
+
+	opts.syms = (const char **) syms;
+	opts.cnt = cnt;
+
+	do_bench_test(skel, &opts);
 
 cleanup:
 	kprobe_multi_empty__destroy(skel);
-	if (syms) {
-		for (i = 0; i < cnt; i++)
-			free(syms[i]);
+	if (syms)
 		free(syms);
+}
+
+static void test_kprobe_multi_bench_attach_addr(bool kernel)
+{
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
+	struct kprobe_multi_empty *skel = NULL;
+	unsigned long *addrs = NULL;
+	size_t cnt = 0;
+	int err;
+
+	err = get_addrs(&addrs, &cnt, kernel);
+	if (err == -ENOENT) {
+		test__skip();
+		return;
 	}
+
+	if (!ASSERT_OK(err, "get_addrs"))
+		return;
+
+	skel = kprobe_multi_empty__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
+		goto cleanup;
+
+	opts.addrs = addrs;
+	opts.cnt = cnt;
+
+	do_bench_test(skel, &opts);
+
+cleanup:
+	kprobe_multi_empty__destroy(skel);
+	free(addrs);
 }
 
 static void test_attach_override(void)
@@ -515,6 +733,10 @@ void serial_test_kprobe_multi_bench_attach(void)
 		test_kprobe_multi_bench_attach(true);
 	if (test__start_subtest("modules"))
 		test_kprobe_multi_bench_attach(false);
+	if (test__start_subtest("kernel"))
+		test_kprobe_multi_bench_attach_addr(true);
+	if (test__start_subtest("modules"))
+		test_kprobe_multi_bench_attach_addr(false);
 }
 
 void test_kprobe_multi_test(void)
@@ -538,4 +760,8 @@ void test_kprobe_multi_test(void)
 		test_attach_api_fails();
 	if (test__start_subtest("attach_override"))
 		test_attach_override();
+	if (test__start_subtest("session"))
+		test_session_skel_api();
+	if (test__start_subtest("session_cookie"))
+		test_session_cookie_skel_api();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c
index b295969b263b..dc7aab532fb1 100644
--- a/tools/testing/selftests/bpf/prog_tests/ksyms.c
+++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c
@@ -5,8 +5,6 @@
 #include "test_ksyms.skel.h"
 #include <sys/stat.h>
 
-static int duration;
-
 void test_ksyms(void)
 {
 	const char *btf_path = "/sys/kernel/btf/vmlinux";
@@ -18,43 +16,37 @@ void test_ksyms(void)
 	int err;
 
 	err = kallsyms_find("bpf_link_fops", &link_fops_addr);
-	if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+	if (!ASSERT_NEQ(err, -EINVAL, "bpf_link_fops: kallsyms_fopen"))
 		return;
-	if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n"))
+	if (!ASSERT_NEQ(err, -ENOENT, "bpf_link_fops: ksym_find"))
 		return;
 
 	err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr);
-	if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+	if (!ASSERT_NEQ(err, -EINVAL, "__per_cpu_start: kallsyms_fopen"))
 		return;
-	if (CHECK(err == -ENOENT, "ksym_find", "symbol 'per_cpu_start' not found\n"))
+	if (!ASSERT_NEQ(err, -ENOENT, "__per_cpu_start: ksym_find"))
 		return;
 
-	if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno))
+	if (!ASSERT_OK(stat(btf_path, &st), "stat_btf"))
 		return;
 	btf_size = st.st_size;
 
 	skel = test_ksyms__open_and_load();
-	if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n"))
+	if (!ASSERT_OK_PTR(skel, "test_ksyms__open_and_load"))
 		return;
 
 	err = test_ksyms__attach(skel);
-	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+	if (!ASSERT_OK(err, "test_ksyms__attach"))
 		goto cleanup;
 
 	/* trigger tracepoint */
 	usleep(1);
 
 	data = skel->data;
-	CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops",
-	      "got 0x%llx, exp 0x%llx\n",
-	      data->out__bpf_link_fops, link_fops_addr);
-	CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1",
-	      "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0);
-	CHECK(data->out__btf_size != btf_size, "btf_size",
-	      "got %llu, exp %llu\n", data->out__btf_size, btf_size);
-	CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start",
-	      "got %llu, exp %llu\n", data->out__per_cpu_start,
-	      per_cpu_start_addr);
+	ASSERT_EQ(data->out__bpf_link_fops, link_fops_addr, "bpf_link_fops");
+	ASSERT_EQ(data->out__bpf_link_fops1, 0, "bpf_link_fops1");
+	ASSERT_EQ(data->out__btf_size, btf_size, "btf_size");
+	ASSERT_EQ(data->out__per_cpu_start, per_cpu_start_addr, "__per_cpu_start");
 
 cleanup:
 	test_ksyms__destroy(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c
index f53d658ed080..6d391d95f96e 100644
--- a/tools/testing/selftests/bpf/prog_tests/module_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c
@@ -51,6 +51,10 @@ void test_module_attach(void)
 					     0, "bpf_testmod_test_read");
 	ASSERT_OK(err, "set_attach_target");
 
+	err = bpf_program__set_attach_target(skel->progs.handle_fentry_explicit_manual,
+					     0, "bpf_testmod:bpf_testmod_test_read");
+	ASSERT_OK(err, "set_attach_target_explicit");
+
 	err = test_module_attach__load(skel);
 	if (CHECK(err, "skel_load", "failed to load skeleton\n"))
 		return;
@@ -70,6 +74,8 @@ void test_module_attach(void)
 	ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf");
 	ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry");
 	ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual");
+	ASSERT_EQ(bss->fentry_explicit_read_sz, READ_SZ, "fentry_explicit");
+	ASSERT_EQ(bss->fentry_explicit_manual_read_sz, READ_SZ, "fentry_explicit_manual");
 	ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit");
 	ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet");
 	ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret");
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index 8f8d792307c1..274d2e033e39 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -82,6 +82,22 @@ static void cleanup_netns(struct nstoken *nstoken)
 	SYS_NOFAIL("ip netns del %s", NS_TEST);
 }
 
+static int start_mptcp_server(int family, const char *addr_str, __u16 port,
+			      int timeout_ms)
+{
+	struct network_helper_opts opts = {
+		.timeout_ms	= timeout_ms,
+		.proto		= IPPROTO_MPTCP,
+	};
+	struct sockaddr_storage addr;
+	socklen_t addrlen;
+
+	if (make_sockaddr(family, addr_str, port, &addr, &addrlen))
+		return -1;
+
+	return start_server_addr(SOCK_STREAM, &addr, addrlen, &opts);
+}
+
 static int verify_tsk(int map_fd, int client_fd)
 {
 	int err, cfd = client_fd;
@@ -273,6 +289,8 @@ static int run_mptcpify(int cgroup_fd)
 	if (!ASSERT_OK_PTR(mptcpify_skel, "skel_open_load"))
 		return libbpf_get_error(mptcpify_skel);
 
+	mptcpify_skel->bss->pid = getpid();
+
 	err = mptcpify__attach(mptcpify_skel);
 	if (!ASSERT_OK(err, "skel_attach"))
 		goto out;
diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
index 24d493482ffc..e72d75d6baa7 100644
--- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
+++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
@@ -12,77 +12,229 @@
 #include <sys/wait.h>
 #include <sys/mount.h>
 #include <sys/fcntl.h>
+#include "network_helpers.h"
 
 #define STACK_SIZE (1024 * 1024)
 static char child_stack[STACK_SIZE];
 
-static int test_current_pid_tgid(void *args)
+static int get_pid_tgid(pid_t *pid, pid_t *tgid,
+			struct test_ns_current_pid_tgid__bss *bss)
 {
-	struct test_ns_current_pid_tgid__bss  *bss;
-	struct test_ns_current_pid_tgid *skel;
-	int err = -1, duration = 0;
-	pid_t tgid, pid;
 	struct stat st;
+	int err;
 
-	skel = test_ns_current_pid_tgid__open_and_load();
-	if (CHECK(!skel, "skel_open_load", "failed to load skeleton\n"))
-		goto cleanup;
-
-	pid = syscall(SYS_gettid);
-	tgid = getpid();
+	*pid = syscall(SYS_gettid);
+	*tgid = getpid();
 
 	err = stat("/proc/self/ns/pid", &st);
-	if (CHECK(err, "stat", "failed /proc/self/ns/pid: %d\n", err))
-		goto cleanup;
+	if (!ASSERT_OK(err, "stat /proc/self/ns/pid"))
+		return err;
 
-	bss = skel->bss;
 	bss->dev = st.st_dev;
 	bss->ino = st.st_ino;
 	bss->user_pid = 0;
 	bss->user_tgid = 0;
+	return 0;
+}
+
+static int test_current_pid_tgid_tp(void *args)
+{
+	struct test_ns_current_pid_tgid__bss  *bss;
+	struct test_ns_current_pid_tgid *skel;
+	int ret = -1, err;
+	pid_t tgid, pid;
+
+	skel = test_ns_current_pid_tgid__open();
+	if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open"))
+		return ret;
+
+	bpf_program__set_autoload(skel->progs.tp_handler, true);
+
+	err = test_ns_current_pid_tgid__load(skel);
+	if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load"))
+		goto cleanup;
+
+	bss = skel->bss;
+	if (get_pid_tgid(&pid, &tgid, bss))
+		goto cleanup;
 
 	err = test_ns_current_pid_tgid__attach(skel);
-	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+	if (!ASSERT_OK(err, "test_ns_current_pid_tgid__attach"))
 		goto cleanup;
 
 	/* trigger tracepoint */
 	usleep(1);
-	ASSERT_EQ(bss->user_pid, pid, "pid");
-	ASSERT_EQ(bss->user_tgid, tgid, "tgid");
-	err = 0;
+	if (!ASSERT_EQ(bss->user_pid, pid, "pid"))
+		goto cleanup;
+	if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid"))
+		goto cleanup;
+	ret = 0;
+
+cleanup:
+	test_ns_current_pid_tgid__destroy(skel);
+	return ret;
+}
+
+static int test_current_pid_tgid_cgrp(void *args)
+{
+	struct test_ns_current_pid_tgid__bss *bss;
+	struct test_ns_current_pid_tgid *skel;
+	int server_fd = -1, ret = -1, err;
+	int cgroup_fd = *(int *)args;
+	pid_t tgid, pid;
+
+	skel = test_ns_current_pid_tgid__open();
+	if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open"))
+		return ret;
+
+	bpf_program__set_autoload(skel->progs.cgroup_bind4, true);
+
+	err = test_ns_current_pid_tgid__load(skel);
+	if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load"))
+		goto cleanup;
+
+	bss = skel->bss;
+	if (get_pid_tgid(&pid, &tgid, bss))
+		goto cleanup;
+
+	skel->links.cgroup_bind4 = bpf_program__attach_cgroup(
+		skel->progs.cgroup_bind4, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.cgroup_bind4, "bpf_program__attach_cgroup"))
+		goto cleanup;
+
+	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
+		goto cleanup;
+
+	if (!ASSERT_EQ(bss->user_pid, pid, "pid"))
+		goto cleanup;
+	if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid"))
+		goto cleanup;
+	ret = 0;
 
 cleanup:
-	 test_ns_current_pid_tgid__destroy(skel);
+	if (server_fd >= 0)
+		close(server_fd);
+	test_ns_current_pid_tgid__destroy(skel);
+	return ret;
+}
+
+static int test_current_pid_tgid_sk_msg(void *args)
+{
+	int verdict, map, server_fd = -1, client_fd = -1;
+	struct test_ns_current_pid_tgid__bss *bss;
+	static const char send_msg[] = "message";
+	struct test_ns_current_pid_tgid *skel;
+	int ret = -1, err, key = 0;
+	pid_t tgid, pid;
+
+	skel = test_ns_current_pid_tgid__open();
+	if (!ASSERT_OK_PTR(skel, "test_ns_current_pid_tgid__open"))
+		return ret;
+
+	bpf_program__set_autoload(skel->progs.sk_msg, true);
+
+	err = test_ns_current_pid_tgid__load(skel);
+	if (!ASSERT_OK(err, "test_ns_current_pid_tgid__load"))
+		goto cleanup;
+
+	bss = skel->bss;
+	if (get_pid_tgid(&pid, &tgid, skel->bss))
+		goto cleanup;
+
+	verdict = bpf_program__fd(skel->progs.sk_msg);
+	map = bpf_map__fd(skel->maps.sock_map);
+	err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0);
+	if (!ASSERT_OK(err, "prog_attach"))
+		goto cleanup;
+
+	server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
+		goto cleanup;
 
-	return err;
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+		goto cleanup;
+
+	err = bpf_map_update_elem(map, &key, &client_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto cleanup;
+
+	err = send(client_fd, send_msg, sizeof(send_msg), 0);
+	if (!ASSERT_EQ(err, sizeof(send_msg), "send(msg)"))
+		goto cleanup;
+
+	if (!ASSERT_EQ(bss->user_pid, pid, "pid"))
+		goto cleanup;
+	if (!ASSERT_EQ(bss->user_tgid, tgid, "tgid"))
+		goto cleanup;
+	ret = 0;
+
+cleanup:
+	if (server_fd >= 0)
+		close(server_fd);
+	if (client_fd >= 0)
+		close(client_fd);
+	test_ns_current_pid_tgid__destroy(skel);
+	return ret;
 }
 
-static void test_ns_current_pid_tgid_new_ns(void)
+static void test_ns_current_pid_tgid_new_ns(int (*fn)(void *), void *arg)
 {
-	int wstatus, duration = 0;
+	int wstatus;
 	pid_t cpid;
 
 	/* Create a process in a new namespace, this process
 	 * will be the init process of this new namespace hence will be pid 1.
 	 */
-	cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE,
-		     CLONE_NEWPID | SIGCHLD, NULL);
+	cpid = clone(fn, child_stack + STACK_SIZE,
+		     CLONE_NEWPID | SIGCHLD, arg);
 
-	if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno)))
+	if (!ASSERT_NEQ(cpid, -1, "clone"))
 		return;
 
-	if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno)))
+	if (!ASSERT_NEQ(waitpid(cpid, &wstatus, 0), -1, "waitpid"))
 		return;
 
-	if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed"))
+	if (!ASSERT_OK(WEXITSTATUS(wstatus), "newns_pidtgid"))
 		return;
 }
 
+static void test_in_netns(int (*fn)(void *), void *arg)
+{
+	struct nstoken *nstoken = NULL;
+
+	SYS(cleanup, "ip netns add ns_current_pid_tgid");
+	SYS(cleanup, "ip -net ns_current_pid_tgid link set dev lo up");
+
+	nstoken = open_netns("ns_current_pid_tgid");
+	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+		goto cleanup;
+
+	test_ns_current_pid_tgid_new_ns(fn, arg);
+
+cleanup:
+	if (nstoken)
+		close_netns(nstoken);
+	SYS_NOFAIL("ip netns del ns_current_pid_tgid");
+}
+
 /* TODO: use a different tracepoint */
 void serial_test_ns_current_pid_tgid(void)
 {
-	if (test__start_subtest("ns_current_pid_tgid_root_ns"))
-		test_current_pid_tgid(NULL);
-	if (test__start_subtest("ns_current_pid_tgid_new_ns"))
-		test_ns_current_pid_tgid_new_ns();
+	if (test__start_subtest("root_ns_tp"))
+		test_current_pid_tgid_tp(NULL);
+	if (test__start_subtest("new_ns_tp"))
+		test_ns_current_pid_tgid_new_ns(test_current_pid_tgid_tp, NULL);
+	if (test__start_subtest("new_ns_cgrp")) {
+		int cgroup_fd = -1;
+
+		cgroup_fd = test__join_cgroup("/sock_addr");
+		if (ASSERT_GE(cgroup_fd, 0, "join_cgroup")) {
+			test_in_netns(test_current_pid_tgid_cgrp, &cgroup_fd);
+			close(cgroup_fd);
+		}
+	}
+	if (test__start_subtest("new_ns_sk_msg"))
+		test_in_netns(test_current_pid_tgid_sk_msg, NULL);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_skip.c b/tools/testing/selftests/bpf/prog_tests/perf_skip.c
new file mode 100644
index 000000000000..37d8618800e4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_skip.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <test_progs.h>
+#include "test_perf_skip.skel.h"
+#include <linux/compiler.h>
+#include <linux/hw_breakpoint.h>
+#include <sys/mman.h>
+
+#ifndef TRAP_PERF
+#define TRAP_PERF 6
+#endif
+
+int sigio_count, sigtrap_count;
+
+static void handle_sigio(int sig __always_unused)
+{
+	++sigio_count;
+}
+
+static void handle_sigtrap(int signum __always_unused,
+			   siginfo_t *info,
+			   void *ucontext __always_unused)
+{
+	ASSERT_EQ(info->si_code, TRAP_PERF, "si_code");
+	++sigtrap_count;
+}
+
+static noinline int test_function(void)
+{
+	asm volatile ("");
+	return 0;
+}
+
+void serial_test_perf_skip(void)
+{
+	struct sigaction action = {};
+	struct sigaction previous_sigtrap;
+	sighandler_t previous_sigio = SIG_ERR;
+	struct test_perf_skip *skel = NULL;
+	struct perf_event_attr attr = {};
+	int perf_fd = -1;
+	int err;
+	struct f_owner_ex owner;
+	struct bpf_link *prog_link = NULL;
+
+	action.sa_flags = SA_SIGINFO | SA_NODEFER;
+	action.sa_sigaction = handle_sigtrap;
+	sigemptyset(&action.sa_mask);
+	if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction"))
+		return;
+
+	previous_sigio = signal(SIGIO, handle_sigio);
+	if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal"))
+		goto cleanup;
+
+	skel = test_perf_skip__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_load"))
+		goto cleanup;
+
+	attr.type = PERF_TYPE_BREAKPOINT;
+	attr.size = sizeof(attr);
+	attr.bp_type = HW_BREAKPOINT_X;
+	attr.bp_addr = (uintptr_t)test_function;
+	attr.bp_len = sizeof(long);
+	attr.sample_period = 1;
+	attr.sample_type = PERF_SAMPLE_IP;
+	attr.pinned = 1;
+	attr.exclude_kernel = 1;
+	attr.exclude_hv = 1;
+	attr.precise_ip = 3;
+	attr.sigtrap = 1;
+	attr.remove_on_exec = 1;
+
+	perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
+	if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
+		printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n");
+		test__skip();
+		goto cleanup;
+	}
+	if (!ASSERT_OK(perf_fd < 0, "perf_event_open"))
+		goto cleanup;
+
+	/* Configure the perf event to signal on sample. */
+	err = fcntl(perf_fd, F_SETFL, O_ASYNC);
+	if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)"))
+		goto cleanup;
+
+	owner.type = F_OWNER_TID;
+	owner.pid = syscall(__NR_gettid);
+	err = fcntl(perf_fd, F_SETOWN_EX, &owner);
+	if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)"))
+		goto cleanup;
+
+	/* Allow at most one sample. A sample rejected by bpf should
+	 * not count against this.
+	 */
+	err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
+	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)"))
+		goto cleanup;
+
+	prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd);
+	if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event"))
+		goto cleanup;
+
+	/* Configure the bpf program to suppress the sample. */
+	skel->bss->ip = (uintptr_t)test_function;
+	test_function();
+
+	ASSERT_EQ(sigio_count, 0, "sigio_count");
+	ASSERT_EQ(sigtrap_count, 0, "sigtrap_count");
+
+	/* Configure the bpf program to allow the sample. */
+	skel->bss->ip = 0;
+	test_function();
+
+	ASSERT_EQ(sigio_count, 1, "sigio_count");
+	ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
+
+	/* Test that the sample above is the only one allowed (by perf, not
+	 * by bpf)
+	 */
+	test_function();
+
+	ASSERT_EQ(sigio_count, 1, "sigio_count");
+	ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
+
+cleanup:
+	bpf_link__destroy(prog_link);
+	if (perf_fd >= 0)
+		close(perf_fd);
+	test_perf_skip__destroy(skel);
+
+	if (previous_sigio != SIG_ERR)
+		signal(SIGIO, previous_sigio);
+	sigaction(SIGTRAP, &previous_sigtrap, NULL);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/preempt_lock.c b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c
new file mode 100644
index 000000000000..02917c672441
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/preempt_lock.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <preempt_lock.skel.h>
+
+void test_preempt_lock(void)
+{
+	RUN_TESTS(preempt_lock);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
index 48c5695b7abf..4c6f42dae409 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
@@ -13,6 +13,7 @@
 #include <linux/perf_event.h>
 #include <linux/ring_buffer.h>
 #include "test_ringbuf.lskel.h"
+#include "test_ringbuf_n.lskel.h"
 #include "test_ringbuf_map_key.lskel.h"
 
 #define EDONE 7777
@@ -326,6 +327,68 @@ cleanup:
 	test_ringbuf_lskel__destroy(skel);
 }
 
+/*
+ * Test ring_buffer__consume_n() by producing N_TOT_SAMPLES samples in the ring
+ * buffer, via getpid(), and consuming them in chunks of N_SAMPLES.
+ */
+#define N_TOT_SAMPLES	32
+#define N_SAMPLES	4
+
+/* Sample value to verify the callback validity */
+#define SAMPLE_VALUE	42L
+
+static int process_n_sample(void *ctx, void *data, size_t len)
+{
+	struct sample *s = data;
+
+	ASSERT_EQ(s->value, SAMPLE_VALUE, "sample_value");
+
+	return 0;
+}
+
+static void ringbuf_n_subtest(void)
+{
+	struct test_ringbuf_n_lskel *skel_n;
+	int err, i;
+
+	skel_n = test_ringbuf_n_lskel__open();
+	if (!ASSERT_OK_PTR(skel_n, "test_ringbuf_n_lskel__open"))
+		return;
+
+	skel_n->maps.ringbuf.max_entries = getpagesize();
+	skel_n->bss->pid = getpid();
+
+	err = test_ringbuf_n_lskel__load(skel_n);
+	if (!ASSERT_OK(err, "test_ringbuf_n_lskel__load"))
+		goto cleanup;
+
+	ringbuf = ring_buffer__new(skel_n->maps.ringbuf.map_fd,
+				   process_n_sample, NULL, NULL);
+	if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
+		goto cleanup;
+
+	err = test_ringbuf_n_lskel__attach(skel_n);
+	if (!ASSERT_OK(err, "test_ringbuf_n_lskel__attach"))
+		goto cleanup_ringbuf;
+
+	/* Produce N_TOT_SAMPLES samples in the ring buffer by calling getpid() */
+	skel_n->bss->value = SAMPLE_VALUE;
+	for (i = 0; i < N_TOT_SAMPLES; i++)
+		syscall(__NR_getpgid);
+
+	/* Consume all samples from the ring buffer in batches of N_SAMPLES */
+	for (i = 0; i < N_TOT_SAMPLES; i += err) {
+		err = ring_buffer__consume_n(ringbuf, N_SAMPLES);
+		if (!ASSERT_EQ(err, N_SAMPLES, "rb_consume"))
+			goto cleanup_ringbuf;
+	}
+
+cleanup_ringbuf:
+	ring_buffer__free(ringbuf);
+cleanup:
+	test_ringbuf_n_lskel__destroy(skel_n);
+}
+
 static int process_map_key_sample(void *ctx, void *data, size_t len)
 {
 	struct sample *s;
@@ -384,6 +447,8 @@ void test_ringbuf(void)
 {
 	if (test__start_subtest("ringbuf"))
 		ringbuf_subtest();
+	if (test__start_subtest("ringbuf_n"))
+		ringbuf_n_subtest();
 	if (test__start_subtest("ringbuf_map_key"))
 		ringbuf_map_key_subtest();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index b15b343ebb6b..920aee41bd58 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -179,7 +179,7 @@ static void test_send_signal_nmi(bool signal_thread)
 	pmu_fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */,
 			 -1 /* cpu */, -1 /* group_fd */, 0 /* flags */);
 	if (pmu_fd == -1) {
-		if (errno == ENOENT) {
+		if (errno == ENOENT || errno == EOPNOTSUPP) {
 			printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n",
 			       __func__);
 			test__skip();
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
index 1374b626a985..0b9bd1d6f7cc 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -15,6 +15,7 @@
 #include <unistd.h>
 
 #include "test_progs.h"
+#include "network_helpers.h"
 
 #define BIND_PORT 1234
 #define CONNECT_PORT 4321
@@ -22,8 +23,6 @@
 #define NS_SELF "/proc/self/ns/net"
 #define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map"
 
-static const struct timeval timeo_sec = { .tv_sec = 3 };
-static const size_t timeo_optlen = sizeof(timeo_sec);
 static int stop, duration;
 
 static bool
@@ -73,52 +72,6 @@ configure_stack(void)
 	return true;
 }
 
-static int
-start_server(const struct sockaddr *addr, socklen_t len, int type)
-{
-	int fd;
-
-	fd = socket(addr->sa_family, type, 0);
-	if (CHECK_FAIL(fd == -1))
-		goto out;
-	if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
-				  timeo_optlen)))
-		goto close_out;
-	if (CHECK_FAIL(bind(fd, addr, len) == -1))
-		goto close_out;
-	if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1))
-		goto close_out;
-
-	goto out;
-close_out:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
-
-static int
-connect_to_server(const struct sockaddr *addr, socklen_t len, int type)
-{
-	int fd = -1;
-
-	fd = socket(addr->sa_family, type, 0);
-	if (CHECK_FAIL(fd == -1))
-		goto out;
-	if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
-				  timeo_optlen)))
-		goto close_out;
-	if (CHECK_FAIL(connect(fd, addr, len)))
-		goto close_out;
-
-	goto out;
-close_out:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
-
 static in_port_t
 get_port(int fd)
 {
@@ -161,7 +114,7 @@ run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type)
 	in_port_t port;
 	int ret = 1;
 
-	client = connect_to_server(addr, len, type);
+	client = connect_to_addr(type, (struct sockaddr_storage *)addr, len, NULL);
 	if (client == -1) {
 		perror("Cannot connect to server");
 		goto out;
@@ -310,7 +263,9 @@ void test_sk_assign(void)
 			continue;
 		prepare_addr(test->addr, test->family, BIND_PORT, false);
 		addr = (const struct sockaddr *)test->addr;
-		server = start_server(addr, test->len, test->type);
+		server = start_server_addr(test->type,
+					   (const struct sockaddr_storage *)addr,
+					   test->len, NULL);
 		if (server == -1)
 			goto close;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c
index 5fd617718991..b880c564a204 100644
--- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c
+++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c
@@ -3,16 +3,56 @@
 
 #include "test_progs.h"
 
+#include "sock_addr_kern.skel.h"
+#include "bind4_prog.skel.h"
+#include "bind6_prog.skel.h"
 #include "connect_unix_prog.skel.h"
+#include "connect4_prog.skel.h"
+#include "connect6_prog.skel.h"
+#include "sendmsg4_prog.skel.h"
+#include "sendmsg6_prog.skel.h"
+#include "recvmsg4_prog.skel.h"
+#include "recvmsg6_prog.skel.h"
 #include "sendmsg_unix_prog.skel.h"
 #include "recvmsg_unix_prog.skel.h"
+#include "getsockname4_prog.skel.h"
+#include "getsockname6_prog.skel.h"
 #include "getsockname_unix_prog.skel.h"
+#include "getpeername4_prog.skel.h"
+#include "getpeername6_prog.skel.h"
 #include "getpeername_unix_prog.skel.h"
 #include "network_helpers.h"
 
+#ifndef ENOTSUPP
+# define ENOTSUPP 524
+#endif
+
+#define TEST_NS                 "sock_addr"
+#define TEST_IF_PREFIX          "test_sock_addr"
+#define TEST_IPV4               "127.0.0.4"
+#define TEST_IPV6               "::6"
+
+#define SERV4_IP                "192.168.1.254"
+#define SERV4_REWRITE_IP        "127.0.0.1"
+#define SRC4_IP                 "172.16.0.1"
+#define SRC4_REWRITE_IP         TEST_IPV4
+#define SERV4_PORT              4040
+#define SERV4_REWRITE_PORT      4444
+
+#define SERV6_IP                "face:b00c:1234:5678::abcd"
+#define SERV6_REWRITE_IP        "::1"
+#define SERV6_V4MAPPED_IP       "::ffff:192.168.0.4"
+#define SRC6_IP                 "::1"
+#define SRC6_REWRITE_IP         TEST_IPV6
+#define WILDCARD6_IP            "::"
+#define SERV6_PORT              6060
+#define SERV6_REWRITE_PORT      6666
+
 #define SERVUN_ADDRESS         "bpf_cgroup_unix_test"
 #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite"
-#define SRCUN_ADDRESS	       "bpf_cgroup_unix_test_src"
+#define SRCUN_ADDRESS          "bpf_cgroup_unix_test_src"
+
+#define save_errno_do(op) ({ int __save = errno; op; errno = __save; })
 
 enum sock_addr_test_type {
 	SOCK_ADDR_TEST_BIND,
@@ -23,152 +63,955 @@ enum sock_addr_test_type {
 	SOCK_ADDR_TEST_GETPEERNAME,
 };
 
-typedef void *(*load_fn)(int cgroup_fd);
+typedef void *(*load_fn)(int cgroup_fd,
+			 enum bpf_attach_type attach_type,
+			 bool expect_reject);
 typedef void (*destroy_fn)(void *skel);
 
-struct sock_addr_test {
-	enum sock_addr_test_type type;
-	const char *name;
-	/* BPF prog properties */
-	load_fn loadfn;
-	destroy_fn destroyfn;
-	/* Socket properties */
-	int socket_family;
-	int socket_type;
-	/* IP:port pairs for BPF prog to override */
-	const char *requested_addr;
-	unsigned short requested_port;
-	const char *expected_addr;
-	unsigned short expected_port;
-	const char *expected_src_addr;
+static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len,
+		    const struct sockaddr_storage *addr2, socklen_t addr2_len,
+		    bool cmp_port);
+
+struct init_sock_args {
+	int af;
+	int type;
 };
 
-static void *connect_unix_prog_load(int cgroup_fd)
-{
-	struct connect_unix_prog *skel;
+struct addr_args {
+	char addr[sizeof(struct sockaddr_storage)];
+	int addrlen;
+};
 
-	skel = connect_unix_prog__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "skel_open"))
-		goto cleanup;
+struct sendmsg_args {
+	struct addr_args addr;
+	char msg[10];
+	int msglen;
+};
 
-	skel->links.connect_unix_prog = bpf_program__attach_cgroup(
-		skel->progs.connect_unix_prog, cgroup_fd);
-	if (!ASSERT_OK_PTR(skel->links.connect_unix_prog, "prog_attach"))
-		goto cleanup;
+static struct sock_addr_kern *skel;
 
-	return skel;
-cleanup:
-	connect_unix_prog__destroy(skel);
-	return NULL;
+static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_program *prog;
+	int prog_fd, err;
+
+	topts.ctx_in = ctx;
+	topts.ctx_size_in = ctx_size;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto err;
+
+	prog_fd = bpf_program__fd(prog);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, prog_name))
+		goto err;
+
+	err = topts.retval;
+	errno = -topts.retval;
+	goto out;
+err:
+	err = -1;
+out:
+	return err;
 }
 
-static void connect_unix_prog_destroy(void *skel)
+static int kernel_init_sock(int af, int type, int protocol)
 {
-	connect_unix_prog__destroy(skel);
+	struct init_sock_args args = {
+		.af = af,
+		.type = type,
+	};
+
+	return run_bpf_prog("init_sock", &args, sizeof(args));
 }
 
-static void *sendmsg_unix_prog_load(int cgroup_fd)
+static int kernel_close_sock(int fd)
 {
-	struct sendmsg_unix_prog *skel;
+	return run_bpf_prog("close_sock", NULL, 0);
+}
 
-	skel = sendmsg_unix_prog__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "skel_open"))
-		goto cleanup;
+static int sock_addr_op(const char *name, struct sockaddr *addr,
+			socklen_t *addrlen, bool expect_change)
+{
+	struct addr_args args;
+	int err;
 
-	skel->links.sendmsg_unix_prog = bpf_program__attach_cgroup(
-		skel->progs.sendmsg_unix_prog, cgroup_fd);
-	if (!ASSERT_OK_PTR(skel->links.sendmsg_unix_prog, "prog_attach"))
-		goto cleanup;
+	if (addrlen)
+		args.addrlen = *addrlen;
 
-	return skel;
-cleanup:
-	sendmsg_unix_prog__destroy(skel);
-	return NULL;
+	if (addr)
+		memcpy(&args.addr, addr, *addrlen);
+
+	err = run_bpf_prog(name, &args, sizeof(args));
+
+	if (!expect_change && addr)
+		if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr,
+					*addrlen,
+					(struct sockaddr_storage *)&args.addr,
+					args.addrlen, 1),
+			       0, "address_param_modified"))
+			return -1;
+
+	if (addrlen)
+		*addrlen = args.addrlen;
+
+	if (addr)
+		memcpy(addr, &args.addr, *addrlen);
+
+	return err;
 }
 
-static void sendmsg_unix_prog_destroy(void *skel)
+static int send_msg_op(const char *name, struct sockaddr *addr,
+		       socklen_t addrlen, const char *msg, int msglen)
 {
-	sendmsg_unix_prog__destroy(skel);
+	struct sendmsg_args args;
+	int err;
+
+	memset(&args, 0, sizeof(args));
+	memcpy(&args.addr.addr, addr, addrlen);
+	args.addr.addrlen = addrlen;
+	memcpy(args.msg, msg, msglen);
+	args.msglen = msglen;
+
+	err = run_bpf_prog(name, &args, sizeof(args));
+
+	if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr,
+				addrlen,
+				(struct sockaddr_storage *)&args.addr.addr,
+				args.addr.addrlen, 1),
+		       0, "address_param_modified"))
+		return -1;
+
+	return err;
 }
 
-static void *recvmsg_unix_prog_load(int cgroup_fd)
+static int kernel_connect(struct sockaddr *addr, socklen_t addrlen)
 {
-	struct recvmsg_unix_prog *skel;
-
-	skel = recvmsg_unix_prog__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "skel_open"))
-		goto cleanup;
+	return sock_addr_op("kernel_connect", addr, &addrlen, false);
+}
 
-	skel->links.recvmsg_unix_prog = bpf_program__attach_cgroup(
-		skel->progs.recvmsg_unix_prog, cgroup_fd);
-	if (!ASSERT_OK_PTR(skel->links.recvmsg_unix_prog, "prog_attach"))
-		goto cleanup;
+static int kernel_bind(int fd, struct sockaddr *addr, socklen_t addrlen)
+{
+	return sock_addr_op("kernel_bind", addr, &addrlen, false);
+}
 
-	return skel;
-cleanup:
-	recvmsg_unix_prog__destroy(skel);
-	return NULL;
+static int kernel_listen(void)
+{
+	return sock_addr_op("kernel_listen", NULL, NULL, false);
 }
 
-static void recvmsg_unix_prog_destroy(void *skel)
+static int kernel_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen,
+			  char *msg, int msglen)
 {
-	recvmsg_unix_prog__destroy(skel);
+	return send_msg_op("kernel_sendmsg", addr, addrlen, msg, msglen);
 }
 
-static void *getsockname_unix_prog_load(int cgroup_fd)
+static int sock_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen,
+			char *msg, int msglen)
 {
-	struct getsockname_unix_prog *skel;
+	return send_msg_op("sock_sendmsg", addr, addrlen, msg, msglen);
+}
 
-	skel = getsockname_unix_prog__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "skel_open"))
-		goto cleanup;
+static int kernel_getsockname(int fd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	return sock_addr_op("kernel_getsockname", addr, addrlen, true);
+}
 
-	skel->links.getsockname_unix_prog = bpf_program__attach_cgroup(
-		skel->progs.getsockname_unix_prog, cgroup_fd);
-	if (!ASSERT_OK_PTR(skel->links.getsockname_unix_prog, "prog_attach"))
-		goto cleanup;
+static int kernel_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen)
+{
+	return sock_addr_op("kernel_getpeername", addr, addrlen, true);
+}
 
-	return skel;
-cleanup:
-	getsockname_unix_prog__destroy(skel);
-	return NULL;
+int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen,
+			   const struct network_helper_opts *opts)
+{
+	int err;
+
+	if (!ASSERT_OK(kernel_init_sock(addr->ss_family, type, 0),
+		       "kernel_init_sock"))
+		goto err;
+
+	if (kernel_connect((struct sockaddr *)addr, addrlen) < 0)
+		goto err;
+
+	/* Test code expects a "file descriptor" on success. */
+	err = 1;
+	goto out;
+err:
+	err = -1;
+	save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"));
+out:
+	return err;
 }
 
-static void getsockname_unix_prog_destroy(void *skel)
+int kernel_start_server(int family, int type, const char *addr_str, __u16 port,
+			int timeout_ms)
 {
-	getsockname_unix_prog__destroy(skel);
+	struct sockaddr_storage addr;
+	socklen_t addrlen;
+	int err;
+
+	if (!ASSERT_OK(kernel_init_sock(family, type, 0), "kernel_init_sock"))
+		goto err;
+
+	if (make_sockaddr(family, addr_str, port, &addr, &addrlen))
+		goto err;
+
+	if (kernel_bind(0, (struct sockaddr *)&addr, addrlen) < 0)
+		goto err;
+
+	if (type == SOCK_STREAM) {
+		if (!ASSERT_OK(kernel_listen(), "kernel_listen"))
+			goto err;
+	}
+
+	/* Test code expects a "file descriptor" on success. */
+	err = 1;
+	goto out;
+err:
+	err = -1;
+	save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"));
+out:
+	return err;
 }
 
-static void *getpeername_unix_prog_load(int cgroup_fd)
+struct sock_ops {
+	int (*connect_to_addr)(int type, const struct sockaddr_storage *addr,
+			       socklen_t addrlen,
+			       const struct network_helper_opts *opts);
+	int (*start_server)(int family, int type, const char *addr_str,
+			    __u16 port, int timeout_ms);
+	int (*socket)(int famil, int type, int protocol);
+	int (*bind)(int fd, struct sockaddr *addr, socklen_t addrlen);
+	int (*getsockname)(int fd, struct sockaddr *addr, socklen_t *addrlen);
+	int (*getpeername)(int fd, struct sockaddr *addr, socklen_t *addrlen);
+	int (*sendmsg)(int fd, struct sockaddr *addr, socklen_t addrlen,
+		       char *msg, int msglen);
+	int (*close)(int fd);
+};
+
+static int user_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen,
+			char *msg, int msglen)
 {
-	struct getpeername_unix_prog *skel;
+	struct msghdr hdr;
+	struct iovec iov;
 
-	skel = getpeername_unix_prog__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "skel_open"))
-		goto cleanup;
+	memset(&iov, 0, sizeof(iov));
+	iov.iov_base = msg;
+	iov.iov_len = msglen;
 
-	skel->links.getpeername_unix_prog = bpf_program__attach_cgroup(
-		skel->progs.getpeername_unix_prog, cgroup_fd);
-	if (!ASSERT_OK_PTR(skel->links.getpeername_unix_prog, "prog_attach"))
-		goto cleanup;
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = (void *)addr;
+	hdr.msg_namelen = addrlen;
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
 
-	return skel;
-cleanup:
-	getpeername_unix_prog__destroy(skel);
-	return NULL;
+	return sendmsg(fd, &hdr, 0);
 }
 
-static void getpeername_unix_prog_destroy(void *skel)
+static int user_bind(int fd, struct sockaddr *addr, socklen_t addrlen)
 {
-	getpeername_unix_prog__destroy(skel);
+	return bind(fd, (const struct sockaddr *)addr, addrlen);
+}
+
+struct sock_ops user_ops = {
+	.connect_to_addr = connect_to_addr,
+	.start_server = start_server,
+	.socket = socket,
+	.bind = user_bind,
+	.getsockname = getsockname,
+	.getpeername = getpeername,
+	.sendmsg = user_sendmsg,
+	.close = close,
+};
+
+struct sock_ops kern_ops_sock_sendmsg = {
+	.connect_to_addr = kernel_connect_to_addr,
+	.start_server = kernel_start_server,
+	.socket = kernel_init_sock,
+	.bind = kernel_bind,
+	.getsockname = kernel_getsockname,
+	.getpeername = kernel_getpeername,
+	.sendmsg = sock_sendmsg,
+	.close = kernel_close_sock,
+};
+
+struct sock_ops kern_ops_kernel_sendmsg = {
+	.connect_to_addr = kernel_connect_to_addr,
+	.start_server = kernel_start_server,
+	.socket = kernel_init_sock,
+	.bind = kernel_bind,
+	.getsockname = kernel_getsockname,
+	.getpeername = kernel_getpeername,
+	.sendmsg = kernel_sendmsg,
+	.close = kernel_close_sock,
+};
+
+struct sock_addr_test {
+	enum sock_addr_test_type type;
+	const char *name;
+	/* BPF prog properties */
+	load_fn loadfn;
+	destroy_fn destroyfn;
+	enum bpf_attach_type attach_type;
+	/* Socket operations */
+	struct sock_ops *ops;
+	/* Socket properties */
+	int socket_family;
+	int socket_type;
+	/* IP:port pairs for BPF prog to override */
+	const char *requested_addr;
+	unsigned short requested_port;
+	const char *expected_addr;
+	unsigned short expected_port;
+	const char *expected_src_addr;
+	/* Expected test result */
+	enum {
+		LOAD_REJECT,
+		ATTACH_REJECT,
+		SYSCALL_EPERM,
+		SYSCALL_ENOTSUPP,
+		SUCCESS,
+	} expected_result;
+};
+
+#define BPF_SKEL_FUNCS_RAW(skel_name, prog_name) \
+static void *prog_name##_load_raw(int cgroup_fd, \
+				  enum bpf_attach_type attach_type, \
+				  bool expect_reject) \
+{ \
+	struct skel_name *skel = skel_name##__open(); \
+	int prog_fd = -1; \
+	if (!ASSERT_OK_PTR(skel, "skel_open")) \
+		goto cleanup; \
+	if (!ASSERT_OK(skel_name##__load(skel), "load")) \
+		goto cleanup; \
+	prog_fd = bpf_program__fd(skel->progs.prog_name); \
+	if (!ASSERT_GT(prog_fd, 0, "prog_fd")) \
+		goto cleanup; \
+	if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, \
+			      BPF_F_ALLOW_OVERRIDE), "bpf_prog_attach") { \
+		ASSERT_TRUE(expect_reject, "unexpected rejection"); \
+		goto cleanup; \
+	} \
+	if (!ASSERT_FALSE(expect_reject, "expected rejection")) \
+		goto cleanup; \
+cleanup: \
+	if (prog_fd > 0) \
+		bpf_prog_detach(cgroup_fd, attach_type); \
+	skel_name##__destroy(skel); \
+	return NULL; \
+} \
+static void prog_name##_destroy_raw(void *progfd) \
+{ \
+	/* No-op. *_load_raw does all cleanup. */ \
+} \
+
+#define BPF_SKEL_FUNCS(skel_name, prog_name) \
+static void *prog_name##_load(int cgroup_fd, \
+			      enum bpf_attach_type attach_type, \
+			      bool expect_reject) \
+{ \
+	struct skel_name *skel = skel_name##__open(); \
+	if (!ASSERT_OK_PTR(skel, "skel_open")) \
+		goto cleanup; \
+	if (!ASSERT_OK(bpf_program__set_expected_attach_type(skel->progs.prog_name, \
+							     attach_type), \
+		       "set_expected_attach_type")) \
+		goto cleanup; \
+	if (skel_name##__load(skel)) { \
+		ASSERT_TRUE(expect_reject, "unexpected rejection"); \
+		goto cleanup; \
+	} \
+	if (!ASSERT_FALSE(expect_reject, "expected rejection")) \
+		goto cleanup; \
+	skel->links.prog_name = bpf_program__attach_cgroup( \
+		skel->progs.prog_name, cgroup_fd); \
+	if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \
+		goto cleanup; \
+	return skel; \
+cleanup: \
+	skel_name##__destroy(skel); \
+	return NULL; \
+} \
+static void prog_name##_destroy(void *skel) \
+{ \
+	skel_name##__destroy(skel); \
 }
 
+BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog);
+BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog);
+BPF_SKEL_FUNCS(bind4_prog, bind_v4_deny_prog);
+BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog);
+BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog);
+BPF_SKEL_FUNCS(bind6_prog, bind_v6_deny_prog);
+BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog);
+BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog);
+BPF_SKEL_FUNCS(connect4_prog, connect_v4_deny_prog);
+BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog);
+BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog);
+BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog);
+BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog);
+BPF_SKEL_FUNCS_RAW(connect_unix_prog, connect_unix_prog);
+BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog);
+BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog);
+BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog);
+BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog);
+BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog);
+BPF_SKEL_FUNCS_RAW(sendmsg6_prog, sendmsg_v6_prog);
+BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog);
+BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog);
+BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog);
+BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog);
+BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog);
+BPF_SKEL_FUNCS_RAW(sendmsg_unix_prog, sendmsg_unix_prog);
+BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog);
+BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog);
+BPF_SKEL_FUNCS_RAW(recvmsg4_prog, recvmsg4_prog);
+BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog);
+BPF_SKEL_FUNCS_RAW(recvmsg6_prog, recvmsg6_prog);
+BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog);
+BPF_SKEL_FUNCS_RAW(recvmsg_unix_prog, recvmsg_unix_prog);
+BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog);
+BPF_SKEL_FUNCS_RAW(getsockname_unix_prog, getsockname_unix_prog);
+BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog);
+BPF_SKEL_FUNCS_RAW(getsockname4_prog, getsockname_v4_prog);
+BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog);
+BPF_SKEL_FUNCS_RAW(getsockname6_prog, getsockname_v6_prog);
+BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog);
+BPF_SKEL_FUNCS_RAW(getpeername_unix_prog, getpeername_unix_prog);
+BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog);
+BPF_SKEL_FUNCS_RAW(getpeername4_prog, getpeername_v4_prog);
+BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog);
+BPF_SKEL_FUNCS_RAW(getpeername6_prog, getpeername_v6_prog);
+
 static struct sock_addr_test tests[] = {
+	/* bind - system calls */
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: bind (stream)",
+		bind_v4_prog_load,
+		bind_v4_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: bind deny (stream)",
+		bind_v4_deny_prog_load,
+		bind_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: bind (dgram)",
+		bind_v4_prog_load,
+		bind_v4_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: bind deny (dgram)",
+		bind_v4_deny_prog_load,
+		bind_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: load prog with wrong expected attach type",
+		bind_v4_prog_load,
+		bind_v4_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: attach prog with wrong attach type",
+		bind_v4_prog_load_raw,
+		bind_v4_prog_destroy_raw,
+		BPF_CGROUP_INET6_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: bind (stream)",
+		bind_v6_prog_load,
+		bind_v6_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: bind deny (stream)",
+		bind_v6_deny_prog_load,
+		bind_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: bind (dgram)",
+		bind_v6_prog_load,
+		bind_v6_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: bind deny (dgram)",
+		bind_v6_deny_prog_load,
+		bind_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: load prog with wrong expected attach type",
+		bind_v6_prog_load,
+		bind_v6_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: attach prog with wrong attach type",
+		bind_v6_prog_load_raw,
+		bind_v6_prog_destroy_raw,
+		BPF_CGROUP_INET4_BIND,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+
+	/* bind - kernel calls */
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: kernel_bind (stream)",
+		bind_v4_prog_load,
+		bind_v4_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: kernel_bind deny (stream)",
+		bind_v4_deny_prog_load,
+		bind_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: kernel_bind (dgram)",
+		bind_v4_prog_load,
+		bind_v4_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind4: kernel_bind deny (dgram)",
+		bind_v4_deny_prog_load,
+		bind_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: kernel_bind (stream)",
+		bind_v6_prog_load,
+		bind_v6_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: kernel_bind deny (stream)",
+		bind_v6_deny_prog_load,
+		bind_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: kernel_bind (dgram)",
+		bind_v6_prog_load,
+		bind_v6_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_BIND,
+		"bind6: kernel_bind deny (dgram)",
+		bind_v6_deny_prog_load,
+		bind_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_BIND,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SYSCALL_EPERM,
+	},
+
+	/* connect - system calls */
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: connect (stream)",
+		connect_v4_prog_load,
+		connect_v4_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: connect deny (stream)",
+		connect_v4_deny_prog_load,
+		connect_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: connect (dgram)",
+		connect_v4_prog_load,
+		connect_v4_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: connect deny (dgram)",
+		connect_v4_deny_prog_load,
+		connect_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: load prog with wrong expected attach type",
+		connect_v4_prog_load,
+		connect_v4_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: attach prog with wrong attach type",
+		connect_v4_prog_load_raw,
+		connect_v4_prog_destroy_raw,
+		BPF_CGROUP_INET6_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: connect (stream)",
+		connect_v6_prog_load,
+		connect_v6_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: connect deny (stream)",
+		connect_v6_deny_prog_load,
+		connect_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
 	{
 		SOCK_ADDR_TEST_CONNECT,
-		"connect_unix",
+		"connect6: connect (dgram)",
+		connect_v6_prog_load,
+		connect_v6_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: connect deny (dgram)",
+		connect_v6_deny_prog_load,
+		connect_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: load prog with wrong expected attach type",
+		connect_v6_prog_load,
+		connect_v6_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: attach prog with wrong attach type",
+		connect_v6_prog_load_raw,
+		connect_v6_prog_destroy_raw,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect_unix: connect (stream)",
 		connect_unix_prog_load,
 		connect_unix_prog_destroy,
+		BPF_CGROUP_UNIX_CONNECT,
+		&user_ops,
 		AF_UNIX,
 		SOCK_STREAM,
 		SERVUN_ADDRESS,
@@ -176,12 +1019,631 @@ static struct sock_addr_test tests[] = {
 		SERVUN_REWRITE_ADDRESS,
 		0,
 		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect_unix: connect deny (stream)",
+		connect_unix_deny_prog_load,
+		connect_unix_deny_prog_destroy,
+		BPF_CGROUP_UNIX_CONNECT,
+		&user_ops,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect_unix: attach prog with wrong attach type",
+		connect_unix_prog_load_raw,
+		connect_unix_prog_destroy_raw,
+		BPF_CGROUP_INET4_CONNECT,
+		&user_ops,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+
+	/* connect - kernel calls */
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: kernel_connect (stream)",
+		connect_v4_prog_load,
+		connect_v4_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: kernel_connect deny (stream)",
+		connect_v4_deny_prog_load,
+		connect_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: kernel_connect (dgram)",
+		connect_v4_prog_load,
+		connect_v4_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect4: kernel_connect deny (dgram)",
+		connect_v4_deny_prog_load,
+		connect_v4_deny_prog_destroy,
+		BPF_CGROUP_INET4_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: kernel_connect (stream)",
+		connect_v6_prog_load,
+		connect_v6_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: kernel_connect deny (stream)",
+		connect_v6_deny_prog_load,
+		connect_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: kernel_connect (dgram)",
+		connect_v6_prog_load,
+		connect_v6_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect6: kernel_connect deny (dgram)",
+		connect_v6_deny_prog_load,
+		connect_v6_deny_prog_destroy,
+		BPF_CGROUP_INET6_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect_unix: kernel_connect (dgram)",
+		connect_unix_prog_load,
+		connect_unix_prog_destroy,
+		BPF_CGROUP_UNIX_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_CONNECT,
+		"connect_unix: kernel_connect deny (dgram)",
+		connect_unix_deny_prog_load,
+		connect_unix_deny_prog_destroy,
+		BPF_CGROUP_UNIX_CONNECT,
+		&kern_ops_sock_sendmsg,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SYSCALL_EPERM,
+	},
+
+	/* sendmsg - system calls */
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: sendmsg (dgram)",
+		sendmsg_v4_prog_load,
+		sendmsg_v4_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: sendmsg deny (dgram)",
+		sendmsg_v4_deny_prog_load,
+		sendmsg_v4_deny_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: load prog with wrong expected attach type",
+		sendmsg_v4_prog_load,
+		sendmsg_v4_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: attach prog with wrong attach type",
+		sendmsg_v4_prog_load_raw,
+		sendmsg_v4_prog_destroy_raw,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sendmsg (dgram)",
+		sendmsg_v6_prog_load,
+		sendmsg_v6_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sendmsg [::] (BSD'ism) (dgram)",
+		sendmsg_v6_preserve_dst_prog_load,
+		sendmsg_v6_preserve_dst_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		WILDCARD6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_PORT,
+		SRC6_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sendmsg deny (dgram)",
+		sendmsg_v6_deny_prog_load,
+		sendmsg_v6_deny_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sendmsg IPv4-mapped IPv6 (dgram)",
+		sendmsg_v6_v4mapped_prog_load,
+		sendmsg_v6_v4mapped_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_ENOTSUPP,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram)",
+		sendmsg_v6_wildcard_prog_load,
+		sendmsg_v6_wildcard_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: load prog with wrong expected attach type",
+		sendmsg_v6_prog_load,
+		sendmsg_v6_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: attach prog with wrong attach type",
+		sendmsg_v6_prog_load_raw,
+		sendmsg_v6_prog_destroy_raw,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: sendmsg (dgram)",
+		sendmsg_unix_prog_load,
+		sendmsg_unix_prog_destroy,
+		BPF_CGROUP_UNIX_SENDMSG,
+		&user_ops,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: sendmsg deny (dgram)",
+		sendmsg_unix_deny_prog_load,
+		sendmsg_unix_deny_prog_destroy,
+		BPF_CGROUP_UNIX_SENDMSG,
+		&user_ops,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: attach prog with wrong attach type",
+		sendmsg_unix_prog_load_raw,
+		sendmsg_unix_prog_destroy_raw,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&user_ops,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+
+	/* sendmsg - kernel calls (sock_sendmsg) */
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: sock_sendmsg (dgram)",
+		sendmsg_v4_prog_load,
+		sendmsg_v4_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: sock_sendmsg deny (dgram)",
+		sendmsg_v4_deny_prog_load,
+		sendmsg_v4_deny_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sock_sendmsg (dgram)",
+		sendmsg_v6_prog_load,
+		sendmsg_v6_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram)",
+		sendmsg_v6_preserve_dst_prog_load,
+		sendmsg_v6_preserve_dst_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		WILDCARD6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_PORT,
+		SRC6_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: sock_sendmsg deny (dgram)",
+		sendmsg_v6_deny_prog_load,
+		sendmsg_v6_deny_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: sock_sendmsg (dgram)",
+		sendmsg_unix_prog_load,
+		sendmsg_unix_prog_destroy,
+		BPF_CGROUP_UNIX_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: sock_sendmsg deny (dgram)",
+		sendmsg_unix_deny_prog_load,
+		sendmsg_unix_deny_prog_destroy,
+		BPF_CGROUP_UNIX_SENDMSG,
+		&kern_ops_sock_sendmsg,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SYSCALL_EPERM,
+	},
+
+	/* sendmsg - kernel calls (kernel_sendmsg) */
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: kernel_sendmsg (dgram)",
+		sendmsg_v4_prog_load,
+		sendmsg_v4_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&kern_ops_kernel_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg4: kernel_sendmsg deny (dgram)",
+		sendmsg_v4_deny_prog_load,
+		sendmsg_v4_deny_prog_destroy,
+		BPF_CGROUP_UDP4_SENDMSG,
+		&kern_ops_kernel_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: kernel_sendmsg (dgram)",
+		sendmsg_v6_prog_load,
+		sendmsg_v6_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram)",
+		sendmsg_v6_preserve_dst_prog_load,
+		sendmsg_v6_preserve_dst_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		WILDCARD6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_PORT,
+		SRC6_IP,
+		SUCCESS,
 	},
 	{
 		SOCK_ADDR_TEST_SENDMSG,
-		"sendmsg_unix",
+		"sendmsg6: kernel_sendmsg deny (dgram)",
+		sendmsg_v6_deny_prog_load,
+		sendmsg_v6_deny_prog_destroy,
+		BPF_CGROUP_UDP6_SENDMSG,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: sock_sendmsg (dgram)",
 		sendmsg_unix_prog_load,
 		sendmsg_unix_prog_destroy,
+		BPF_CGROUP_UNIX_SENDMSG,
+		&kern_ops_kernel_sendmsg,
 		AF_UNIX,
 		SOCK_DGRAM,
 		SERVUN_ADDRESS,
@@ -189,12 +1651,97 @@ static struct sock_addr_test tests[] = {
 		SERVUN_REWRITE_ADDRESS,
 		0,
 		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_SENDMSG,
+		"sendmsg_unix: kernel_sendmsg deny (dgram)",
+		sendmsg_unix_deny_prog_load,
+		sendmsg_unix_deny_prog_destroy,
+		BPF_CGROUP_UNIX_SENDMSG,
+		&kern_ops_kernel_sendmsg,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SYSCALL_EPERM,
+	},
+
+	/* recvmsg - system calls */
+	{
+		SOCK_ADDR_TEST_RECVMSG,
+		"recvmsg4: recvfrom (dgram)",
+		recvmsg4_prog_load,
+		recvmsg4_prog_destroy,
+		BPF_CGROUP_UDP4_RECVMSG,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_RECVMSG,
+		"recvmsg4: attach prog with wrong attach type",
+		recvmsg4_prog_load_raw,
+		recvmsg4_prog_destroy_raw,
+		BPF_CGROUP_UDP6_RECVMSG,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		ATTACH_REJECT,
 	},
 	{
 		SOCK_ADDR_TEST_RECVMSG,
-		"recvmsg_unix-dgram",
+		"recvmsg6: recvfrom (dgram)",
+		recvmsg6_prog_load,
+		recvmsg6_prog_destroy,
+		BPF_CGROUP_UDP6_RECVMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_RECVMSG,
+		"recvmsg6: attach prog with wrong attach type",
+		recvmsg6_prog_load_raw,
+		recvmsg6_prog_destroy_raw,
+		BPF_CGROUP_UDP4_RECVMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_RECVMSG,
+		"recvmsg_unix: recvfrom (dgram)",
 		recvmsg_unix_prog_load,
 		recvmsg_unix_prog_destroy,
+		BPF_CGROUP_UNIX_RECVMSG,
+		&user_ops,
 		AF_UNIX,
 		SOCK_DGRAM,
 		SERVUN_REWRITE_ADDRESS,
@@ -202,12 +1749,15 @@ static struct sock_addr_test tests[] = {
 		SERVUN_REWRITE_ADDRESS,
 		0,
 		SERVUN_ADDRESS,
+		SUCCESS,
 	},
 	{
 		SOCK_ADDR_TEST_RECVMSG,
-		"recvmsg_unix-stream",
+		"recvmsg_unix: recvfrom (stream)",
 		recvmsg_unix_prog_load,
 		recvmsg_unix_prog_destroy,
+		BPF_CGROUP_UNIX_RECVMSG,
+		&user_ops,
 		AF_UNIX,
 		SOCK_STREAM,
 		SERVUN_REWRITE_ADDRESS,
@@ -215,12 +1765,227 @@ static struct sock_addr_test tests[] = {
 		SERVUN_REWRITE_ADDRESS,
 		0,
 		SERVUN_ADDRESS,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_RECVMSG,
+		"recvmsg_unix: attach prog with wrong attach type",
+		recvmsg_unix_prog_load_raw,
+		recvmsg_unix_prog_destroy_raw,
+		BPF_CGROUP_UDP4_RECVMSG,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		SERVUN_ADDRESS,
+		ATTACH_REJECT,
 	},
+
+	/* getsockname - system calls */
 	{
 		SOCK_ADDR_TEST_GETSOCKNAME,
-		"getsockname_unix",
+		"getsockname4: getsockname (stream)",
+		getsockname_v4_prog_load,
+		getsockname_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname4: getsockname (dgram)",
+		getsockname_v4_prog_load,
+		getsockname_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname4: attach prog with wrong attach type",
+		getsockname_v4_prog_load_raw,
+		getsockname_v4_prog_destroy_raw,
+		BPF_CGROUP_INET6_GETSOCKNAME,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname6: getsockname (stream)",
+		getsockname_v6_prog_load,
+		getsockname_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETSOCKNAME,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname6: getsockname (dgram)",
+		getsockname_v6_prog_load,
+		getsockname_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETSOCKNAME,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname6: attach prog with wrong attach type",
+		getsockname_v6_prog_load_raw,
+		getsockname_v6_prog_destroy_raw,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname_unix: getsockname",
+		getsockname_unix_prog_load,
+		getsockname_unix_prog_destroy,
+		BPF_CGROUP_UNIX_GETSOCKNAME,
+		&user_ops,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname_unix: attach prog with wrong attach type",
+		getsockname_unix_prog_load_raw,
+		getsockname_unix_prog_destroy_raw,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&user_ops,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+
+	/* getsockname - kernel calls */
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname4: kernel_getsockname (stream)",
+		getsockname_v4_prog_load,
+		getsockname_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname4: kernel_getsockname (dgram)",
+		getsockname_v4_prog_load,
+		getsockname_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname6: kernel_getsockname (stream)",
+		getsockname_v6_prog_load,
+		getsockname_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETSOCKNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname6: kernel_getsockname (dgram)",
+		getsockname_v6_prog_load,
+		getsockname_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETSOCKNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETSOCKNAME,
+		"getsockname_unix: kernel_getsockname",
 		getsockname_unix_prog_load,
 		getsockname_unix_prog_destroy,
+		BPF_CGROUP_UNIX_GETSOCKNAME,
+		&kern_ops_kernel_sendmsg,
 		AF_UNIX,
 		SOCK_STREAM,
 		SERVUN_ADDRESS,
@@ -228,12 +1993,113 @@ static struct sock_addr_test tests[] = {
 		SERVUN_REWRITE_ADDRESS,
 		0,
 		NULL,
+		SUCCESS,
+	},
+
+	/* getpeername - system calls */
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername4: getpeername (stream)",
+		getpeername_v4_prog_load,
+		getpeername_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETPEERNAME,
+		&user_ops,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername4: getpeername (dgram)",
+		getpeername_v4_prog_load,
+		getpeername_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETPEERNAME,
+		&user_ops,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
 	},
 	{
 		SOCK_ADDR_TEST_GETPEERNAME,
-		"getpeername_unix",
+		"getpeername4: attach prog with wrong attach type",
+		getpeername_v4_prog_load_raw,
+		getpeername_v4_prog_destroy_raw,
+		BPF_CGROUP_INET6_GETSOCKNAME,
+		&user_ops,
+		AF_UNIX,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername6: getpeername (stream)",
+		getpeername_v6_prog_load,
+		getpeername_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETPEERNAME,
+		&user_ops,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername6: getpeername (dgram)",
+		getpeername_v6_prog_load,
+		getpeername_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETPEERNAME,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername6: attach prog with wrong attach type",
+		getpeername_v6_prog_load_raw,
+		getpeername_v6_prog_destroy_raw,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&user_ops,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername_unix: getpeername",
 		getpeername_unix_prog_load,
 		getpeername_unix_prog_destroy,
+		BPF_CGROUP_UNIX_GETPEERNAME,
+		&user_ops,
 		AF_UNIX,
 		SOCK_STREAM,
 		SERVUN_ADDRESS,
@@ -241,6 +2107,105 @@ static struct sock_addr_test tests[] = {
 		SERVUN_REWRITE_ADDRESS,
 		0,
 		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername_unix: attach prog with wrong attach type",
+		getpeername_unix_prog_load_raw,
+		getpeername_unix_prog_destroy_raw,
+		BPF_CGROUP_INET4_GETSOCKNAME,
+		&user_ops,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+
+	/* getpeername - kernel calls */
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername4: kernel_getpeername (stream)",
+		getpeername_v4_prog_load,
+		getpeername_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETPEERNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername4: kernel_getpeername (dgram)",
+		getpeername_v4_prog_load,
+		getpeername_v4_prog_destroy,
+		BPF_CGROUP_INET4_GETPEERNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SERV4_IP,
+		SERV4_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername6: kernel_getpeername (stream)",
+		getpeername_v6_prog_load,
+		getpeername_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETPEERNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername6: kernel_getpeername (dgram)",
+		getpeername_v6_prog_load,
+		getpeername_v6_prog_destroy,
+		BPF_CGROUP_INET6_GETPEERNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SERV6_IP,
+		SERV6_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		SOCK_ADDR_TEST_GETPEERNAME,
+		"getpeername_unix: kernel_getpeername",
+		getpeername_unix_prog_load,
+		getpeername_unix_prog_destroy,
+		BPF_CGROUP_UNIX_GETPEERNAME,
+		&kern_ops_kernel_sendmsg,
+		AF_UNIX,
+		SOCK_STREAM,
+		SERVUN_ADDRESS,
+		0,
+		SERVUN_REWRITE_ADDRESS,
+		0,
+		NULL,
+		SUCCESS,
 	},
 };
 
@@ -294,28 +2259,40 @@ static int cmp_sock_addr(info_fn fn, int sock1,
 	return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port);
 }
 
-static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2,
-			  socklen_t addr2_len, bool cmp_port)
+static int load_sock_addr_kern(void)
 {
-	return cmp_sock_addr(getsockname, sock1, addr2, addr2_len, cmp_port);
+	int err;
+
+	skel = sock_addr_kern__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel"))
+		goto err;
+
+	err = 0;
+	goto out;
+err:
+	err = -1;
+out:
+	return err;
 }
 
-static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2,
-			 socklen_t addr2_len, bool cmp_port)
+static void unload_sock_addr_kern(void)
 {
-	return cmp_sock_addr(getpeername, sock1, addr2, addr2_len, cmp_port);
+	sock_addr_kern__destroy(skel);
 }
 
-static void test_bind(struct sock_addr_test *test)
+static int test_bind(struct sock_addr_test *test)
 {
 	struct sockaddr_storage expected_addr;
 	socklen_t expected_addr_len = sizeof(struct sockaddr_storage);
 	int serv = -1, client = -1, err;
 
-	serv = start_server(test->socket_family, test->socket_type,
-			    test->requested_addr, test->requested_port, 0);
-	if (!ASSERT_GE(serv, 0, "start_server"))
-		goto cleanup;
+	serv = test->ops->start_server(test->socket_family, test->socket_type,
+				       test->requested_addr,
+				       test->requested_port, 0);
+	if (serv < 0) {
+		err = errno;
+		goto err;
+	}
 
 	err = make_sockaddr(test->socket_family,
 			    test->expected_addr, test->expected_port,
@@ -323,23 +2300,28 @@ static void test_bind(struct sock_addr_test *test)
 	if (!ASSERT_EQ(err, 0, "make_sockaddr"))
 		goto cleanup;
 
-	err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true);
+	err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr,
+			    expected_addr_len, true);
 	if (!ASSERT_EQ(err, 0, "cmp_local_addr"))
 		goto cleanup;
 
 	/* Try to connect to server just in case */
-	client = connect_to_addr(&expected_addr, expected_addr_len, test->socket_type);
+	client = connect_to_addr(test->socket_type, &expected_addr, expected_addr_len, NULL);
 	if (!ASSERT_GE(client, 0, "connect_to_addr"))
 		goto cleanup;
 
 cleanup:
+	err = 0;
+err:
 	if (client != -1)
 		close(client);
 	if (serv != -1)
-		close(serv);
+		test->ops->close(serv);
+
+	return err;
 }
 
-static void test_connect(struct sock_addr_test *test)
+static int test_connect(struct sock_addr_test *test)
 {
 	struct sockaddr_storage addr, expected_addr, expected_src_addr;
 	socklen_t addr_len = sizeof(struct sockaddr_storage),
@@ -357,9 +2339,12 @@ static void test_connect(struct sock_addr_test *test)
 	if (!ASSERT_EQ(err, 0, "make_sockaddr"))
 		goto cleanup;
 
-	client = connect_to_addr(&addr, addr_len, test->socket_type);
-	if (!ASSERT_GE(client, 0, "connect_to_addr"))
-		goto cleanup;
+	client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len,
+					    NULL);
+	if (client < 0) {
+		err = errno;
+		goto err;
+	}
 
 	err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port,
 			    &expected_addr, &expected_addr_len);
@@ -373,29 +2358,34 @@ static void test_connect(struct sock_addr_test *test)
 			goto cleanup;
 	}
 
-	err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true);
+	err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr,
+			    expected_addr_len, true);
 	if (!ASSERT_EQ(err, 0, "cmp_peer_addr"))
 		goto cleanup;
 
 	if (test->expected_src_addr) {
-		err = cmp_local_addr(client, &expected_src_addr, expected_src_addr_len, false);
+		err = cmp_sock_addr(test->ops->getsockname, client,
+				    &expected_src_addr, expected_src_addr_len,
+				    false);
 		if (!ASSERT_EQ(err, 0, "cmp_local_addr"))
 			goto cleanup;
 	}
 cleanup:
+	err = 0;
+err:
 	if (client != -1)
-		close(client);
+		test->ops->close(client);
 	if (serv != -1)
 		close(serv);
+
+	return err;
 }
 
-static void test_xmsg(struct sock_addr_test *test)
+static int test_xmsg(struct sock_addr_test *test)
 {
 	struct sockaddr_storage addr, src_addr;
 	socklen_t addr_len = sizeof(struct sockaddr_storage),
 		  src_addr_len = sizeof(struct sockaddr_storage);
-	struct msghdr hdr;
-	struct iovec iov;
 	char data = 'a';
 	int serv = -1, client = -1, err;
 
@@ -408,7 +2398,7 @@ static void test_xmsg(struct sock_addr_test *test)
 	if (!ASSERT_GE(serv, 0, "start_server"))
 		goto cleanup;
 
-	client = socket(test->socket_family, test->socket_type, 0);
+	client = test->ops->socket(test->socket_family, test->socket_type, 0);
 	if (!ASSERT_GE(client, 0, "socket"))
 		goto cleanup;
 
@@ -418,7 +2408,8 @@ static void test_xmsg(struct sock_addr_test *test)
 		if (!ASSERT_EQ(err, 0, "make_sockaddr"))
 			goto cleanup;
 
-		err = bind(client, (const struct sockaddr *) &src_addr, src_addr_len);
+		err = test->ops->bind(client, (struct sockaddr *)&src_addr,
+				      src_addr_len);
 		if (!ASSERT_OK(err, "bind"))
 			goto cleanup;
 	}
@@ -429,17 +2420,13 @@ static void test_xmsg(struct sock_addr_test *test)
 		goto cleanup;
 
 	if (test->socket_type == SOCK_DGRAM) {
-		memset(&iov, 0, sizeof(iov));
-		iov.iov_base = &data;
-		iov.iov_len = sizeof(data);
-
-		memset(&hdr, 0, sizeof(hdr));
-		hdr.msg_name = (void *)&addr;
-		hdr.msg_namelen = addr_len;
-		hdr.msg_iov = &iov;
-		hdr.msg_iovlen = 1;
+		err = test->ops->sendmsg(client, (struct sockaddr *)&addr,
+					 addr_len, &data, sizeof(data));
+		if (err < 0) {
+			err = errno;
+			goto err;
+		}
 
-		err = sendmsg(client, &hdr, 0);
 		if (!ASSERT_EQ(err, sizeof(data), "sendmsg"))
 			goto cleanup;
 	} else {
@@ -489,19 +2476,23 @@ static void test_xmsg(struct sock_addr_test *test)
 	}
 
 cleanup:
+	err = 0;
+err:
 	if (client != -1)
-		close(client);
+		test->ops->close(client);
 	if (serv != -1)
 		close(serv);
+
+	return err;
 }
 
-static void test_getsockname(struct sock_addr_test *test)
+static int test_getsockname(struct sock_addr_test *test)
 {
 	struct sockaddr_storage expected_addr;
 	socklen_t expected_addr_len = sizeof(struct sockaddr_storage);
 	int serv = -1, err;
 
-	serv = start_server(test->socket_family, test->socket_type,
+	serv = test->ops->start_server(test->socket_family, test->socket_type,
 			    test->requested_addr, test->requested_port, 0);
 	if (!ASSERT_GE(serv, 0, "start_server"))
 		goto cleanup;
@@ -512,16 +2503,18 @@ static void test_getsockname(struct sock_addr_test *test)
 	if (!ASSERT_EQ(err, 0, "make_sockaddr"))
 		goto cleanup;
 
-	err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true);
+	err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, expected_addr_len, true);
 	if (!ASSERT_EQ(err, 0, "cmp_local_addr"))
 		goto cleanup;
 
 cleanup:
 	if (serv != -1)
-		close(serv);
+		test->ops->close(serv);
+
+	return 0;
 }
 
-static void test_getpeername(struct sock_addr_test *test)
+static int test_getpeername(struct sock_addr_test *test)
 {
 	struct sockaddr_storage addr, expected_addr;
 	socklen_t addr_len = sizeof(struct sockaddr_storage),
@@ -538,7 +2531,8 @@ static void test_getpeername(struct sock_addr_test *test)
 	if (!ASSERT_EQ(err, 0, "make_sockaddr"))
 		goto cleanup;
 
-	client = connect_to_addr(&addr, addr_len, test->socket_type);
+	client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len,
+					    NULL);
 	if (!ASSERT_GE(client, 0, "connect_to_addr"))
 		goto cleanup;
 
@@ -547,19 +2541,58 @@ static void test_getpeername(struct sock_addr_test *test)
 	if (!ASSERT_EQ(err, 0, "make_sockaddr"))
 		goto cleanup;
 
-	err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true);
+	err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr,
+			    expected_addr_len, true);
 	if (!ASSERT_EQ(err, 0, "cmp_peer_addr"))
 		goto cleanup;
 
 cleanup:
 	if (client != -1)
-		close(client);
+		test->ops->close(client);
 	if (serv != -1)
 		close(serv);
+
+	return 0;
+}
+
+static int setup_test_env(struct nstoken **tok)
+{
+	int err;
+
+	SYS_NOFAIL("ip netns delete %s", TEST_NS);
+	SYS(fail, "ip netns add %s", TEST_NS);
+	*tok = open_netns(TEST_NS);
+	if (!ASSERT_OK_PTR(*tok, "netns token"))
+		goto fail;
+
+	SYS(fail, "ip link add dev %s1 type veth peer name %s2", TEST_IF_PREFIX,
+	    TEST_IF_PREFIX);
+	SYS(fail, "ip link set lo up");
+	SYS(fail, "ip link set %s1 up", TEST_IF_PREFIX);
+	SYS(fail, "ip link set %s2 up", TEST_IF_PREFIX);
+	SYS(fail, "ip -4 addr add %s/8 dev %s1", TEST_IPV4, TEST_IF_PREFIX);
+	SYS(fail, "ip -6 addr add %s/128 nodad dev %s1", TEST_IPV6, TEST_IF_PREFIX);
+
+	err = 0;
+	goto out;
+fail:
+	err = -1;
+	close_netns(*tok);
+	*tok = NULL;
+	SYS_NOFAIL("ip netns delete %s", TEST_NS);
+out:
+	return err;
+}
+
+static void cleanup_test_env(struct nstoken *tok)
+{
+	close_netns(tok);
+	SYS_NOFAIL("ip netns delete %s", TEST_NS);
 }
 
 void test_sock_addr(void)
 {
+	struct nstoken *tok = NULL;
 	int cgroup_fd = -1;
 	void *skel;
 
@@ -567,13 +2600,22 @@ void test_sock_addr(void)
 	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
 		goto cleanup;
 
+	if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env"))
+		goto cleanup;
+
+	if (!ASSERT_OK(load_sock_addr_kern(), "load_sock_addr_kern"))
+		goto cleanup;
+
 	for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) {
 		struct sock_addr_test *test = &tests[i];
+		int err;
 
 		if (!test__start_subtest(test->name))
 			continue;
 
-		skel = test->loadfn(cgroup_fd);
+		skel = test->loadfn(cgroup_fd, test->attach_type,
+				    test->expected_result == LOAD_REJECT ||
+					test->expected_result == ATTACH_REJECT);
 		if (!skel)
 			continue;
 
@@ -583,30 +2625,39 @@ void test_sock_addr(void)
 		 * the future.
 		 */
 		case SOCK_ADDR_TEST_BIND:
-			test_bind(test);
+			err = test_bind(test);
 			break;
 		case SOCK_ADDR_TEST_CONNECT:
-			test_connect(test);
+			err = test_connect(test);
 			break;
 		case SOCK_ADDR_TEST_SENDMSG:
 		case SOCK_ADDR_TEST_RECVMSG:
-			test_xmsg(test);
+			err = test_xmsg(test);
 			break;
 		case SOCK_ADDR_TEST_GETSOCKNAME:
-			test_getsockname(test);
+			err = test_getsockname(test);
 			break;
 		case SOCK_ADDR_TEST_GETPEERNAME:
-			test_getpeername(test);
+			err = test_getpeername(test);
 			break;
 		default:
 			ASSERT_TRUE(false, "Unknown sock addr test type");
 			break;
 		}
 
+		if (test->expected_result == SYSCALL_EPERM)
+			ASSERT_EQ(err, EPERM, "socket operation returns EPERM");
+		else if (test->expected_result == SYSCALL_ENOTSUPP)
+			ASSERT_EQ(err, ENOTSUPP, "socket operation returns ENOTSUPP");
+		else if (test->expected_result == SUCCESS)
+			ASSERT_OK(err, "socket operation succeeds");
+
 		test->destroyfn(skel);
 	}
 
 cleanup:
+	unload_sock_addr_kern();
+	cleanup_test_env(tok);
 	if (cgroup_fd >= 0)
 		close(cgroup_fd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 77e26ecffa9d..1337153eb0ad 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -131,6 +131,65 @@ out:
 	test_skmsg_load_helpers__destroy(skel);
 }
 
+static void test_skmsg_helpers_with_link(enum bpf_map_type map_type)
+{
+	struct bpf_program *prog, *prog_clone, *prog_clone2;
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts);
+	struct test_skmsg_load_helpers *skel;
+	struct bpf_link *link, *link2;
+	int err, map;
+
+	skel = test_skmsg_load_helpers__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load"))
+		return;
+
+	prog = skel->progs.prog_msg_verdict;
+	prog_clone = skel->progs.prog_msg_verdict_clone;
+	prog_clone2 = skel->progs.prog_msg_verdict_clone2;
+	map = bpf_map__fd(skel->maps.sock_map);
+
+	link = bpf_program__attach_sockmap(prog, map);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap"))
+		goto out;
+
+	/* Fail since bpf_link for the same prog has been created. */
+	err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_MSG_VERDICT, 0);
+	if (!ASSERT_ERR(err, "bpf_prog_attach"))
+		goto out;
+
+	/* Fail since bpf_link for the same prog type has been created. */
+	link2 = bpf_program__attach_sockmap(prog_clone, map);
+	if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) {
+		bpf_link__detach(link2);
+		goto out;
+	}
+
+	err = bpf_link__update_program(link, prog_clone);
+	if (!ASSERT_OK(err, "bpf_link__update_program"))
+		goto out;
+
+	/* Fail since a prog with different type attempts to do update. */
+	err = bpf_link__update_program(link, skel->progs.prog_skb_verdict);
+	if (!ASSERT_ERR(err, "bpf_link__update_program"))
+		goto out;
+
+	/* Fail since the old prog does not match the one in the kernel. */
+	opts.old_prog_fd = bpf_program__fd(prog_clone2);
+	opts.flags = BPF_F_REPLACE;
+	err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts);
+	if (!ASSERT_ERR(err, "bpf_link_update"))
+		goto out;
+
+	opts.old_prog_fd = bpf_program__fd(prog_clone);
+	opts.flags = BPF_F_REPLACE;
+	err = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), &opts);
+	if (!ASSERT_OK(err, "bpf_link_update"))
+		goto out;
+out:
+	bpf_link__detach(link);
+	test_skmsg_load_helpers__destroy(skel);
+}
+
 static void test_sockmap_update(enum bpf_map_type map_type)
 {
 	int err, prog, src;
@@ -298,6 +357,40 @@ out:
 	test_sockmap_skb_verdict_attach__destroy(skel);
 }
 
+static void test_sockmap_skb_verdict_attach_with_link(void)
+{
+	struct test_sockmap_skb_verdict_attach *skel;
+	struct bpf_program *prog;
+	struct bpf_link *link;
+	int err, map;
+
+	skel = test_sockmap_skb_verdict_attach__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+	prog = skel->progs.prog_skb_verdict;
+	map = bpf_map__fd(skel->maps.sock_map);
+	link = bpf_program__attach_sockmap(prog, map);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap"))
+		goto out;
+
+	bpf_link__detach(link);
+
+	err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
+		goto out;
+
+	/* Fail since attaching with the same prog/map has been done. */
+	link = bpf_program__attach_sockmap(prog, map);
+	if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap"))
+		bpf_link__detach(link);
+
+	err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT);
+	if (!ASSERT_OK(err, "bpf_prog_detach2"))
+		goto out;
+out:
+	test_sockmap_skb_verdict_attach__destroy(skel);
+}
+
 static __u32 query_prog_id(int prog_fd)
 {
 	struct bpf_prog_info info = {};
@@ -475,30 +568,19 @@ out:
 		test_sockmap_drop_prog__destroy(drop);
 }
 
-static void test_sockmap_skb_verdict_peek(void)
+static void test_sockmap_skb_verdict_peek_helper(int map)
 {
-	int err, map, verdict, s, c1, p1, zero = 0, sent, recvd, avail;
-	struct test_sockmap_pass_prog *pass;
+	int err, s, c1, p1, zero = 0, sent, recvd, avail;
 	char snd[256] = "0123456789";
 	char rcv[256] = "0";
 
-	pass = test_sockmap_pass_prog__open_and_load();
-	if (!ASSERT_OK_PTR(pass, "open_and_load"))
-		return;
-	verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
-	map = bpf_map__fd(pass->maps.sock_map_rx);
-
-	err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
-	if (!ASSERT_OK(err, "bpf_prog_attach"))
-		goto out;
-
 	s = socket_loopback(AF_INET, SOCK_STREAM);
 	if (!ASSERT_GT(s, -1, "socket_loopback(s)"))
-		goto out;
+		return;
 
 	err = create_pair(s, AF_INET, SOCK_STREAM, &c1, &p1);
 	if (!ASSERT_OK(err, "create_pairs(s)"))
-		goto out;
+		return;
 
 	err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST);
 	if (!ASSERT_OK(err, "bpf_map_update_elem(c1)"))
@@ -520,7 +602,58 @@ static void test_sockmap_skb_verdict_peek(void)
 out_close:
 	close(c1);
 	close(p1);
+}
+
+static void test_sockmap_skb_verdict_peek(void)
+{
+	struct test_sockmap_pass_prog *pass;
+	int err, map, verdict;
+
+	pass = test_sockmap_pass_prog__open_and_load();
+	if (!ASSERT_OK_PTR(pass, "open_and_load"))
+		return;
+	verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
+	map = bpf_map__fd(pass->maps.sock_map_rx);
+
+	err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
+		goto out;
+
+	test_sockmap_skb_verdict_peek_helper(map);
+
+out:
+	test_sockmap_pass_prog__destroy(pass);
+}
+
+static void test_sockmap_skb_verdict_peek_with_link(void)
+{
+	struct test_sockmap_pass_prog *pass;
+	struct bpf_program *prog;
+	struct bpf_link *link;
+	int err, map;
+
+	pass = test_sockmap_pass_prog__open_and_load();
+	if (!ASSERT_OK_PTR(pass, "open_and_load"))
+		return;
+	prog = pass->progs.prog_skb_verdict;
+	map = bpf_map__fd(pass->maps.sock_map_rx);
+	link = bpf_program__attach_sockmap(prog, map);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap"))
+		goto out;
+
+	err = bpf_link__update_program(link, pass->progs.prog_skb_verdict_clone);
+	if (!ASSERT_OK(err, "bpf_link__update_program"))
+		goto out;
+
+	/* Fail since a prog with different attach type attempts to do update. */
+	err = bpf_link__update_program(link, pass->progs.prog_skb_parser);
+	if (!ASSERT_ERR(err, "bpf_link__update_program"))
+		goto out;
+
+	test_sockmap_skb_verdict_peek_helper(map);
+	ASSERT_EQ(pass->bss->clone_called, 1, "clone_called");
 out:
+	bpf_link__detach(link);
 	test_sockmap_pass_prog__destroy(pass);
 }
 
@@ -788,6 +921,8 @@ void test_sockmap_basic(void)
 		test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT,
 						BPF_SK_SKB_VERDICT);
 	}
+	if (test__start_subtest("sockmap skb_verdict attach_with_link"))
+		test_sockmap_skb_verdict_attach_with_link();
 	if (test__start_subtest("sockmap msg_verdict progs query"))
 		test_sockmap_progs_query(BPF_SK_MSG_VERDICT);
 	if (test__start_subtest("sockmap stream_parser progs query"))
@@ -804,6 +939,8 @@ void test_sockmap_basic(void)
 		test_sockmap_skb_verdict_fionread(false);
 	if (test__start_subtest("sockmap skb_verdict msg_f_peek"))
 		test_sockmap_skb_verdict_peek();
+	if (test__start_subtest("sockmap skb_verdict msg_f_peek with link"))
+		test_sockmap_skb_verdict_peek_with_link();
 	if (test__start_subtest("sockmap unconnected af_unix"))
 		test_sockmap_unconnected_unix();
 	if (test__start_subtest("sockmap one socket to many map entries"))
@@ -812,4 +949,8 @@ void test_sockmap_basic(void)
 		test_sockmap_many_maps();
 	if (test__start_subtest("sockmap same socket replace"))
 		test_sockmap_same_sock();
+	if (test__start_subtest("sockmap sk_msg attach sockmap helpers with link"))
+		test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKMAP);
+	if (test__start_subtest("sockhash sk_msg attach sockhash helpers with link"))
+		test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index a92807bfcd13..e91b59366030 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -767,6 +767,24 @@ static void test_msg_redir_to_connected(struct test_sockmap_listen *skel,
 	xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT);
 }
 
+static void test_msg_redir_to_connected_with_link(struct test_sockmap_listen *skel,
+						  struct bpf_map *inner_map, int family,
+						  int sotype)
+{
+	int prog_msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict);
+	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map = bpf_map__fd(inner_map);
+	int link_fd;
+
+	link_fd = bpf_link_create(prog_msg_verdict, sock_map, BPF_SK_MSG_VERDICT, NULL);
+	if (!ASSERT_GE(link_fd, 0, "bpf_link_create"))
+		return;
+
+	redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS);
+
+	close(link_fd);
+}
+
 static void redir_to_listening(int family, int sotype, int sock_mapfd,
 			       int verd_mapfd, enum redir_mode mode)
 {
@@ -869,6 +887,24 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel,
 	xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT);
 }
 
+static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *skel,
+						  struct bpf_map *inner_map, int family,
+						  int sotype)
+{
+	struct bpf_program *verdict = skel->progs.prog_msg_verdict;
+	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map = bpf_map__fd(inner_map);
+	struct bpf_link *link;
+
+	link = bpf_program__attach_sockmap(verdict, sock_map);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap"))
+		return;
+
+	redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS);
+
+	bpf_link__detach(link);
+}
+
 static void redir_partial(int family, int sotype, int sock_map, int parser_map)
 {
 	int s, c0 = -1, c1 = -1, p0 = -1, p1 = -1;
@@ -1316,7 +1352,9 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
 		TEST(test_skb_redir_to_listening),
 		TEST(test_skb_redir_partial),
 		TEST(test_msg_redir_to_connected),
+		TEST(test_msg_redir_to_connected_with_link),
 		TEST(test_msg_redir_to_listening),
+		TEST(test_msg_redir_to_listening_with_link),
 	};
 	const char *family_name, *map_name;
 	const struct redir_test *t;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c
index 5a4491d4edfe..eaac83a7f388 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c
@@ -24,6 +24,7 @@ enum sockopt_test_error {
 static struct sockopt_test {
 	const char			*descr;
 	const struct bpf_insn		insns[64];
+	enum bpf_prog_type		prog_type;
 	enum bpf_attach_type		attach_type;
 	enum bpf_attach_type		expected_attach_type;
 
@@ -928,9 +929,40 @@ static struct sockopt_test {
 
 		.error = EPERM_SETSOCKOPT,
 	},
+
+	/* ==================== prog_type ====================  */
+
+	{
+		.descr = "can attach only BPF_CGROUP_SETSOCKOP",
+		.insns = {
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+
+		},
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+		.attach_type = BPF_CGROUP_SETSOCKOPT,
+		.expected_attach_type = 0,
+		.error = DENY_ATTACH,
+	},
+
+	{
+		.descr = "can attach only BPF_CGROUP_GETSOCKOP",
+		.insns = {
+			/* return 1 */
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+
+		},
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+		.attach_type = BPF_CGROUP_GETSOCKOPT,
+		.expected_attach_type = 0,
+		.error = DENY_ATTACH,
+	},
 };
 
 static int load_prog(const struct bpf_insn *insns,
+		     enum bpf_prog_type prog_type,
 		     enum bpf_attach_type expected_attach_type)
 {
 	LIBBPF_OPTS(bpf_prog_load_opts, opts,
@@ -947,7 +979,7 @@ static int load_prog(const struct bpf_insn *insns,
 	}
 	insns_cnt++;
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCKOPT, NULL, "GPL", insns, insns_cnt, &opts);
+	fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts);
 	if (verbose && fd < 0)
 		fprintf(stderr, "%s\n", bpf_log_buf);
 
@@ -1036,13 +1068,18 @@ static int call_getsockopt(bool use_io_uring, int fd, int level, int optname,
 	return getsockopt(fd, level, optname, optval, optlen);
 }
 
-static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring)
+static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring,
+		    bool use_link)
 {
-	int sock_fd, err, prog_fd;
+	int prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+	int sock_fd, err, prog_fd, link_fd = -1;
 	void *optval = NULL;
 	int ret = 0;
 
-	prog_fd = load_prog(test->insns, test->expected_attach_type);
+	if (test->prog_type)
+		prog_type = test->prog_type;
+
+	prog_fd = load_prog(test->insns, prog_type, test->expected_attach_type);
 	if (prog_fd < 0) {
 		if (test->error == DENY_LOAD)
 			return 0;
@@ -1051,7 +1088,12 @@ static int run_test(int cgroup_fd, struct sockopt_test *test, bool use_io_uring)
 		return -1;
 	}
 
-	err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0);
+	if (use_link) {
+		err = bpf_link_create(prog_fd, cgroup_fd, test->attach_type, NULL);
+		link_fd = err;
+	} else {
+		err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0);
+	}
 	if (err < 0) {
 		if (test->error == DENY_ATTACH)
 			goto close_prog_fd;
@@ -1142,7 +1184,12 @@ free_optval:
 close_sock_fd:
 	close(sock_fd);
 detach_prog:
-	bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type);
+	if (use_link) {
+		if (link_fd >= 0)
+			close(link_fd);
+	} else {
+		bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type);
+	}
 close_prog_fd:
 	close(prog_fd);
 	return ret;
@@ -1160,10 +1207,12 @@ void test_sockopt(void)
 		if (!test__start_subtest(tests[i].descr))
 			continue;
 
-		ASSERT_OK(run_test(cgroup_fd, &tests[i], false),
+		ASSERT_OK(run_test(cgroup_fd, &tests[i], false, false),
+			  tests[i].descr);
+		ASSERT_OK(run_test(cgroup_fd, &tests[i], false, true),
 			  tests[i].descr);
 		if (tests[i].io_uring_support)
-			ASSERT_OK(run_test(cgroup_fd, &tests[i], true),
+			ASSERT_OK(run_test(cgroup_fd, &tests[i], true, false),
 				  tests[i].descr);
 	}
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index 917f486db826..1d3a20f01b60 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
 #include "cgroup_helpers.h"
+#include "network_helpers.h"
 
 #include "sockopt_inherit.skel.h"
 
@@ -9,35 +10,6 @@
 #define CUSTOM_INHERIT2			1
 #define CUSTOM_LISTENER			2
 
-static int connect_to_server(int server_fd)
-{
-	struct sockaddr_storage addr;
-	socklen_t len = sizeof(addr);
-	int fd;
-
-	fd = socket(AF_INET, SOCK_STREAM, 0);
-	if (fd < 0) {
-		log_err("Failed to create client socket");
-		return -1;
-	}
-
-	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
-		log_err("Failed to get server addr");
-		goto out;
-	}
-
-	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
-		log_err("Fail to connect to server");
-		goto out;
-	}
-
-	return fd;
-
-out:
-	close(fd);
-	return -1;
-}
-
 static int verify_sockopt(int fd, int optname, const char *msg, char expected)
 {
 	socklen_t optlen = 1;
@@ -98,47 +70,36 @@ static void *server_thread(void *arg)
 	return (void *)(long)err;
 }
 
-static int start_server(void)
+static int custom_cb(int fd, const struct post_socket_opts *opts)
 {
-	struct sockaddr_in addr = {
-		.sin_family = AF_INET,
-		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
-	};
 	char buf;
 	int err;
-	int fd;
 	int i;
 
-	fd = socket(AF_INET, SOCK_STREAM, 0);
-	if (fd < 0) {
-		log_err("Failed to create server socket");
-		return -1;
-	}
-
 	for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
 		buf = 0x01;
 		err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
 		if (err) {
 			log_err("Failed to call setsockopt(%d)", i);
-			close(fd);
 			return -1;
 		}
 	}
 
-	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
-		log_err("Failed to bind socket");
-		close(fd);
-		return -1;
-	}
-
-	return fd;
+	return 0;
 }
 
 static void run_test(int cgroup_fd)
 {
 	struct bpf_link *link_getsockopt = NULL;
 	struct bpf_link *link_setsockopt = NULL;
+	struct network_helper_opts opts = {
+		.post_socket_cb = custom_cb,
+	};
 	int server_fd = -1, client_fd;
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
 	struct sockopt_inherit *obj;
 	void *server_err;
 	pthread_t tid;
@@ -160,7 +121,8 @@ static void run_test(int cgroup_fd)
 	if (!ASSERT_OK_PTR(link_setsockopt, "cg-attach-setsockopt"))
 		goto close_bpf_object;
 
-	server_fd = start_server();
+	server_fd = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr,
+				      sizeof(addr), &opts);
 	if (!ASSERT_GE(server_fd, 0, "start_server"))
 		goto close_bpf_object;
 
@@ -173,7 +135,7 @@ static void run_test(int cgroup_fd)
 	pthread_cond_wait(&server_started, &server_started_mtx);
 	pthread_mutex_unlock(&server_started_mtx);
 
-	client_fd = connect_to_server(server_fd);
+	client_fd = connect_to_fd(server_fd, 0);
 	if (!ASSERT_GE(client_fd, 0, "connect_to_server"))
 		goto close_server_fd;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
index 5db9eec24b5b..0832fd787457 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
@@ -35,7 +35,7 @@ retry:
 	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
 			 0 /* cpu 0 */, -1 /* group id */,
 			 0 /* flags */);
-	if (pmu_fd < 0 && errno == ENOENT) {
+	if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
 		printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__);
 		test__skip();
 		goto cleanup;
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index dbe06aeaa2b2..b1073d36d77a 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -530,7 +530,7 @@ static int wait_netstamp_needed_key(void)
 	__u64 tstamp = 0;
 
 	nstoken = open_netns(NS_DST);
-	if (!nstoken)
+	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
 		return -1;
 
 	srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0);
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
index 8fe84da1b9b4..f2b99d95d916 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
@@ -10,6 +10,9 @@ struct tcp_rtt_storage {
 	__u32 delivered;
 	__u32 delivered_ce;
 	__u32 icsk_retransmits;
+
+	__u32 mrtt_us;	/* args[0] */
+	__u32 srtt;	/* args[1] */
 };
 
 static void send_byte(int fd)
@@ -83,6 +86,17 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
 		err++;
 	}
 
+	/* Precise values of mrtt and srtt are unavailable, just make sure they are nonzero */
+	if (val.mrtt_us == 0) {
+		log_err("%s: unexpected bpf_tcp_sock.args[0] (mrtt_us) %u == 0", msg, val.mrtt_us);
+		err++;
+	}
+
+	if (val.srtt == 0) {
+		log_err("%s: unexpected bpf_tcp_sock.args[1] (srtt) %u == 0", msg, val.srtt);
+		err++;
+	}
+
 	return err;
 }
 
diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
index ee5372c7f2c7..29e183a80f49 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
@@ -4,6 +4,8 @@
 #include <time.h>
 
 #include "struct_ops_module.skel.h"
+#include "struct_ops_nulled_out_cb.skel.h"
+#include "struct_ops_forgotten_cb.skel.h"
 
 static void check_map_info(struct bpf_map_info *info)
 {
@@ -66,6 +68,7 @@ static void test_struct_ops_load(void)
 	 * auto-loading, or it will fail to load.
 	 */
 	bpf_program__set_autoload(skel->progs.test_2, false);
+	bpf_map__set_autocreate(skel->maps.testmod_zeroed, false);
 
 	err = struct_ops_module__load(skel);
 	if (!ASSERT_OK(err, "struct_ops_module_load"))
@@ -93,9 +96,163 @@ cleanup:
 	struct_ops_module__destroy(skel);
 }
 
+static void test_struct_ops_not_zeroed(void)
+{
+	struct struct_ops_module *skel;
+	int err;
+
+	/* zeroed is 0, and zeroed_op is null */
+	skel = struct_ops_module__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_module_open"))
+		return;
+
+	skel->struct_ops.testmod_zeroed->zeroed = 0;
+	/* zeroed_op prog should be not loaded automatically now */
+	skel->struct_ops.testmod_zeroed->zeroed_op = NULL;
+
+	err = struct_ops_module__load(skel);
+	ASSERT_OK(err, "struct_ops_module_load");
+
+	struct_ops_module__destroy(skel);
+
+	/* zeroed is not 0 */
+	skel = struct_ops_module__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed"))
+		return;
+
+	/* libbpf should reject the testmod_zeroed since struct
+	 * bpf_testmod_ops in the kernel has no "zeroed" field and the
+	 * value of "zeroed" is non-zero.
+	 */
+	skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef;
+	skel->struct_ops.testmod_zeroed->zeroed_op = NULL;
+	err = struct_ops_module__load(skel);
+	ASSERT_ERR(err, "struct_ops_module_load_not_zeroed");
+
+	struct_ops_module__destroy(skel);
+
+	/* zeroed_op is not null */
+	skel = struct_ops_module__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_module_open_not_zeroed_op"))
+		return;
+
+	/* libbpf should reject the testmod_zeroed since the value of its
+	 * "zeroed_op" is not null.
+	 */
+	skel->struct_ops.testmod_zeroed->zeroed_op = skel->progs.test_3;
+	err = struct_ops_module__load(skel);
+	ASSERT_ERR(err, "struct_ops_module_load_not_zeroed_op");
+
+	struct_ops_module__destroy(skel);
+}
+
+/* The signature of an implementation might not match the signature of the
+ * function pointer prototype defined in the BPF program. This mismatch
+ * should be allowed as long as the behavior of the operator program
+ * adheres to the signature in the kernel. Libbpf should not enforce the
+ * signature; rather, let the kernel verifier handle the enforcement.
+ */
+static void test_struct_ops_incompatible(void)
+{
+	struct struct_ops_module *skel;
+	struct bpf_link *link;
+	int err;
+
+	skel = struct_ops_module__open();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_module_open"))
+		return;
+
+	bpf_map__set_autocreate(skel->maps.testmod_zeroed, false);
+
+	err = struct_ops_module__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible);
+	if (ASSERT_OK_PTR(link, "attach_struct_ops"))
+		bpf_link__destroy(link);
+
+cleanup:
+	struct_ops_module__destroy(skel);
+}
+
+/* validate that it's ok to "turn off" callback that kernel supports */
+static void test_struct_ops_nulled_out_cb(void)
+{
+	struct struct_ops_nulled_out_cb *skel;
+	int err;
+
+	skel = struct_ops_nulled_out_cb__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	/* kernel knows about test_1, but we still null it out */
+	skel->struct_ops.ops->test_1 = NULL;
+
+	err = struct_ops_nulled_out_cb__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	ASSERT_FALSE(bpf_program__autoload(skel->progs.test_1_turn_off), "prog_autoload");
+	ASSERT_LT(bpf_program__fd(skel->progs.test_1_turn_off), 0, "prog_fd");
+
+cleanup:
+	struct_ops_nulled_out_cb__destroy(skel);
+}
+
+/* validate that libbpf generates reasonable error message if struct_ops is
+ * not referenced in any struct_ops map
+ */
+static void test_struct_ops_forgotten_cb(void)
+{
+	struct struct_ops_forgotten_cb *skel;
+	char *log;
+	int err;
+
+	skel = struct_ops_forgotten_cb__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	start_libbpf_log_capture();
+
+	err = struct_ops_forgotten_cb__load(skel);
+	if (!ASSERT_ERR(err, "skel_load"))
+		goto cleanup;
+
+	log = stop_libbpf_log_capture();
+	ASSERT_HAS_SUBSTR(log,
+			  "prog 'test_1_forgotten': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?",
+			  "libbpf_log");
+	free(log);
+
+	struct_ops_forgotten_cb__destroy(skel);
+
+	/* now let's programmatically use it, we should be fine now */
+	skel = struct_ops_forgotten_cb__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	skel->struct_ops.ops->test_1 = skel->progs.test_1_forgotten; /* not anymore */
+
+	err = struct_ops_forgotten_cb__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+cleanup:
+	struct_ops_forgotten_cb__destroy(skel);
+}
+
 void serial_test_struct_ops_module(void)
 {
-	if (test__start_subtest("test_struct_ops_load"))
+	if (test__start_subtest("struct_ops_load"))
 		test_struct_ops_load();
+	if (test__start_subtest("struct_ops_not_zeroed"))
+		test_struct_ops_not_zeroed();
+	if (test__start_subtest("struct_ops_incompatible"))
+		test_struct_ops_incompatible();
+	if (test__start_subtest("struct_ops_null_out_cb"))
+		test_struct_ops_nulled_out_cb();
+	if (test__start_subtest("struct_ops_forgotten_cb"))
+		test_struct_ops_forgotten_cb();
 }
 
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index 5f1fb0a2ea56..cec746e77cd3 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -612,6 +612,8 @@ static void test_ipip_tunnel(enum ipip_encap encap)
 
 	/* ping from at_ns0 namespace test */
 	nstoken = open_netns("at_ns0");
+	if (!ASSERT_OK_PTR(nstoken, "setns"))
+		goto done;
 	err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
 	if (!ASSERT_OK(err, "test_ping"))
 		goto done;
@@ -666,6 +668,8 @@ static void test_xfrm_tunnel(void)
 
 	/* ping from at_ns0 namespace test */
 	nstoken = open_netns("at_ns0");
+	if (!ASSERT_OK_PTR(nstoken, "setns"))
+		goto done;
 	err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
 	close_netns(nstoken);
 	if (!ASSERT_OK(err, "test_ping"))
diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c
index 7b9124d506a5..e56e88596d64 100644
--- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c
+++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c
@@ -5,18 +5,19 @@
 
 #include "trace_printk.lskel.h"
 
-#define TRACEFS_PIPE	"/sys/kernel/tracing/trace_pipe"
-#define DEBUGFS_PIPE	"/sys/kernel/debug/tracing/trace_pipe"
 #define SEARCHMSG	"testing,testing"
 
+static void trace_pipe_cb(const char *str, void *data)
+{
+	if (strstr(str, SEARCHMSG) != NULL)
+		(*(int *)data)++;
+}
+
 void serial_test_trace_printk(void)
 {
 	struct trace_printk_lskel__bss *bss;
-	int err = 0, iter = 0, found = 0;
 	struct trace_printk_lskel *skel;
-	char *buf = NULL;
-	FILE *fp = NULL;
-	size_t buflen;
+	int err = 0, found = 0;
 
 	skel = trace_printk_lskel__open();
 	if (!ASSERT_OK_PTR(skel, "trace_printk__open"))
@@ -35,16 +36,6 @@ void serial_test_trace_printk(void)
 	if (!ASSERT_OK(err, "trace_printk__attach"))
 		goto cleanup;
 
-	if (access(TRACEFS_PIPE, F_OK) == 0)
-		fp = fopen(TRACEFS_PIPE, "r");
-	else
-		fp = fopen(DEBUGFS_PIPE, "r");
-	if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)"))
-		goto cleanup;
-
-	/* We do not want to wait forever if this test fails... */
-	fcntl(fileno(fp), F_SETFL, O_NONBLOCK);
-
 	/* wait for tracepoint to trigger */
 	usleep(1);
 	trace_printk_lskel__detach(skel);
@@ -56,21 +47,12 @@ void serial_test_trace_printk(void)
 		goto cleanup;
 
 	/* verify our search string is in the trace buffer */
-	while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) {
-		if (strstr(buf, SEARCHMSG) != NULL)
-			found++;
-		if (found == bss->trace_printk_ran)
-			break;
-		if (++iter > 1000)
-			break;
-	}
+	ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000),
+		 "read_trace_pipe_iter");
 
 	if (!ASSERT_EQ(found, bss->trace_printk_ran, "found"))
 		goto cleanup;
 
 cleanup:
 	trace_printk_lskel__destroy(skel);
-	free(buf);
-	if (fp)
-		fclose(fp);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c
index 44ea2fd88f4c..2af6a6f2096a 100644
--- a/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c
+++ b/tools/testing/selftests/bpf/prog_tests/trace_vprintk.c
@@ -5,18 +5,19 @@
 
 #include "trace_vprintk.lskel.h"
 
-#define TRACEFS_PIPE	"/sys/kernel/tracing/trace_pipe"
-#define DEBUGFS_PIPE	"/sys/kernel/debug/tracing/trace_pipe"
 #define SEARCHMSG	"1,2,3,4,5,6,7,8,9,10"
 
+static void trace_pipe_cb(const char *str, void *data)
+{
+	if (strstr(str, SEARCHMSG) != NULL)
+		(*(int *)data)++;
+}
+
 void serial_test_trace_vprintk(void)
 {
 	struct trace_vprintk_lskel__bss *bss;
-	int err = 0, iter = 0, found = 0;
 	struct trace_vprintk_lskel *skel;
-	char *buf = NULL;
-	FILE *fp = NULL;
-	size_t buflen;
+	int err = 0, found = 0;
 
 	skel = trace_vprintk_lskel__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "trace_vprintk__open_and_load"))
@@ -28,16 +29,6 @@ void serial_test_trace_vprintk(void)
 	if (!ASSERT_OK(err, "trace_vprintk__attach"))
 		goto cleanup;
 
-	if (access(TRACEFS_PIPE, F_OK) == 0)
-		fp = fopen(TRACEFS_PIPE, "r");
-	else
-		fp = fopen(DEBUGFS_PIPE, "r");
-	if (!ASSERT_OK_PTR(fp, "fopen(TRACE_PIPE)"))
-		goto cleanup;
-
-	/* We do not want to wait forever if this test fails... */
-	fcntl(fileno(fp), F_SETFL, O_NONBLOCK);
-
 	/* wait for tracepoint to trigger */
 	usleep(1);
 	trace_vprintk_lskel__detach(skel);
@@ -49,14 +40,8 @@ void serial_test_trace_vprintk(void)
 		goto cleanup;
 
 	/* verify our search string is in the trace buffer */
-	while (getline(&buf, &buflen, fp) >= 0 || errno == EAGAIN) {
-		if (strstr(buf, SEARCHMSG) != NULL)
-			found++;
-		if (found == bss->trace_vprintk_ran)
-			break;
-		if (++iter > 1000)
-			break;
-	}
+	ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000),
+		 "read_trace_pipe_iter");
 
 	if (!ASSERT_EQ(found, bss->trace_vprintk_ran, "found"))
 		goto cleanup;
@@ -66,7 +51,4 @@ void serial_test_trace_vprintk(void)
 
 cleanup:
 	trace_vprintk_lskel__destroy(skel);
-	free(buf);
-	if (fp)
-		fclose(fp);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 985273832f89..c60db8beeb73 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -5,6 +5,7 @@
 #include "cap_helpers.h"
 #include "verifier_and.skel.h"
 #include "verifier_arena.skel.h"
+#include "verifier_arena_large.skel.h"
 #include "verifier_array_access.skel.h"
 #include "verifier_basic_stack.skel.h"
 #include "verifier_bitfield_write.skel.h"
@@ -65,6 +66,7 @@
 #include "verifier_sdiv.skel.h"
 #include "verifier_search_pruning.skel.h"
 #include "verifier_sock.skel.h"
+#include "verifier_sock_addr.skel.h"
 #include "verifier_spill_fill.skel.h"
 #include "verifier_spin_lock.skel.h"
 #include "verifier_stack_ptr.skel.h"
@@ -120,6 +122,7 @@ static void run_tests_aux(const char *skel_name,
 
 void test_verifier_and(void)                  { RUN(verifier_and); }
 void test_verifier_arena(void)                { RUN(verifier_arena); }
+void test_verifier_arena_large(void)          { RUN(verifier_arena_large); }
 void test_verifier_basic_stack(void)          { RUN(verifier_basic_stack); }
 void test_verifier_bitfield_write(void)       { RUN(verifier_bitfield_write); }
 void test_verifier_bounds(void)               { RUN(verifier_bounds); }
@@ -179,6 +182,7 @@ void test_verifier_scalar_ids(void)           { RUN(verifier_scalar_ids); }
 void test_verifier_sdiv(void)                 { RUN(verifier_sdiv); }
 void test_verifier_search_pruning(void)       { RUN(verifier_search_pruning); }
 void test_verifier_sock(void)                 { RUN(verifier_sock); }
+void test_verifier_sock_addr(void)            { RUN(verifier_sock_addr); }
 void test_verifier_spill_fill(void)           { RUN(verifier_spill_fill); }
 void test_verifier_spin_lock(void)            { RUN(verifier_spin_lock); }
 void test_verifier_stack_ptr(void)            { RUN(verifier_stack_ptr); }
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c
new file mode 100644
index 000000000000..3918ecc2ee91
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/verifier_kfunc_prog_types.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+
+#include "verifier_kfunc_prog_types.skel.h"
+
+void test_verifier_kfunc_prog_types(void)
+{
+	RUN_TESTS(verifier_kfunc_prog_types);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c
new file mode 100644
index 000000000000..99e438fe12ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/wq.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Benjamin Tissoires */
+#include <test_progs.h>
+#include "wq.skel.h"
+#include "wq_failures.skel.h"
+
+void serial_test_wq(void)
+{
+	struct wq *wq_skel = NULL;
+	int err, prog_fd;
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+	RUN_TESTS(wq);
+
+	/* re-run the success test to check if the timer was actually executed */
+
+	wq_skel = wq__open_and_load();
+	if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load"))
+		return;
+
+	err = wq__attach(wq_skel);
+	if (!ASSERT_OK(err, "wq_attach"))
+		return;
+
+	prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(topts.retval, 0, "test_run");
+
+	usleep(50); /* 10 usecs should be enough, but give it extra */
+
+	ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable");
+	wq__destroy(wq_skel);
+}
+
+void serial_test_failures_wq(void)
+{
+	RUN_TESTS(wq_failures);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
index 498d3bdaa4b0..bad0ea167be7 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c
@@ -107,8 +107,8 @@ void test_xdp_do_redirect(void)
 			    .attach_point = BPF_TC_INGRESS);
 
 	memcpy(&data[sizeof(__u64)], &pkt_udp, sizeof(pkt_udp));
-	*((__u32 *)data) = 0x42; /* metadata test value */
-	*((__u32 *)data + 4) = 0;
+	((__u32 *)data)[0] = 0x42; /* metadata test value */
+	((__u32 *)data)[1] = 0;
 
 	skel = test_xdp_do_redirect__open();
 	if (!ASSERT_OK_PTR(skel, "skel"))
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index 05edcf32f528..f76b5d67a3ee 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -384,6 +384,8 @@ void test_xdp_metadata(void)
 	SYS(out, "ip netns add " RX_NETNS_NAME);
 
 	tok = open_netns(TX_NETNS_NAME);
+	if (!ASSERT_OK_PTR(tok, "setns"))
+		goto out;
 	SYS(out, "ip link add numtxqueues 1 numrxqueues 1 " TX_NAME
 	    " type veth peer " RX_NAME " numtxqueues 1 numrxqueues 1");
 	SYS(out, "ip link set " RX_NAME " netns " RX_NETNS_NAME);
@@ -400,6 +402,8 @@ void test_xdp_metadata(void)
 	SYS(out, "ip -4 neigh add " RX_ADDR " lladdr " RX_MAC " dev " TX_NAME_VLAN);
 
 	switch_ns_to_rx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns rx"))
+		goto out;
 
 	SYS(out, "ip link set dev " RX_NAME " address " RX_MAC);
 	SYS(out, "ip link set dev " RX_NAME " up");
@@ -449,6 +453,8 @@ void test_xdp_metadata(void)
 		goto out;
 
 	switch_ns_to_tx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns tx"))
+		goto out;
 
 	/* Setup separate AF_XDP for TX interface nad send packet to the RX socket. */
 	tx_ifindex = if_nametoindex(TX_NAME);
@@ -461,6 +467,8 @@ void test_xdp_metadata(void)
 		goto out;
 
 	switch_ns_to_rx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns rx"))
+		goto out;
 
 	/* Verify packet sent from AF_XDP has proper metadata. */
 	if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, true), 0,
@@ -468,6 +476,8 @@ void test_xdp_metadata(void)
 		goto out;
 
 	switch_ns_to_tx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns tx"))
+		goto out;
 	complete_tx(&tx_xsk);
 
 	/* Now check metadata of packet, generated with network stack */
@@ -475,6 +485,8 @@ void test_xdp_metadata(void)
 		goto out;
 
 	switch_ns_to_rx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns rx"))
+		goto out;
 
 	if (!ASSERT_GE(verify_xsk_metadata(&rx_xsk, false), 0,
 		       "verify_xsk_metadata"))
@@ -498,6 +510,8 @@ void test_xdp_metadata(void)
 		goto out;
 
 	switch_ns_to_tx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns tx"))
+		goto out;
 
 	/* Send packet to trigger . */
 	if (!ASSERT_GE(generate_packet(&tx_xsk, AF_XDP_CONSUMER_PORT), 0,
@@ -505,6 +519,8 @@ void test_xdp_metadata(void)
 		goto out;
 
 	switch_ns_to_rx(&tok);
+	if (!ASSERT_OK_PTR(tok, "setns rx"))
+		goto out;
 
 	while (!retries--) {
 		if (bpf_obj2->bss->called)
diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
new file mode 100644
index 000000000000..55f10563208d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_arena_common.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, 10); /* number of pages */
+#ifdef __TARGET_ARCH_arm64
+	__ulong(map_extra, 0x1ull << 32); /* start of mmap() region */
+#else
+	__ulong(map_extra, 0x1ull << 44); /* start of mmap() region */
+#endif
+} arena SEC(".maps");
+
+#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+bool skip_tests __attribute((__section__(".data"))) = false;
+#else
+bool skip_tests = true;
+#endif
+
+__u32 pid = 0;
+
+#undef __arena
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+#define __arena __attribute__((address_space(1)))
+#else
+#define __arena SEC(".addr_space.1")
+#endif
+
+__u64 __arena add64_value = 1;
+__u64 __arena add64_result = 0;
+__u32 __arena add32_value = 1;
+__u32 __arena add32_result = 0;
+__u64 __arena add_stack_value_copy = 0;
+__u64 __arena add_stack_result = 0;
+__u64 __arena add_noreturn_value = 1;
+
+SEC("raw_tp/sys_enter")
+int add(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+	__u64 add_stack_value = 1;
+
+	add64_result = __sync_fetch_and_add(&add64_value, 2);
+	add32_result = __sync_fetch_and_add(&add32_value, 2);
+	add_stack_result = __sync_fetch_and_add(&add_stack_value, 2);
+	add_stack_value_copy = add_stack_value;
+	__sync_fetch_and_add(&add_noreturn_value, 2);
+#endif
+
+	return 0;
+}
+
+__s64 __arena sub64_value = 1;
+__s64 __arena sub64_result = 0;
+__s32 __arena sub32_value = 1;
+__s32 __arena sub32_result = 0;
+__s64 __arena sub_stack_value_copy = 0;
+__s64 __arena sub_stack_result = 0;
+__s64 __arena sub_noreturn_value = 1;
+
+SEC("raw_tp/sys_enter")
+int sub(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+	__u64 sub_stack_value = 1;
+
+	sub64_result = __sync_fetch_and_sub(&sub64_value, 2);
+	sub32_result = __sync_fetch_and_sub(&sub32_value, 2);
+	sub_stack_result = __sync_fetch_and_sub(&sub_stack_value, 2);
+	sub_stack_value_copy = sub_stack_value;
+	__sync_fetch_and_sub(&sub_noreturn_value, 2);
+#endif
+
+	return 0;
+}
+
+__u64 __arena and64_value = (0x110ull << 32);
+__u32 __arena and32_value = 0x110;
+
+SEC("raw_tp/sys_enter")
+int and(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+
+	__sync_fetch_and_and(&and64_value, 0x011ull << 32);
+	__sync_fetch_and_and(&and32_value, 0x011);
+#endif
+
+	return 0;
+}
+
+__u32 __arena or32_value = 0x110;
+__u64 __arena or64_value = (0x110ull << 32);
+
+SEC("raw_tp/sys_enter")
+int or(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+	__sync_fetch_and_or(&or64_value, 0x011ull << 32);
+	__sync_fetch_and_or(&or32_value, 0x011);
+#endif
+
+	return 0;
+}
+
+__u64 __arena xor64_value = (0x110ull << 32);
+__u32 __arena xor32_value = 0x110;
+
+SEC("raw_tp/sys_enter")
+int xor(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+	__sync_fetch_and_xor(&xor64_value, 0x011ull << 32);
+	__sync_fetch_and_xor(&xor32_value, 0x011);
+#endif
+
+	return 0;
+}
+
+__u32 __arena cmpxchg32_value = 1;
+__u32 __arena cmpxchg32_result_fail = 0;
+__u32 __arena cmpxchg32_result_succeed = 0;
+__u64 __arena cmpxchg64_value = 1;
+__u64 __arena cmpxchg64_result_fail = 0;
+__u64 __arena cmpxchg64_result_succeed = 0;
+
+SEC("raw_tp/sys_enter")
+int cmpxchg(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+	cmpxchg64_result_fail = __sync_val_compare_and_swap(&cmpxchg64_value, 0, 3);
+	cmpxchg64_result_succeed = __sync_val_compare_and_swap(&cmpxchg64_value, 1, 2);
+
+	cmpxchg32_result_fail = __sync_val_compare_and_swap(&cmpxchg32_value, 0, 3);
+	cmpxchg32_result_succeed = __sync_val_compare_and_swap(&cmpxchg32_value, 1, 2);
+#endif
+
+	return 0;
+}
+
+__u64 __arena xchg64_value = 1;
+__u64 __arena xchg64_result = 0;
+__u32 __arena xchg32_value = 1;
+__u32 __arena xchg32_result = 0;
+
+SEC("raw_tp/sys_enter")
+int xchg(const void *ctx)
+{
+	if (pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+#ifdef ENABLE_ATOMICS_TESTS
+	__u64 val64 = 2;
+	__u32 val32 = 2;
+
+	xchg64_result = __sync_lock_test_and_set(&xchg64_value, val64);
+	xchg32_result = __sync_lock_test_and_set(&xchg32_value, val32);
+#endif
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/arena_htab.c b/tools/testing/selftests/bpf/progs/arena_htab.c
index b7bb712cacfd..1e6ac187a6a0 100644
--- a/tools/testing/selftests/bpf/progs/arena_htab.c
+++ b/tools/testing/selftests/bpf/progs/arena_htab.c
@@ -22,7 +22,7 @@ int zero = 0;
 SEC("syscall")
 int arena_htab_llvm(void *ctx)
 {
-#if defined(__BPF_FEATURE_ARENA_CAST) || defined(BPF_ARENA_FORCE_ASM)
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) || defined(BPF_ARENA_FORCE_ASM)
 	struct htab __arena *htab;
 	__u64 i;
 
diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c
index cd35b8448435..93bd0600eba0 100644
--- a/tools/testing/selftests/bpf/progs/arena_list.c
+++ b/tools/testing/selftests/bpf/progs/arena_list.c
@@ -30,13 +30,13 @@ int list_sum;
 int cnt;
 bool skip = false;
 
-#ifdef __BPF_FEATURE_ARENA_CAST
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
 long __arena arena_sum;
 int __arena test_val = 1;
 struct arena_list_head __arena global_head;
 #else
-long arena_sum SEC(".arena.1");
-int test_val SEC(".arena.1");
+long arena_sum SEC(".addr_space.1");
+int test_val SEC(".addr_space.1");
 #endif
 
 int zero;
@@ -44,12 +44,12 @@ int zero;
 SEC("syscall")
 int arena_list_add(void *ctx)
 {
-#ifdef __BPF_FEATURE_ARENA_CAST
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
 	__u64 i;
 
 	list_head = &global_head;
 
-	for (i = zero; i < cnt; cond_break, i++) {
+	for (i = zero; i < cnt && can_loop; i++) {
 		struct elem __arena *n = bpf_alloc(sizeof(*n));
 
 		test_val++;
@@ -66,7 +66,7 @@ int arena_list_add(void *ctx)
 SEC("syscall")
 int arena_list_del(void *ctx)
 {
-#ifdef __BPF_FEATURE_ARENA_CAST
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
 	struct elem __arena *n;
 	int sum = 0;
 
diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
index e4bfbba6c193..c8ec0d0368e4 100644
--- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
@@ -61,14 +61,15 @@ SEC("lsm.s/socket_post_create")
 int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 	     int protocol, int kern)
 {
+	struct sock *sk = sock->sk;
 	struct storage *stg;
 	__u32 pid;
 
 	pid = bpf_get_current_pid_tgid() >> 32;
-	if (pid != bench_pid)
+	if (pid != bench_pid || !sk)
 		return 0;
 
-	stg = bpf_sk_storage_get(&sk_storage_map, sock->sk, NULL,
+	stg = bpf_sk_storage_get(&sk_storage_map, sk, NULL,
 				 BPF_LOCAL_STORAGE_GET_F_CREATE);
 
 	if (stg)
diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c
index a487f60b73ac..b7ddf8ec4ee8 100644
--- a/tools/testing/selftests/bpf/progs/bind4_prog.c
+++ b/tools/testing/selftests/bpf/progs/bind4_prog.c
@@ -12,6 +12,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#include "bind_prog.h"
+
 #define SERV4_IP		0xc0a801feU /* 192.168.1.254 */
 #define SERV4_PORT		4040
 #define SERV4_REWRITE_IP	0x7f000001U /* 127.0.0.1 */
@@ -118,23 +120,23 @@ int bind_v4_prog(struct bpf_sock_addr *ctx)
 
 	// u8 narrow loads:
 	user_ip4 = 0;
-	user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0;
-	user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8;
-	user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16;
-	user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24;
+	user_ip4 |= load_byte(ctx->user_ip4, 0, sizeof(user_ip4));
+	user_ip4 |= load_byte(ctx->user_ip4, 1, sizeof(user_ip4));
+	user_ip4 |= load_byte(ctx->user_ip4, 2, sizeof(user_ip4));
+	user_ip4 |= load_byte(ctx->user_ip4, 3, sizeof(user_ip4));
 	if (ctx->user_ip4 != user_ip4)
 		return 0;
 
 	user_port = 0;
-	user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0;
-	user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8;
+	user_port |= load_byte(ctx->user_port, 0, sizeof(user_port));
+	user_port |= load_byte(ctx->user_port, 1, sizeof(user_port));
 	if (ctx->user_port != user_port)
 		return 0;
 
 	// u16 narrow loads:
 	user_ip4 = 0;
-	user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0;
-	user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16;
+	user_ip4 |= load_word(ctx->user_ip4, 0, sizeof(user_ip4));
+	user_ip4 |= load_word(ctx->user_ip4, 1, sizeof(user_ip4));
 	if (ctx->user_ip4 != user_ip4)
 		return 0;
 
@@ -156,4 +158,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/bind4")
+int bind_v4_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c
index d62cd9e9cf0e..501c3fc11d35 100644
--- a/tools/testing/selftests/bpf/progs/bind6_prog.c
+++ b/tools/testing/selftests/bpf/progs/bind6_prog.c
@@ -12,6 +12,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#include "bind_prog.h"
+
 #define SERV6_IP_0		0xfaceb00c /* face:b00c:1234:5678::abcd */
 #define SERV6_IP_1		0x12345678
 #define SERV6_IP_2		0x00000000
@@ -129,25 +131,25 @@ int bind_v6_prog(struct bpf_sock_addr *ctx)
 	// u8 narrow loads:
 	for (i = 0; i < 4; i++) {
 		user_ip6 = 0;
-		user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0;
-		user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8;
-		user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16;
-		user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24;
+		user_ip6 |= load_byte(ctx->user_ip6[i], 0, sizeof(user_ip6));
+		user_ip6 |= load_byte(ctx->user_ip6[i], 1, sizeof(user_ip6));
+		user_ip6 |= load_byte(ctx->user_ip6[i], 2, sizeof(user_ip6));
+		user_ip6 |= load_byte(ctx->user_ip6[i], 3, sizeof(user_ip6));
 		if (ctx->user_ip6[i] != user_ip6)
 			return 0;
 	}
 
 	user_port = 0;
-	user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0;
-	user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8;
+	user_port |= load_byte(ctx->user_port, 0, sizeof(user_port));
+	user_port |= load_byte(ctx->user_port, 1, sizeof(user_port));
 	if (ctx->user_port != user_port)
 		return 0;
 
 	// u16 narrow loads:
 	for (i = 0; i < 4; i++) {
 		user_ip6 = 0;
-		user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0;
-		user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16;
+		user_ip6 |= load_word(ctx->user_ip6[i], 0, sizeof(user_ip6));
+		user_ip6 |= load_word(ctx->user_ip6[i], 1, sizeof(user_ip6));
 		if (ctx->user_ip6[i] != user_ip6)
 			return 0;
 	}
@@ -173,4 +175,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/bind6")
+int bind_v6_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bind_prog.h b/tools/testing/selftests/bpf/progs/bind_prog.h
new file mode 100644
index 000000000000..e830caa940c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bind_prog.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BIND_PROG_H__
+#define __BIND_PROG_H__
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define load_byte(src, b, s) \
+	(((volatile __u8 *)&(src))[b] << 8 * b)
+#define load_word(src, w, s) \
+	(((volatile __u16 *)&(src))[w] << 16 * w)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define load_byte(src, b, s) \
+	(((volatile __u8 *)&(src))[(b) + (sizeof(src) - (s))] << 8 * ((s) - (b) - 1))
+#define load_word(src, w, s) \
+	(((volatile __u16 *)&(src))[w] << 16 * (((s) / 2) - (w) - 1))
+#else
+# error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c
new file mode 100644
index 000000000000..1654a530aa3d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* Highlights:
+ * 1. The major difference between this bpf program and tcp_cubic.c
+ *    is that this bpf program relies on `cong_control` rather than
+ *    `cong_avoid` in the struct tcp_congestion_ops.
+ * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
+ *    tcp_update_pacing_rate is bypassed when `cong_control` is
+ *    defined, so moving these logic to `cong_control`.
+ * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
+ *    The main purpose is to show use cases of the arguments in
+ *    `cong_control`. For simplicity's sake, it reuses tcp cubic's
+ *    kernel functions.
+ */
+
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define USEC_PER_SEC 1000000UL
+#define TCP_PACING_SS_RATIO (200)
+#define TCP_PACING_CA_RATIO (120)
+#define TCP_REORDERING (12)
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define after(seq2, seq1) before(seq1, seq2)
+
+extern void cubictcp_init(struct sock *sk) __ksym;
+extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
+extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
+extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
+extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
+extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
+extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
+
+static bool before(__u32 seq1, __u32 seq2)
+{
+	return (__s32)(seq1-seq2) < 0;
+}
+
+static __u64 div64_u64(__u64 dividend, __u64 divisor)
+{
+	return dividend / divisor;
+}
+
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	__u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+		rate *= TCP_PACING_SS_RATIO;
+	else
+		rate *= TCP_PACING_CA_RATIO;
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	if (tp->srtt_us)
+		rate = div64_u64(rate, (__u64)tp->srtt_us);
+
+	sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+			       int newly_lost, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
+	int delta = tp->snd_ssthresh - pkts_in_flight;
+
+	if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
+		return;
+
+	__u32 prr_delivered = tp->prr_delivered + newly_acked_sacked;
+
+	if (delta < 0) {
+		__u64 dividend =
+			(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
+		sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked);
+		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
+			sndcnt++;
+		sndcnt = min(delta, sndcnt);
+	}
+	/* Force a fast retransmit upon entering fast recovery */
+	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
+	tp->snd_cwnd = pkts_in_flight + sndcnt;
+}
+
+/* Decide wheather to run the increase function of congestion control. */
+static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+{
+	if (tcp_sk(sk)->reordering > TCP_REORDERING)
+		return flag & FLAG_FORWARD_PROGRESS;
+
+	return flag & FLAG_DATA_ACKED;
+}
+
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_init, struct sock *sk)
+{
+	cubictcp_init(sk);
+}
+
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
+{
+	cubictcp_cwnd_event(sk, event);
+}
+
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
+	      const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) &
+			(1 << inet_csk(sk)->icsk_ca_state)) {
+		/* Reduce cwnd if state mandates */
+		tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);
+
+		if (!before(tp->snd_una, tp->high_seq)) {
+			/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+			if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
+					inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
+				tp->snd_cwnd = tp->snd_ssthresh;
+				tp->snd_cwnd_stamp = tcp_jiffies32;
+			}
+		}
+	} else if (tcp_may_raise_cwnd(sk, flag)) {
+		/* Advance cwnd if state allows */
+		cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
+		tp->snd_cwnd_stamp = tcp_jiffies32;
+	}
+
+	tcp_update_pacing_rate(sk);
+}
+
+SEC("struct_ops")
+__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk)
+{
+	return cubictcp_recalc_ssthresh(sk);
+}
+
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state)
+{
+	cubictcp_state(sk, new_state);
+}
+
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample)
+{
+	cubictcp_acked(sk, sample);
+}
+
+SEC("struct_ops")
+__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk)
+{
+	return tcp_reno_undo_cwnd(sk);
+}
+
+SEC(".struct_ops")
+struct tcp_congestion_ops cc_cubic = {
+	.init		= (void *)bpf_cubic_init,
+	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
+	.cong_control	= (void *)bpf_cubic_cong_control,
+	.set_state	= (void *)bpf_cubic_state,
+	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
+	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
+	.pkts_acked     = (void *)bpf_cubic_acked,
+	.name		= "bpf_cc_cubic",
+};
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c
index c997e3e3d3fb..d665b8a15cc4 100644
--- a/tools/testing/selftests/bpf/progs/bpf_cubic.c
+++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c
@@ -14,14 +14,22 @@
  *    "ca->ack_cnt / delta" operation.
  */
 
-#include <linux/bpf.h>
-#include <linux/stddef.h>
-#include <linux/tcp.h>
-#include "bpf_tcp_helpers.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
 #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+static bool before(__u32 seq1, __u32 seq2)
+{
+	return (__s32)(seq1-seq2) < 0;
+}
+#define after(seq2, seq1) 	before(seq1, seq2)
+
+extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
+extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
 
 #define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
 					 * max_cwnd = snd_cwnd * beta
@@ -70,7 +78,7 @@ static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
 				/ (bic_scale * 10);
 
 /* BIC TCP Parameters */
-struct bictcp {
+struct bpf_bictcp {
 	__u32	cnt;		/* increase cwnd by 1 after ACKs */
 	__u32	last_max_cwnd;	/* last maximum snd_cwnd */
 	__u32	last_cwnd;	/* the last snd_cwnd */
@@ -91,7 +99,7 @@ struct bictcp {
 	__u32	curr_rtt;	/* the minimum rtt of current round */
 };
 
-static inline void bictcp_reset(struct bictcp *ca)
+static void bictcp_reset(struct bpf_bictcp *ca)
 {
 	ca->cnt = 0;
 	ca->last_max_cwnd = 0;
@@ -112,7 +120,7 @@ extern unsigned long CONFIG_HZ __kconfig;
 #define USEC_PER_SEC	1000000UL
 #define USEC_PER_JIFFY	(USEC_PER_SEC / HZ)
 
-static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
+static __u64 div64_u64(__u64 dividend, __u64 divisor)
 {
 	return dividend / divisor;
 }
@@ -120,7 +128,7 @@ static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
 #define div64_ul div64_u64
 
 #define BITS_PER_U64 (sizeof(__u64) * 8)
-static __always_inline int fls64(__u64 x)
+static int fls64(__u64 x)
 {
 	int num = BITS_PER_U64 - 1;
 
@@ -153,15 +161,15 @@ static __always_inline int fls64(__u64 x)
 	return num + 1;
 }
 
-static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
+static __u32 bictcp_clock_us(const struct sock *sk)
 {
 	return tcp_sk(sk)->tcp_mstamp;
 }
 
-static __always_inline void bictcp_hystart_reset(struct sock *sk)
+static void bictcp_hystart_reset(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
+	struct bpf_bictcp *ca = inet_csk_ca(sk);
 
 	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
 	ca->end_seq = tp->snd_nxt;
@@ -169,11 +177,10 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk)
 	ca->sample_cnt = 0;
 }
 
-/* "struct_ops/" prefix is a requirement */
-SEC("struct_ops/bpf_cubic_init")
+SEC("struct_ops")
 void BPF_PROG(bpf_cubic_init, struct sock *sk)
 {
-	struct bictcp *ca = inet_csk_ca(sk);
+	struct bpf_bictcp *ca = inet_csk_ca(sk);
 
 	bictcp_reset(ca);
 
@@ -184,12 +191,11 @@ void BPF_PROG(bpf_cubic_init, struct sock *sk)
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* "struct_ops" prefix is a requirement */
-SEC("struct_ops/bpf_cubic_cwnd_event")
+SEC("struct_ops")
 void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
 {
 	if (event == CA_EVENT_TX_START) {
-		struct bictcp *ca = inet_csk_ca(sk);
+		struct bpf_bictcp *ca = inet_csk_ca(sk);
 		__u32 now = tcp_jiffies32;
 		__s32 delta;
 
@@ -230,7 +236,7 @@ static const __u8 v[] = {
  * Newton-Raphson iteration.
  * Avg err ~= 0.195%
  */
-static __always_inline __u32 cubic_root(__u64 a)
+static __u32 cubic_root(__u64 a)
 {
 	__u32 x, b, shift;
 
@@ -263,8 +269,7 @@ static __always_inline __u32 cubic_root(__u64 a)
 /*
  * Compute congestion window to use.
  */
-static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
-					  __u32 acked)
+static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked)
 {
 	__u32 delta, bic_target, max_cnt;
 	__u64 offs, t;
@@ -377,11 +382,11 @@ tcp_friendliness:
 	ca->cnt = max(ca->cnt, 2U);
 }
 
-/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */
-void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
+	struct bpf_bictcp *ca = inet_csk_ca(sk);
 
 	if (!tcp_is_cwnd_limited(sk))
 		return;
@@ -397,10 +402,11 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acke
 	tcp_cong_avoid_ai(tp, ca->cnt, acked);
 }
 
-__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
+SEC("struct_ops")
+__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
+	struct bpf_bictcp *ca = inet_csk_ca(sk);
 
 	ca->epoch_start = 0;	/* end of epoch */
 
@@ -414,7 +420,8 @@ __u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
 	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
 }
 
-void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state)
 {
 	if (new_state == TCP_CA_Loss) {
 		bictcp_reset(inet_csk_ca(sk));
@@ -433,7 +440,7 @@ void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
  * We apply another 100% factor because @rate is doubled at this point.
  * We cap the cushion to 1ms.
  */
-static __always_inline __u32 hystart_ack_delay(struct sock *sk)
+static __u32 hystart_ack_delay(struct sock *sk)
 {
 	unsigned long rate;
 
@@ -444,10 +451,10 @@ static __always_inline __u32 hystart_ack_delay(struct sock *sk)
 		   div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
 }
 
-static __always_inline void hystart_update(struct sock *sk, __u32 delay)
+static void hystart_update(struct sock *sk, __u32 delay)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
+	struct bpf_bictcp *ca = inet_csk_ca(sk);
 	__u32 threshold;
 
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
@@ -492,11 +499,11 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay)
 
 int bpf_cubic_acked_called = 0;
 
-void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
-		    const struct ack_sample *sample)
+SEC("struct_ops")
+void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
+	struct bpf_bictcp *ca = inet_csk_ca(sk);
 	__u32 delay;
 
 	bpf_cubic_acked_called = 1;
@@ -524,7 +531,8 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
 
 extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
 
-__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
+SEC("struct_ops")
+__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk)
 {
 	return tcp_reno_undo_cwnd(sk);
 }
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index 460682759aed..3c9ffe340312 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -6,15 +6,23 @@
  * the kernel BPF logic.
  */
 
-#include <stddef.h>
-#include <linux/bpf.h>
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/tcp.h>
-#include <errno.h>
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include "bpf_tcp_helpers.h"
+
+#ifndef EBUSY
+#define EBUSY 16
+#endif
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define min_not_zero(x, y) ({			\
+	typeof(x) __x = (x);			\
+	typeof(y) __y = (y);			\
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+static bool before(__u32 seq1, __u32 seq2)
+{
+	return (__s32)(seq1-seq2) < 0;
+}
 
 char _license[] SEC("license") = "GPL";
 
@@ -35,7 +43,7 @@ struct {
 
 #define DCTCP_MAX_ALPHA	1024U
 
-struct dctcp {
+struct bpf_dctcp {
 	__u32 old_delivered;
 	__u32 old_delivered_ce;
 	__u32 prior_rcv_nxt;
@@ -48,8 +56,7 @@ struct dctcp {
 static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
 static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;
 
-static __always_inline void dctcp_reset(const struct tcp_sock *tp,
-					struct dctcp *ca)
+static void dctcp_reset(const struct tcp_sock *tp, struct bpf_dctcp *ca)
 {
 	ca->next_seq = tp->snd_nxt;
 
@@ -57,11 +64,11 @@ static __always_inline void dctcp_reset(const struct tcp_sock *tp,
 	ca->old_delivered_ce = tp->delivered_ce;
 }
 
-SEC("struct_ops/dctcp_init")
+SEC("struct_ops")
 void BPF_PROG(dctcp_init, struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct dctcp *ca = inet_csk_ca(sk);
+	struct bpf_dctcp *ca = inet_csk_ca(sk);
 	int *stg;
 
 	if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
@@ -104,21 +111,21 @@ void BPF_PROG(dctcp_init, struct sock *sk)
 	dctcp_reset(tp, ca);
 }
 
-SEC("struct_ops/dctcp_ssthresh")
+SEC("struct_ops")
 __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk)
 {
-	struct dctcp *ca = inet_csk_ca(sk);
+	struct bpf_dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	ca->loss_cwnd = tp->snd_cwnd;
 	return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
 }
 
-SEC("struct_ops/dctcp_update_alpha")
+SEC("struct_ops")
 void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct dctcp *ca = inet_csk_ca(sk);
+	struct bpf_dctcp *ca = inet_csk_ca(sk);
 
 	/* Expired RTT */
 	if (!before(tp->snd_una, ca->next_seq)) {
@@ -144,16 +151,16 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags)
 	}
 }
 
-static __always_inline void dctcp_react_to_loss(struct sock *sk)
+static void dctcp_react_to_loss(struct sock *sk)
 {
-	struct dctcp *ca = inet_csk_ca(sk);
+	struct bpf_dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	ca->loss_cwnd = tp->snd_cwnd;
 	tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
 }
 
-SEC("struct_ops/dctcp_state")
+SEC("struct_ops")
 void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
 {
 	if (new_state == TCP_CA_Recovery &&
@@ -164,7 +171,7 @@ void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
 	 */
 }
 
-static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
+static void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -179,9 +186,8 @@ static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
  * S:	0 <- last pkt was non-CE
  *	1 <- last pkt was CE
  */
-static __always_inline
-void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
-			  __u32 *prior_rcv_nxt, __u32 *ce_state)
+static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+				 __u32 *prior_rcv_nxt, __u32 *ce_state)
 {
 	__u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
 
@@ -201,10 +207,10 @@ void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
 	dctcp_ece_ack_cwr(sk, new_ce_state);
 }
 
-SEC("struct_ops/dctcp_cwnd_event")
+SEC("struct_ops")
 void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev)
 {
-	struct dctcp *ca = inet_csk_ca(sk);
+	struct bpf_dctcp *ca = inet_csk_ca(sk);
 
 	switch (ev) {
 	case CA_EVENT_ECN_IS_CE:
@@ -220,17 +226,17 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev)
 	}
 }
 
-SEC("struct_ops/dctcp_cwnd_undo")
+SEC("struct_ops")
 __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk)
 {
-	const struct dctcp *ca = inet_csk_ca(sk);
+	const struct bpf_dctcp *ca = inet_csk_ca(sk);
 
 	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
 }
 
 extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
 
-SEC("struct_ops/dctcp_reno_cong_avoid")
+SEC("struct_ops")
 void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
 {
 	tcp_reno_cong_avoid(sk, ack, acked);
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
index d836f7c372f0..c91763f248b2 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c
@@ -1,19 +1,15 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
 
-#include <stddef.h>
-#include <linux/bpf.h>
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/tcp.h>
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include "bpf_tcp_helpers.h"
 
 char _license[] SEC("license") = "GPL";
 const char cubic[] = "cubic";
 
-void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk)
+SEC("struct_ops")
+void BPF_PROG(dctcp_nouse_release, struct sock *sk)
 {
 	bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
 		       (void *)cubic, sizeof(cubic));
diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c
index 2ecd833dcd41..8a7a4c1b54e8 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c
+++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c
@@ -1,14 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <linux/bpf.h>
-#include <linux/types.h>
-#include <bpf/bpf_helpers.h>
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_tracing.h>
-#include "bpf_tcp_helpers.h"
 
 char _license[] SEC("license") = "X";
 
-void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk)
+SEC("struct_ops")
+void BPF_PROG(nogpltcp_init, struct sock *sk)
 {
 }
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 7001965d1cc3..59843b430f76 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -2,6 +2,9 @@
 #ifndef __BPF_TRACING_NET_H__
 #define __BPF_TRACING_NET_H__
 
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+
 #define AF_INET			2
 #define AF_INET6		10
 
@@ -22,6 +25,7 @@
 
 #define IP_TOS			1
 
+#define SOL_IPV6		41
 #define IPV6_TCLASS		67
 #define IPV6_AUTOFLOWLABEL	70
 
@@ -46,6 +50,13 @@
 #define TCP_CA_NAME_MAX		16
 #define TCP_NAGLE_OFF		1
 
+#define TCP_ECN_OK              1
+#define TCP_ECN_QUEUE_CWR       2
+#define TCP_ECN_DEMAND_CWR      4
+#define TCP_ECN_SEEN            8
+
+#define TCP_CONG_NEEDS_ECN     0x2
+
 #define ICSK_TIME_RETRANS	1
 #define ICSK_TIME_PROBE0	3
 #define ICSK_TIME_LOSS_PROBE	5
@@ -80,6 +91,14 @@
 #define TCP_INFINITE_SSTHRESH	0x7fffffff
 #define TCP_PINGPONG_THRESH	3
 
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data.		*/
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN.		*/
+#define FLAG_DATA_SACKED 0x20 /* New SACK.				*/
+#define FLAG_SND_UNA_ADVANCED \
+	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED)
+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED)
+
 #define fib_nh_dev		nh_common.nhc_dev
 #define fib_nh_gw_family	nh_common.nhc_gw_family
 #define fib_nh_gw6		nh_common.nhc_gw.ipv6
@@ -119,4 +138,37 @@
 #define tw_v6_daddr		__tw_common.skc_v6_daddr
 #define tw_v6_rcv_saddr		__tw_common.skc_v6_rcv_saddr
 
+#define tcp_jiffies32 ((__u32)bpf_jiffies64())
+
+static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
+{
+	return (struct inet_connection_sock *)sk;
+}
+
+static inline void *inet_csk_ca(const struct sock *sk)
+{
+	return (void *)inet_csk(sk)->icsk_ca_priv;
+}
+
+static inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+	return (struct tcp_sock *)sk;
+}
+
+static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
+{
+	return tp->snd_cwnd < tp->snd_ssthresh;
+}
+
+static inline bool tcp_is_cwnd_limited(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
+	if (tcp_in_slow_start(tp))
+		return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+	return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
+}
+
 #endif
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
index 22914a70db54..73ba32e9a693 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_common.h
@@ -13,7 +13,7 @@ struct __cgrps_kfunc_map_value {
 	struct cgroup __kptr * cgrp;
 };
 
-struct hash_map {
+struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
 	__type(key, int);
 	__type(value, struct __cgrps_kfunc_map_value);
diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c
index 7ef49ec04838..9e9ebf27b878 100644
--- a/tools/testing/selftests/bpf/progs/connect4_prog.c
+++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
@@ -14,8 +14,6 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
-#include "bpf_tcp_helpers.h"
-
 #define SRC_REWRITE_IP4		0x7f000004U
 #define DST_REWRITE_IP4		0x7f000001U
 #define DST_REWRITE_PORT4	4444
@@ -32,6 +30,10 @@
 #define IFNAMSIZ 16
 #endif
 
+#ifndef SOL_TCP
+#define SOL_TCP 6
+#endif
+
 __attribute__ ((noinline)) __weak
 int do_bind(struct bpf_sock_addr *ctx)
 {
@@ -197,4 +199,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx)
 	return do_bind(ctx) ? 1 : 0;
 }
 
+SEC("cgroup/connect4")
+int connect_v4_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c
index 40266d2c737c..e98573b00ddb 100644
--- a/tools/testing/selftests/bpf/progs/connect6_prog.c
+++ b/tools/testing/selftests/bpf/progs/connect6_prog.c
@@ -90,4 +90,10 @@ int connect_v6_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/connect6")
+int connect_v6_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/connect_unix_prog.c b/tools/testing/selftests/bpf/progs/connect_unix_prog.c
index 2ef0e0c46d17..ba60adadb335 100644
--- a/tools/testing/selftests/bpf/progs/connect_unix_prog.c
+++ b/tools/testing/selftests/bpf/progs/connect_unix_prog.c
@@ -36,4 +36,10 @@ int connect_unix_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/connect_unix")
+int connect_unix_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h
index c705d8112a35..b979e91f55f0 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_common.h
+++ b/tools/testing/selftests/bpf/progs/cpumask_common.h
@@ -9,7 +9,7 @@
 
 int err;
 
-#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
+#define private(name) SEC(".bss." #name) __attribute__((aligned(8)))
 private(MASK) static struct bpf_cpumask __kptr * global_mask;
 
 struct __cpumask_map_value {
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
index a9bf6ea336cf..a988d2823b52 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_failure.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -61,11 +61,8 @@ SEC("tp_btf/task_newtask")
 __failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask")
 int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags)
 {
-	struct bpf_cpumask *cpumask;
-
 	/* Can't set the CPU of a non-struct bpf_cpumask. */
 	bpf_cpumask_set_cpu(0, (struct bpf_cpumask *)task->cpus_ptr);
-	__sink(cpumask);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/crypto_basic.c b/tools/testing/selftests/bpf/progs/crypto_basic.c
new file mode 100644
index 000000000000..8cf7168b42d5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/crypto_basic.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
+#include "crypto_common.h"
+
+int status;
+SEC("syscall")
+int crypto_release(void *ctx)
+{
+	struct bpf_crypto_params params = {
+		.type = "skcipher",
+		.algo = "ecb(aes)",
+		.key_len = 16,
+	};
+
+	struct bpf_crypto_ctx *cctx;
+	int err = 0;
+
+	status = 0;
+
+	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
+
+	if (!cctx) {
+		status = err;
+		return 0;
+	}
+
+	bpf_crypto_ctx_release(cctx);
+
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Unreleased reference")
+int crypto_acquire(void *ctx)
+{
+	struct bpf_crypto_params params = {
+		.type = "skcipher",
+		.algo = "ecb(aes)",
+		.key_len = 16,
+	};
+	struct bpf_crypto_ctx *cctx;
+	int err = 0;
+
+	status = 0;
+
+	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
+
+	if (!cctx) {
+		status = err;
+		return 0;
+	}
+
+	cctx = bpf_crypto_ctx_acquire(cctx);
+	if (!cctx)
+		return -EINVAL;
+
+	bpf_crypto_ctx_release(cctx);
+
+	return 0;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c
new file mode 100644
index 000000000000..e61fe0882293
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/crypto_bench.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
+#include "crypto_common.h"
+
+const volatile unsigned int len = 16;
+char cipher[128] = {};
+u32 key_len, authsize;
+char dst[256] = {};
+u8 key[256] = {};
+long hits = 0;
+int status;
+
+SEC("syscall")
+int crypto_setup(void *args)
+{
+	struct bpf_crypto_ctx *cctx;
+	struct bpf_crypto_params params = {
+		.type = "skcipher",
+		.key_len = key_len,
+		.authsize = authsize,
+	};
+	int err = 0;
+
+	status = 0;
+
+	if (!cipher[0] || !key_len || key_len > 256) {
+		status = -EINVAL;
+		return 0;
+	}
+
+	__builtin_memcpy(&params.algo, cipher, sizeof(cipher));
+	__builtin_memcpy(&params.key, key, sizeof(key));
+	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
+
+	if (!cctx) {
+		status = err;
+		return 0;
+	}
+
+	err = crypto_ctx_insert(cctx);
+	if (err && err != -EEXIST)
+		status = err;
+
+	return 0;
+}
+
+SEC("tc")
+int crypto_encrypt(struct __sk_buff *skb)
+{
+	struct __crypto_ctx_value *v;
+	struct bpf_crypto_ctx *ctx;
+	struct bpf_dynptr psrc, pdst, iv;
+
+	v = crypto_ctx_value_lookup();
+	if (!v) {
+		status = -ENOENT;
+		return 0;
+	}
+
+	ctx = v->ctx;
+	if (!ctx) {
+		status = -ENOENT;
+		return 0;
+	}
+
+	bpf_dynptr_from_skb(skb, 0, &psrc);
+	bpf_dynptr_from_mem(dst, len, 0, &pdst);
+	bpf_dynptr_from_mem(dst, 0, 0, &iv);
+
+	status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv);
+	__sync_add_and_fetch(&hits, 1);
+
+	return 0;
+}
+
+SEC("tc")
+int crypto_decrypt(struct __sk_buff *skb)
+{
+	struct bpf_dynptr psrc, pdst, iv;
+	struct __crypto_ctx_value *v;
+	struct bpf_crypto_ctx *ctx;
+
+	v = crypto_ctx_value_lookup();
+	if (!v)
+		return -ENOENT;
+
+	ctx = v->ctx;
+	if (!ctx)
+		return -ENOENT;
+
+	bpf_dynptr_from_skb(skb, 0, &psrc);
+	bpf_dynptr_from_mem(dst, len, 0, &pdst);
+	bpf_dynptr_from_mem(dst, 0, 0, &iv);
+
+	status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv);
+	__sync_add_and_fetch(&hits, 1);
+
+	return 0;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/crypto_common.h b/tools/testing/selftests/bpf/progs/crypto_common.h
new file mode 100644
index 000000000000..57dd7a68a8c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/crypto_common.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#ifndef _CRYPTO_COMMON_H
+#define _CRYPTO_COMMON_H
+
+#include "errno.h"
+#include <stdbool.h>
+
+struct bpf_crypto_ctx *bpf_crypto_ctx_create(const struct bpf_crypto_params *params,
+					     u32 params__sz, int *err) __ksym;
+struct bpf_crypto_ctx *bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx) __ksym;
+void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) __ksym;
+int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src,
+		       const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym;
+int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx, const struct bpf_dynptr *src,
+		       const struct bpf_dynptr *dst, const struct bpf_dynptr *iv) __ksym;
+
+struct __crypto_ctx_value {
+	struct bpf_crypto_ctx __kptr * ctx;
+};
+
+struct array_map {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct __crypto_ctx_value);
+	__uint(max_entries, 1);
+} __crypto_ctx_map SEC(".maps");
+
+static inline struct __crypto_ctx_value *crypto_ctx_value_lookup(void)
+{
+	u32 key = 0;
+
+	return bpf_map_lookup_elem(&__crypto_ctx_map, &key);
+}
+
+static inline int crypto_ctx_insert(struct bpf_crypto_ctx *ctx)
+{
+	struct __crypto_ctx_value local, *v;
+	struct bpf_crypto_ctx *old;
+	u32 key = 0;
+	int err;
+
+	local.ctx = NULL;
+	err = bpf_map_update_elem(&__crypto_ctx_map, &key, &local, 0);
+	if (err) {
+		bpf_crypto_ctx_release(ctx);
+		return err;
+	}
+
+	v = bpf_map_lookup_elem(&__crypto_ctx_map, &key);
+	if (!v) {
+		bpf_crypto_ctx_release(ctx);
+		return -ENOENT;
+	}
+
+	old = bpf_kptr_xchg(&v->ctx, ctx);
+	if (old) {
+		bpf_crypto_ctx_release(old);
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
+#endif /* _CRYPTO_COMMON_H */
diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c
new file mode 100644
index 000000000000..1be0a3fa5efd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
+#include "crypto_common.h"
+
+unsigned char key[256] = {};
+u16 udp_test_port = 7777;
+u32 authsize, key_len;
+char algo[128] = {};
+char dst[16] = {};
+int status;
+
+static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc)
+{
+	struct ipv6hdr ip6h;
+	struct udphdr udph;
+	u32 offset;
+
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return -1;
+
+	if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h)))
+		return -1;
+
+	if (ip6h.nexthdr != IPPROTO_UDP)
+		return -1;
+
+	if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(ip6h), &udph, sizeof(udph)))
+		return -1;
+
+	if (udph.dest != __bpf_htons(udp_test_port))
+		return -1;
+
+	offset = ETH_HLEN + sizeof(ip6h) + sizeof(udph);
+	if (skb->len < offset + 16)
+		return -1;
+
+	/* let's make sure that 16 bytes of payload are in the linear part of skb */
+	bpf_skb_pull_data(skb, offset + 16);
+	bpf_dynptr_from_skb(skb, 0, psrc);
+	bpf_dynptr_adjust(psrc, offset, offset + 16);
+
+	return 0;
+}
+
+SEC("syscall")
+int skb_crypto_setup(void *ctx)
+{
+	struct bpf_crypto_params params = {
+		.type = "skcipher",
+		.key_len = key_len,
+		.authsize = authsize,
+	};
+	struct bpf_crypto_ctx *cctx;
+	int err = 0;
+
+	status = 0;
+
+	if (key_len > 256) {
+		status = -EINVAL;
+		return 0;
+	}
+
+	__builtin_memcpy(&params.algo, algo, sizeof(algo));
+	__builtin_memcpy(&params.key, key, sizeof(key));
+	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
+
+	if (!cctx) {
+		status = err;
+		return 0;
+	}
+
+	err = crypto_ctx_insert(cctx);
+	if (err && err != -EEXIST)
+		status = err;
+
+	return 0;
+}
+
+SEC("tc")
+int decrypt_sanity(struct __sk_buff *skb)
+{
+	struct __crypto_ctx_value *v;
+	struct bpf_crypto_ctx *ctx;
+	struct bpf_dynptr psrc, pdst, iv;
+	int err;
+
+	err = skb_dynptr_validate(skb, &psrc);
+	if (err < 0) {
+		status = err;
+		return TC_ACT_SHOT;
+	}
+
+	v = crypto_ctx_value_lookup();
+	if (!v) {
+		status = -ENOENT;
+		return TC_ACT_SHOT;
+	}
+
+	ctx = v->ctx;
+	if (!ctx) {
+		status = -ENOENT;
+		return TC_ACT_SHOT;
+	}
+
+	/* dst is a global variable to make testing part easier to check. In real
+	 * production code, a percpu map should be used to store the result.
+	 */
+	bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst);
+	/* iv dynptr has to be initialized with 0 size, but proper memory region
+	 * has to be provided anyway
+	 */
+	bpf_dynptr_from_mem(dst, 0, 0, &iv);
+
+	status = bpf_crypto_decrypt(ctx, &psrc, &pdst, &iv);
+
+	return TC_ACT_SHOT;
+}
+
+SEC("tc")
+int encrypt_sanity(struct __sk_buff *skb)
+{
+	struct __crypto_ctx_value *v;
+	struct bpf_crypto_ctx *ctx;
+	struct bpf_dynptr psrc, pdst, iv;
+	int err;
+
+	status = 0;
+
+	err = skb_dynptr_validate(skb, &psrc);
+	if (err < 0) {
+		status = err;
+		return TC_ACT_SHOT;
+	}
+
+	v = crypto_ctx_value_lookup();
+	if (!v) {
+		status = -ENOENT;
+		return TC_ACT_SHOT;
+	}
+
+	ctx = v->ctx;
+	if (!ctx) {
+		status = -ENOENT;
+		return TC_ACT_SHOT;
+	}
+
+	/* dst is a global variable to make testing part easier to check. In real
+	 * production code, a percpu map should be used to store the result.
+	 */
+	bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst);
+	/* iv dynptr has to be initialized with 0 size, but proper memory region
+	 * has to be provided anyway
+	 */
+	bpf_dynptr_from_mem(dst, 0, 0, &iv);
+
+	status = bpf_crypto_encrypt(ctx, &psrc, &pdst, &iv);
+
+	return TC_ACT_SHOT;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c
index 1efa746c25dc..ec0c595d47af 100644
--- a/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c
+++ b/tools/testing/selftests/bpf/progs/dummy_st_ops_success.c
@@ -11,8 +11,17 @@ int BPF_PROG(test_1, struct bpf_dummy_ops_state *state)
 {
 	int ret;
 
-	if (!state)
-		return 0xf2f3f4f5;
+	/* Check that 'state' nullable status is detected correctly.
+	 * If 'state' argument would be assumed non-null by verifier
+	 * the code below would be deleted as dead (which it shouldn't).
+	 * Hide it from the compiler behind 'asm' block to avoid
+	 * unnecessary optimizations.
+	 */
+	asm volatile (
+		"if %[state] != 0 goto +2;"
+		"r0 = 0xf2f3f4f5;"
+		"exit;"
+	::[state]"p"(state));
 
 	ret = state->val;
 	state->val = 0x5a;
@@ -25,7 +34,7 @@ SEC("struct_ops/test_2")
 int BPF_PROG(test_2, struct bpf_dummy_ops_state *state, int a1, unsigned short a2,
 	     char a3, unsigned long a4)
 {
-	test_2_args[0] = (unsigned long)state;
+	test_2_args[0] = state->val;
 	test_2_args[1] = a1;
 	test_2_args[2] = a2;
 	test_2_args[3] = a3;
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 7ce7e827d5f0..66a60bfb5867 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -80,7 +80,7 @@ SEC("?raw_tp")
 __failure __msg("Unreleased reference id=2")
 int ringbuf_missing_release1(void *ctx)
 {
-	struct bpf_dynptr ptr;
+	struct bpf_dynptr ptr = {};
 
 	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
 
@@ -1385,7 +1385,7 @@ SEC("?raw_tp")
 __failure __msg("Expected an initialized dynptr as arg #1")
 int dynptr_adjust_invalid(void *ctx)
 {
-	struct bpf_dynptr ptr;
+	struct bpf_dynptr ptr = {};
 
 	/* this should fail */
 	bpf_dynptr_adjust(&ptr, 1, 2);
@@ -1398,7 +1398,7 @@ SEC("?raw_tp")
 __failure __msg("Expected an initialized dynptr as arg #1")
 int dynptr_is_null_invalid(void *ctx)
 {
-	struct bpf_dynptr ptr;
+	struct bpf_dynptr ptr = {};
 
 	/* this should fail */
 	bpf_dynptr_is_null(&ptr);
@@ -1411,7 +1411,7 @@ SEC("?raw_tp")
 __failure __msg("Expected an initialized dynptr as arg #1")
 int dynptr_is_rdonly_invalid(void *ctx)
 {
-	struct bpf_dynptr ptr;
+	struct bpf_dynptr ptr = {};
 
 	/* this should fail */
 	bpf_dynptr_is_rdonly(&ptr);
@@ -1424,7 +1424,7 @@ SEC("?raw_tp")
 __failure __msg("Expected an initialized dynptr as arg #1")
 int dynptr_size_invalid(void *ctx)
 {
-	struct bpf_dynptr ptr;
+	struct bpf_dynptr ptr = {};
 
 	/* this should fail */
 	bpf_dynptr_size(&ptr);
@@ -1437,7 +1437,7 @@ SEC("?raw_tp")
 __failure __msg("Expected an initialized dynptr as arg #1")
 int clone_invalid1(void *ctx)
 {
-	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr1 = {};
 	struct bpf_dynptr ptr2;
 
 	/* this should fail */
diff --git a/tools/testing/selftests/bpf/progs/fib_lookup.c b/tools/testing/selftests/bpf/progs/fib_lookup.c
index c4514dd58c62..7b5dd2214ff4 100644
--- a/tools/testing/selftests/bpf/progs/fib_lookup.c
+++ b/tools/testing/selftests/bpf/progs/fib_lookup.c
@@ -3,8 +3,8 @@
 
 #include <linux/types.h>
 #include <linux/bpf.h>
+#include <linux/pkt_cls.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_tracing_net.h"
 
 struct bpf_fib_lookup fib_params = {};
 int fib_lookup_ret = 0;
diff --git a/tools/testing/selftests/bpf/progs/for_each_multi_maps.c b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c
new file mode 100644
index 000000000000..ff0bed7d4459
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/for_each_multi_maps.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 3);
+	__type(key, __u32);
+	__type(value, __u64);
+} arraymap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 5);
+	__type(key, __u32);
+	__type(value, __u64);
+} hashmap SEC(".maps");
+
+struct callback_ctx {
+	int output;
+};
+
+u32 data_output = 0;
+int use_array = 0;
+
+static __u64
+check_map_elem(struct bpf_map *map, __u32 *key, __u64 *val,
+	       struct callback_ctx *data)
+{
+	data->output += *val;
+	return 0;
+}
+
+SEC("tc")
+int test_pkt_access(struct __sk_buff *skb)
+{
+	struct callback_ctx data;
+
+	data.output = 0;
+	if (use_array)
+		bpf_for_each_map_elem(&arraymap, check_map_elem, &data, 0);
+	else
+		bpf_for_each_map_elem(&hashmap, check_map_elem, &data, 0);
+	data_output = data.output;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/getpeername4_prog.c b/tools/testing/selftests/bpf/progs/getpeername4_prog.c
new file mode 100644
index 000000000000..4c97208cd25d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/getpeername4_prog.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC */
+
+#include "vmlinux.h"
+
+#include <string.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_kfuncs.h"
+
+#define REWRITE_ADDRESS_IP4   0xc0a801fe // 192.168.1.254
+#define REWRITE_ADDRESS_PORT4 4040
+
+SEC("cgroup/getpeername4")
+int getpeername_v4_prog(struct bpf_sock_addr *ctx)
+{
+	ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4);
+	ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4);
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/getpeername6_prog.c b/tools/testing/selftests/bpf/progs/getpeername6_prog.c
new file mode 100644
index 000000000000..070e4d7f636c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/getpeername6_prog.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC */
+
+#include "vmlinux.h"
+
+#include <string.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_kfuncs.h"
+
+#define REWRITE_ADDRESS_IP6_0 0xfaceb00c
+#define REWRITE_ADDRESS_IP6_1 0x12345678
+#define REWRITE_ADDRESS_IP6_2 0x00000000
+#define REWRITE_ADDRESS_IP6_3 0x0000abcd
+
+#define REWRITE_ADDRESS_PORT6 6060
+
+SEC("cgroup/getpeername6")
+int getpeername_v6_prog(struct bpf_sock_addr *ctx)
+{
+	ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0);
+	ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1);
+	ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2);
+	ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3);
+	ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6);
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/getsockname4_prog.c b/tools/testing/selftests/bpf/progs/getsockname4_prog.c
new file mode 100644
index 000000000000..e298487c6347
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/getsockname4_prog.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC */
+
+#include "vmlinux.h"
+
+#include <string.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_kfuncs.h"
+
+#define REWRITE_ADDRESS_IP4   0xc0a801fe // 192.168.1.254
+#define REWRITE_ADDRESS_PORT4 4040
+
+SEC("cgroup/getsockname4")
+int getsockname_v4_prog(struct bpf_sock_addr *ctx)
+{
+	ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4);
+	ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4);
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/getsockname6_prog.c b/tools/testing/selftests/bpf/progs/getsockname6_prog.c
new file mode 100644
index 000000000000..811d10cd5525
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/getsockname6_prog.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC */
+
+#include "vmlinux.h"
+
+#include <string.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_kfuncs.h"
+
+#define REWRITE_ADDRESS_IP6_0 0xfaceb00c
+#define REWRITE_ADDRESS_IP6_1 0x12345678
+#define REWRITE_ADDRESS_IP6_2 0x00000000
+#define REWRITE_ADDRESS_IP6_3 0x0000abcd
+
+#define REWRITE_ADDRESS_PORT6 6060
+
+SEC("cgroup/getsockname6")
+int getsockname_v6_prog(struct bpf_sock_addr *ctx)
+{
+	ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0);
+	ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1);
+	ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2);
+	ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3);
+	ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6);
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 3db416606f2f..fe65e0952a1e 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -673,7 +673,7 @@ static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul)
 
 static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n)
 {
-	int *t, i, sum = 0;;
+	int *t, i, sum = 0;
 
 	while ((t = bpf_iter_num_next(it))) {
 		i = *t;
diff --git a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c
index f46965053acb..4d619bea9c75 100644
--- a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c
+++ b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c
@@ -4,6 +4,10 @@
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
 char _license[] SEC("license") = "GPL";
 
 struct {
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c
new file mode 100644
index 000000000000..bbba9eb46551
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_kfuncs.h"
+
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0]))
+
+char _license[] SEC("license") = "GPL";
+
+extern const void bpf_fentry_test1 __ksym;
+extern const void bpf_fentry_test2 __ksym;
+extern const void bpf_fentry_test3 __ksym;
+extern const void bpf_fentry_test4 __ksym;
+extern const void bpf_fentry_test5 __ksym;
+extern const void bpf_fentry_test6 __ksym;
+extern const void bpf_fentry_test7 __ksym;
+extern const void bpf_fentry_test8 __ksym;
+
+int pid = 0;
+
+__u64 kprobe_session_result[8];
+
+static int session_check(void *ctx)
+{
+	unsigned int i;
+	__u64 addr;
+	const void *kfuncs[] = {
+		&bpf_fentry_test1,
+		&bpf_fentry_test2,
+		&bpf_fentry_test3,
+		&bpf_fentry_test4,
+		&bpf_fentry_test5,
+		&bpf_fentry_test6,
+		&bpf_fentry_test7,
+		&bpf_fentry_test8,
+	};
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	addr = bpf_get_func_ip(ctx);
+
+	for (i = 0; i < ARRAY_SIZE(kfuncs); i++) {
+		if (kfuncs[i] == (void *) addr) {
+			kprobe_session_result[i]++;
+			break;
+		}
+	}
+
+	/*
+	 * Force probes for function bpf_fentry_test[5-8] not to
+	 * install and execute the return probe
+	 */
+	if (((const void *) addr == &bpf_fentry_test5) ||
+	    ((const void *) addr == &bpf_fentry_test6) ||
+	    ((const void *) addr == &bpf_fentry_test7) ||
+	    ((const void *) addr == &bpf_fentry_test8))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * No tests in here, just to trigger 'bpf_fentry_test*'
+ * through tracing test_run
+ */
+SEC("fentry/bpf_modify_return_test")
+int BPF_PROG(trigger)
+{
+	return 0;
+}
+
+SEC("kprobe.session/bpf_fentry_test*")
+int test_kprobe(struct pt_regs *ctx)
+{
+	return session_check(ctx);
+}
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c
new file mode 100644
index 000000000000..d49070803e22
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <stdbool.h>
+#include "bpf_kfuncs.h"
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+
+__u64 test_kprobe_1_result = 0;
+__u64 test_kprobe_2_result = 0;
+__u64 test_kprobe_3_result = 0;
+
+/*
+ * No tests in here, just to trigger 'bpf_fentry_test*'
+ * through tracing test_run
+ */
+SEC("fentry/bpf_modify_return_test")
+int BPF_PROG(trigger)
+{
+	return 0;
+}
+
+static int check_cookie(__u64 val, __u64 *result)
+{
+	long *cookie;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1;
+
+	cookie = bpf_session_cookie();
+
+	if (bpf_session_is_return())
+		*result = *cookie == val ? val : 0;
+	else
+		*cookie = val;
+	return 0;
+}
+
+SEC("kprobe.session/bpf_fentry_test1")
+int test_kprobe_1(struct pt_regs *ctx)
+{
+	return check_cookie(1, &test_kprobe_1_result);
+}
+
+SEC("kprobe.session/bpf_fentry_test1")
+int test_kprobe_2(struct pt_regs *ctx)
+{
+	return check_cookie(2, &test_kprobe_2_result);
+}
+
+SEC("kprobe.session/bpf_fentry_test1")
+int test_kprobe_3(struct pt_regs *ctx)
+{
+	return check_cookie(3, &test_kprobe_3_result);
+}
diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c
index e5e3a8b8dd07..637e75df2e14 100644
--- a/tools/testing/selftests/bpf/progs/local_storage.c
+++ b/tools/testing/selftests/bpf/progs/local_storage.c
@@ -140,11 +140,12 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
 {
 	__u32 pid = bpf_get_current_pid_tgid() >> 32;
 	struct local_storage *storage;
+	struct sock *sk = sock->sk;
 
-	if (pid != monitored_pid)
+	if (pid != monitored_pid || !sk)
 		return 0;
 
-	storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, 0);
+	storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0);
 	if (!storage)
 		return 0;
 
@@ -155,24 +156,24 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
 	/* This tests that we can associate multiple elements
 	 * with the local storage.
 	 */
-	storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0,
+	storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0,
 				     BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (!storage)
 		return 0;
 
-	if (bpf_sk_storage_delete(&sk_storage_map2, sock->sk))
+	if (bpf_sk_storage_delete(&sk_storage_map2, sk))
 		return 0;
 
-	storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0,
+	storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0,
 				     BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (!storage)
 		return 0;
 
-	if (bpf_sk_storage_delete(&sk_storage_map, sock->sk))
+	if (bpf_sk_storage_delete(&sk_storage_map, sk))
 		return 0;
 
 	/* Ensure that the sk_storage_map is disconnected from the storage. */
-	if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap)
+	if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap)
 		return 0;
 
 	sk_storage_result = 0;
@@ -185,11 +186,12 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 {
 	__u32 pid = bpf_get_current_pid_tgid() >> 32;
 	struct local_storage *storage;
+	struct sock *sk = sock->sk;
 
-	if (pid != monitored_pid)
+	if (pid != monitored_pid || !sk)
 		return 0;
 
-	storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
+	storage = bpf_sk_storage_get(&sk_storage_map, sk, 0,
 				     BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (!storage)
 		return 0;
diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c
index 02c11d16b692..d7598538aa2d 100644
--- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c
+++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c
@@ -103,11 +103,15 @@ static __always_inline int real_bind(struct socket *sock,
 				     int addrlen)
 {
 	struct sockaddr_ll sa = {};
+	struct sock *sk = sock->sk;
 
-	if (sock->sk->__sk_common.skc_family != AF_PACKET)
+	if (!sk)
+		return 1;
+
+	if (sk->__sk_common.skc_family != AF_PACKET)
 		return 1;
 
-	if (sock->sk->sk_kern_sock)
+	if (sk->sk_kern_sock)
 		return 1;
 
 	bpf_probe_read_kernel(&sa, sizeof(sa), address);
diff --git a/tools/testing/selftests/bpf/progs/mptcp_sock.c b/tools/testing/selftests/bpf/progs/mptcp_sock.c
index 91a0d7eff2ac..f3acb90588c7 100644
--- a/tools/testing/selftests/bpf/progs/mptcp_sock.c
+++ b/tools/testing/selftests/bpf/progs/mptcp_sock.c
@@ -2,9 +2,9 @@
 /* Copyright (c) 2020, Tessares SA. */
 /* Copyright (c) 2022, SUSE. */
 
-#include <linux/bpf.h>
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
-#include "bpf_tcp_helpers.h"
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 __u32 token = 0;
diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c
index 53301ae8a8f7..cbdc730c3a47 100644
--- a/tools/testing/selftests/bpf/progs/mptcpify.c
+++ b/tools/testing/selftests/bpf/progs/mptcpify.c
@@ -6,10 +6,14 @@
 #include "bpf_tracing_net.h"
 
 char _license[] SEC("license") = "GPL";
+int pid;
 
 SEC("fmod_ret/update_socket_protocol")
 int BPF_PROG(mptcpify, int family, int type, int protocol)
 {
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return protocol;
+
 	if ((family == AF_INET || family == AF_INET6) &&
 	    type == SOCK_STREAM &&
 	    (!protocol || protocol == IPPROTO_TCP)) {
diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c
new file mode 100644
index 000000000000..672fc368d9c4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/preempt_lock.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+
+SEC("?tc")
+__failure __msg("1 bpf_preempt_enable is missing")
+int preempt_lock_missing_1(struct __sk_buff *ctx)
+{
+	bpf_preempt_disable();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("2 bpf_preempt_enable(s) are missing")
+int preempt_lock_missing_2(struct __sk_buff *ctx)
+{
+	bpf_preempt_disable();
+	bpf_preempt_disable();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("3 bpf_preempt_enable(s) are missing")
+int preempt_lock_missing_3(struct __sk_buff *ctx)
+{
+	bpf_preempt_disable();
+	bpf_preempt_disable();
+	bpf_preempt_disable();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("1 bpf_preempt_enable is missing")
+int preempt_lock_missing_3_minus_2(struct __sk_buff *ctx)
+{
+	bpf_preempt_disable();
+	bpf_preempt_disable();
+	bpf_preempt_disable();
+	bpf_preempt_enable();
+	bpf_preempt_enable();
+	return 0;
+}
+
+static __noinline void preempt_disable(void)
+{
+	bpf_preempt_disable();
+}
+
+static __noinline void preempt_enable(void)
+{
+	bpf_preempt_enable();
+}
+
+SEC("?tc")
+__failure __msg("1 bpf_preempt_enable is missing")
+int preempt_lock_missing_1_subprog(struct __sk_buff *ctx)
+{
+	preempt_disable();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("2 bpf_preempt_enable(s) are missing")
+int preempt_lock_missing_2_subprog(struct __sk_buff *ctx)
+{
+	preempt_disable();
+	preempt_disable();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("1 bpf_preempt_enable is missing")
+int preempt_lock_missing_2_minus_1_subprog(struct __sk_buff *ctx)
+{
+	preempt_disable();
+	preempt_disable();
+	preempt_enable();
+	return 0;
+}
+
+static __noinline void preempt_balance_subprog(void)
+{
+	preempt_disable();
+	preempt_enable();
+}
+
+SEC("?tc")
+__success int preempt_balance(struct __sk_buff *ctx)
+{
+	bpf_guard_preempt();
+	return 0;
+}
+
+SEC("?tc")
+__success int preempt_balance_subprog_test(struct __sk_buff *ctx)
+{
+	preempt_balance_subprog();
+	return 0;
+}
+
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+__failure __msg("sleepable helper bpf_copy_from_user#")
+int preempt_sleepable_helper(void *ctx)
+{
+	u32 data;
+
+	bpf_preempt_disable();
+	bpf_copy_from_user(&data, sizeof(data), NULL);
+	bpf_preempt_enable();
+	return 0;
+}
+
+int __noinline preempt_global_subprog(void)
+{
+	preempt_balance_subprog();
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("global function calls are not allowed with preemption disabled")
+int preempt_global_subprog_test(struct __sk_buff *ctx)
+{
+	preempt_disable();
+	preempt_global_subprog();
+	preempt_enable();
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c
index 351e79aef2fa..edc159598a0e 100644
--- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c
+++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c
@@ -49,4 +49,10 @@ int sendmsg_v4_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/sendmsg4")
+int sendmsg_v4_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c
index bf9b46b806f6..36a7f960799f 100644
--- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c
+++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c
@@ -20,6 +20,11 @@
 #define DST_REWRITE_IP6_2	0
 #define DST_REWRITE_IP6_3	1
 
+#define DST_REWRITE_IP6_V4_MAPPED_0	0
+#define DST_REWRITE_IP6_V4_MAPPED_1	0
+#define DST_REWRITE_IP6_V4_MAPPED_2	0x0000FFFF
+#define DST_REWRITE_IP6_V4_MAPPED_3	0xc0a80004 // 192.168.0.4
+
 #define DST_REWRITE_PORT6	6666
 
 SEC("cgroup/sendmsg6")
@@ -59,4 +64,56 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx)
+{
+	/* Rewrite source. */
+	ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0);
+	ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1);
+	ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2);
+	ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3);
+
+	/* Rewrite destination. */
+	ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_0);
+	ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_1);
+	ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_2);
+	ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_3);
+
+	ctx->user_port = bpf_htons(DST_REWRITE_PORT6);
+
+	return 1;
+}
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_wildcard_prog(struct bpf_sock_addr *ctx)
+{
+	/* Rewrite source. */
+	ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0);
+	ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1);
+	ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2);
+	ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3);
+
+	/* Rewrite destination. */
+	ctx->user_ip6[0] = bpf_htonl(0);
+	ctx->user_ip6[1] = bpf_htonl(0);
+	ctx->user_ip6[2] = bpf_htonl(0);
+	ctx->user_ip6[3] = bpf_htonl(0);
+
+	ctx->user_port = bpf_htons(DST_REWRITE_PORT6);
+
+	return 1;
+}
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c
index d8869b03dda9..332d0eb1116f 100644
--- a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c
+++ b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c
@@ -36,4 +36,10 @@ int sendmsg_unix_prog(struct bpf_sock_addr *ctx)
 	return 1;
 }
 
+SEC("cgroup/sendmsg_unix")
+int sendmsg_unix_deny_prog(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c
index 992b7861003a..db4abd2682fc 100644
--- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c
+++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
+#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
 #define BPF_NO_PRESERVE_ACCESS_INDEX
+#endif
 #include <vmlinux.h>
 #include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
diff --git a/tools/testing/selftests/bpf/progs/sock_addr_kern.c b/tools/testing/selftests/bpf/progs/sock_addr_kern.c
new file mode 100644
index 000000000000..8386bb15ccdc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sock_addr_kern.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "../bpf_testmod/bpf_testmod_kfunc.h"
+
+SEC("syscall")
+int init_sock(struct init_sock_args *args)
+{
+	bpf_kfunc_init_sock(args);
+
+	return 0;
+}
+
+SEC("syscall")
+int close_sock(void *ctx)
+{
+	bpf_kfunc_close_sock();
+
+	return 0;
+}
+
+SEC("syscall")
+int kernel_connect(struct addr_args *args)
+{
+	return bpf_kfunc_call_kernel_connect(args);
+}
+
+SEC("syscall")
+int kernel_bind(struct addr_args *args)
+{
+	return bpf_kfunc_call_kernel_bind(args);
+}
+
+SEC("syscall")
+int kernel_listen(struct addr_args *args)
+{
+	return bpf_kfunc_call_kernel_listen();
+}
+
+SEC("syscall")
+int kernel_sendmsg(struct sendmsg_args *args)
+{
+	return bpf_kfunc_call_kernel_sendmsg(args);
+}
+
+SEC("syscall")
+int sock_sendmsg(struct sendmsg_args *args)
+{
+	return bpf_kfunc_call_sock_sendmsg(args);
+}
+
+SEC("syscall")
+int kernel_getsockname(struct addr_args *args)
+{
+	return bpf_kfunc_call_kernel_getsockname(args);
+}
+
+SEC("syscall")
+int kernel_getpeername(struct addr_args *args)
+{
+	return bpf_kfunc_call_kernel_getpeername(args);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
index 83753b00a556..5c3614333b01 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
@@ -1,24 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
-#include <string.h>
-#include <linux/tcp.h>
-#include <netinet/in.h>
-#include <linux/bpf.h>
-#include <bpf/bpf_helpers.h>
-#include "bpf_tcp_helpers.h"
+#include "bpf_tracing_net.h"
 
 char _license[] SEC("license") = "GPL";
 
 __s32 page_size = 0;
 
+const char cc_reno[TCP_CA_NAME_MAX] = "reno";
+const char cc_cubic[TCP_CA_NAME_MAX] = "cubic";
+
 SEC("cgroup/setsockopt")
 int sockopt_qos_to_cc(struct bpf_sockopt *ctx)
 {
 	void *optval_end = ctx->optval_end;
 	int *optval = ctx->optval;
 	char buf[TCP_CA_NAME_MAX];
-	char cc_reno[TCP_CA_NAME_MAX] = "reno";
-	char cc_cubic[TCP_CA_NAME_MAX] = "cubic";
 
 	if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS)
 		goto out;
@@ -29,11 +25,11 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx)
 	if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf)))
 		return 0;
 
-	if (!tcp_cc_eq(buf, cc_cubic))
+	if (bpf_strncmp(buf, sizeof(buf), cc_cubic))
 		return 0;
 
 	if (*optval == 0x2d) {
-		if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno,
+		if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, (void *)&cc_reno,
 				sizeof(cc_reno)))
 			return 0;
 	}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c
new file mode 100644
index 000000000000..3c822103bd40
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/test_1")
+int BPF_PROG(test_1_forgotten)
+{
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops ops = {
+	/* we forgot to reference test_1_forgotten above, oops */
+};
+
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c
index 026cabfa7f1f..4c56d4a9d9f4 100644
--- a/tools/testing/selftests/bpf/progs/struct_ops_module.c
+++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c
@@ -23,7 +23,7 @@ void BPF_PROG(test_2, int a, int b)
 	test_2_result = a + b;
 }
 
-SEC("struct_ops/test_3")
+SEC("?struct_ops/test_3")
 int BPF_PROG(test_3, int a, int b)
 {
 	test_2_result = a + b + 3;
@@ -54,3 +54,37 @@ struct bpf_testmod_ops___v2 testmod_2 = {
 	.test_1 = (void *)test_1,
 	.test_2 = (void *)test_2_v2,
 };
+
+struct bpf_testmod_ops___zeroed {
+	int (*test_1)(void);
+	void (*test_2)(int a, int b);
+	int (*test_maybe_null)(int dummy, struct task_struct *task);
+	void (*zeroed_op)(int a, int b);
+	int zeroed;
+};
+
+SEC("struct_ops/test_3")
+int BPF_PROG(zeroed_op)
+{
+	return 1;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops___zeroed testmod_zeroed = {
+	.test_1 = (void *)test_1,
+	.test_2 = (void *)test_2_v2,
+	.zeroed_op = (void *)zeroed_op,
+};
+
+struct bpf_testmod_ops___incompatible {
+	int (*test_1)(void);
+	void (*test_2)(int *a);
+	int data;
+};
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops___incompatible testmod_incompatible = {
+	.test_1 = (void *)test_1,
+	.test_2 = (void *)test_2,
+	.data = 3,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c
new file mode 100644
index 000000000000..fa2021388485
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+int rand;
+int arr[1];
+
+SEC("struct_ops/test_1")
+int BPF_PROG(test_1_turn_off)
+{
+	return arr[rand]; /* potentially way out of range access */
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops ops = {
+	.test_1 = (void *)test_1_turn_off,
+};
+
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_common.h b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
index 41f2d44f49cb..6720c4b5be41 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_common.h
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_common.h
@@ -13,7 +13,7 @@ struct __tasks_kfunc_map_value {
 	struct task_struct __kptr * task;
 };
 
-struct hash_map {
+struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
 	__type(key, int);
 	__type(value, struct __tasks_kfunc_map_value);
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c
index 7bb872fb22dd..0016c90e9c13 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c
@@ -1,24 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include "vmlinux.h"
-
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
-static inline struct tcp_sock *tcp_sk(const struct sock *sk)
-{
-	return (struct tcp_sock *)sk;
-}
-
-SEC("struct_ops/incompl_cong_ops_ssthresh")
+SEC("struct_ops")
 __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk)
 {
 	return tcp_sk(sk)->snd_ssthresh;
 }
 
-SEC("struct_ops/incompl_cong_ops_undo_cwnd")
+SEC("struct_ops")
 __u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk)
 {
 	return tcp_sk(sk)->snd_cwnd;
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
new file mode 100644
index 000000000000..f95862f570b7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_tracing.h>
+
+extern void bbr_init(struct sock *sk) __ksym;
+extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym;
+extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym;
+extern u32 bbr_undo_cwnd(struct sock *sk) __ksym;
+extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
+extern u32 bbr_ssthresh(struct sock *sk) __ksym;
+extern u32 bbr_min_tso_segs(struct sock *sk) __ksym;
+extern void bbr_set_state(struct sock *sk, u8 new_state) __ksym;
+
+extern void dctcp_init(struct sock *sk) __ksym;
+extern void dctcp_update_alpha(struct sock *sk, u32 flags) __ksym;
+extern void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) __ksym;
+extern u32 dctcp_ssthresh(struct sock *sk) __ksym;
+extern u32 dctcp_cwnd_undo(struct sock *sk) __ksym;
+extern void dctcp_state(struct sock *sk, u8 new_state) __ksym;
+
+extern void cubictcp_init(struct sock *sk) __ksym;
+extern u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
+extern void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) __ksym;
+extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym;
+extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
+extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
+
+SEC("struct_ops")
+void BPF_PROG(init, struct sock *sk)
+{
+	bbr_init(sk);
+	dctcp_init(sk);
+	cubictcp_init(sk);
+}
+
+SEC("struct_ops")
+void BPF_PROG(in_ack_event, struct sock *sk, u32 flags)
+{
+	dctcp_update_alpha(sk, flags);
+}
+
+SEC("struct_ops")
+void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
+{
+	bbr_main(sk, ack, flag, rs);
+}
+
+SEC("struct_ops")
+void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked)
+{
+	cubictcp_cong_avoid(sk, ack, acked);
+}
+
+SEC("struct_ops")
+u32 BPF_PROG(sndbuf_expand, struct sock *sk)
+{
+	return bbr_sndbuf_expand(sk);
+}
+
+SEC("struct_ops")
+u32 BPF_PROG(undo_cwnd, struct sock *sk)
+{
+	bbr_undo_cwnd(sk);
+	return dctcp_cwnd_undo(sk);
+}
+
+SEC("struct_ops")
+void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event)
+{
+	bbr_cwnd_event(sk, event);
+	dctcp_cwnd_event(sk, event);
+	cubictcp_cwnd_event(sk, event);
+}
+
+SEC("struct_ops")
+u32 BPF_PROG(ssthresh, struct sock *sk)
+{
+	bbr_ssthresh(sk);
+	dctcp_ssthresh(sk);
+	return cubictcp_recalc_ssthresh(sk);
+}
+
+SEC("struct_ops")
+u32 BPF_PROG(min_tso_segs, struct sock *sk)
+{
+	return bbr_min_tso_segs(sk);
+}
+
+SEC("struct_ops")
+void BPF_PROG(set_state, struct sock *sk, u8 new_state)
+{
+	bbr_set_state(sk, new_state);
+	dctcp_state(sk, new_state);
+	cubictcp_state(sk, new_state);
+}
+
+SEC("struct_ops")
+void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample)
+{
+	cubictcp_acked(sk, sample);
+}
+
+SEC(".struct_ops")
+struct tcp_congestion_ops tcp_ca_kfunc = {
+	.init		= (void *)init,
+	.in_ack_event	= (void *)in_ack_event,
+	.cong_control	= (void *)cong_control,
+	.cong_avoid	= (void *)cong_avoid,
+	.sndbuf_expand	= (void *)sndbuf_expand,
+	.undo_cwnd	= (void *)undo_cwnd,
+	.cwnd_event	= (void *)cwnd_event,
+	.ssthresh	= (void *)ssthresh,
+	.min_tso_segs	= (void *)min_tso_segs,
+	.set_state	= (void *)set_state,
+	.pkts_acked     = (void *)pkts_acked,
+	.name		= "tcp_ca_kfunc",
+};
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c
index c06f4a41c21a..54f916a931c6 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c
@@ -7,7 +7,7 @@
 
 char _license[] SEC("license") = "GPL";
 
-SEC("struct_ops/unsupp_cong_op_get_info")
+SEC("struct_ops")
 size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr,
 		union tcp_cc_info *info)
 {
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
index b93a0ed33057..e4bd82bc0d01 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include "vmlinux.h"
-
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
@@ -10,36 +9,31 @@ char _license[] SEC("license") = "GPL";
 int ca1_cnt = 0;
 int ca2_cnt = 0;
 
-static inline struct tcp_sock *tcp_sk(const struct sock *sk)
-{
-	return (struct tcp_sock *)sk;
-}
-
-SEC("struct_ops/ca_update_1_init")
+SEC("struct_ops")
 void BPF_PROG(ca_update_1_init, struct sock *sk)
 {
 	ca1_cnt++;
 }
 
-SEC("struct_ops/ca_update_2_init")
+SEC("struct_ops")
 void BPF_PROG(ca_update_2_init, struct sock *sk)
 {
 	ca2_cnt++;
 }
 
-SEC("struct_ops/ca_update_cong_control")
+SEC("struct_ops")
 void BPF_PROG(ca_update_cong_control, struct sock *sk,
 	      const struct rate_sample *rs)
 {
 }
 
-SEC("struct_ops/ca_update_ssthresh")
+SEC("struct_ops")
 __u32 BPF_PROG(ca_update_ssthresh, struct sock *sk)
 {
 	return tcp_sk(sk)->snd_ssthresh;
 }
 
-SEC("struct_ops/ca_update_undo_cwnd")
+SEC("struct_ops")
 __u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk)
 {
 	return tcp_sk(sk)->snd_cwnd;
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c
index 0724a79cec78..a58b5194fc89 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include "vmlinux.h"
-
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
@@ -11,22 +10,17 @@ char _license[] SEC("license") = "GPL";
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
 
-static inline struct tcp_sock *tcp_sk(const struct sock *sk)
-{
-	return (struct tcp_sock *)sk;
-}
-
-static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
+static unsigned int tcp_left_out(const struct tcp_sock *tp)
 {
 	return tp->sacked_out + tp->lost_out;
 }
 
-static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
+static unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 {
 	return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
 }
 
-SEC("struct_ops/write_sk_pacing_init")
+SEC("struct_ops")
 void BPF_PROG(write_sk_pacing_init, struct sock *sk)
 {
 #ifdef ENABLE_ATOMICS_TESTS
@@ -37,7 +31,7 @@ void BPF_PROG(write_sk_pacing_init, struct sock *sk)
 #endif
 }
 
-SEC("struct_ops/write_sk_pacing_cong_control")
+SEC("struct_ops")
 void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk,
 	      const struct rate_sample *rs)
 {
@@ -49,13 +43,13 @@ void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk,
 	tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ?: 1;
 }
 
-SEC("struct_ops/write_sk_pacing_ssthresh")
+SEC("struct_ops")
 __u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk)
 {
 	return tcp_sk(sk)->snd_ssthresh;
 }
 
-SEC("struct_ops/write_sk_pacing_undo_cwnd")
+SEC("struct_ops")
 __u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk)
 {
 	return tcp_sk(sk)->snd_cwnd;
diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c
index 0988d79f1587..42c729f85524 100644
--- a/tools/testing/selftests/bpf/progs/tcp_rtt.c
+++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c
@@ -10,6 +10,9 @@ struct tcp_rtt_storage {
 	__u32 delivered;
 	__u32 delivered_ce;
 	__u32 icsk_retransmits;
+
+	__u32 mrtt_us;	/* args[0] */
+	__u32 srtt;	/* args[1] */
 };
 
 struct {
@@ -55,5 +58,8 @@ int _sockops(struct bpf_sock_ops *ctx)
 	storage->delivered_ce = tcp_sk->delivered_ce;
 	storage->icsk_retransmits = tcp_sk->icsk_retransmits;
 
+	storage->mrtt_us = ctx->args[0];
+	storage->srtt = ctx->args[1];
+
 	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c
index 5a3a80f751c4..c83142b55f47 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c
@@ -15,6 +15,8 @@ __u64 uprobe_res;
 __u64 uretprobe_res;
 __u64 tp_res;
 __u64 pe_res;
+__u64 raw_tp_res;
+__u64 tp_btf_res;
 __u64 fentry_res;
 __u64 fexit_res;
 __u64 fmod_ret_res;
@@ -87,6 +89,20 @@ int handle_pe(struct pt_regs *ctx)
 	return 0;
 }
 
+SEC("raw_tp/sys_enter")
+int handle_raw_tp(void *ctx)
+{
+	update(ctx, &raw_tp_res);
+	return 0;
+}
+
+SEC("tp_btf/sys_enter")
+int handle_tp_btf(void *ctx)
+{
+	update(ctx, &tp_btf_res);
+	return 0;
+}
+
 SEC("fentry/bpf_fentry_test1")
 int BPF_PROG(fentry_test1, int a)
 {
diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
index e2bea4da194b..f0759efff6ef 100644
--- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
@@ -1,19 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 
-#include <string.h>
-#include <errno.h>
-#include <netinet/in.h>
-#include <linux/stddef.h>
-#include <linux/bpf.h>
-#include <linux/ipv6.h>
-#include <linux/tcp.h>
-#include <linux/if_ether.h>
-#include <linux/pkt_cls.h>
-
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
-#include "bpf_tcp_helpers.h"
+
+#ifndef ENOENT
+#define ENOENT 2
+#endif
 
 struct sockaddr_in6 srv_sa6 = {};
 __u16 listen_tp_sport = 0;
diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c
index 8fba3f3649e2..5da001ca57a5 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func10.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func10.c
@@ -4,6 +4,10 @@
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
+#if !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 struct Small {
 	long x;
 };
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c
index 8c895122f293..83439b87b766 100644
--- a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c
+++ b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c
@@ -3,7 +3,7 @@
 #include <bpf/bpf_endian.h>
 #include <bpf/bpf_helpers.h>
 #include <linux/ip.h>
-#include "bpf_tracing_net.h"
+#include <linux/if_ether.h>
 
 /* We don't care about whether the packet can be received by network stack.
  * Just care if the packet is sent to the correct device at correct direction
diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c
index 8a1b50f3a002..cc1a012d038f 100644
--- a/tools/testing/selftests/bpf/progs/test_module_attach.c
+++ b/tools/testing/selftests/bpf/progs/test_module_attach.c
@@ -73,6 +73,29 @@ int BPF_PROG(handle_fentry_manual,
 	return 0;
 }
 
+__u32 fentry_explicit_read_sz = 0;
+
+SEC("fentry/bpf_testmod:bpf_testmod_test_read")
+int BPF_PROG(handle_fentry_explicit,
+	     struct file *file, struct kobject *kobj,
+	     struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len)
+{
+	fentry_explicit_read_sz = len;
+	return 0;
+}
+
+
+__u32 fentry_explicit_manual_read_sz = 0;
+
+SEC("fentry")
+int BPF_PROG(handle_fentry_explicit_manual,
+	     struct file *file, struct kobject *kobj,
+	     struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len)
+{
+	fentry_explicit_manual_read_sz = len;
+	return 0;
+}
+
 __u32 fexit_read_sz = 0;
 int fexit_ret = 0;
 
diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c
index 0763d49f9c42..386315afad65 100644
--- a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c
+++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c
@@ -5,23 +5,48 @@
 #include <stdint.h>
 #include <bpf/bpf_helpers.h>
 
+struct {
+	__uint(type, BPF_MAP_TYPE_SOCKMAP);
+	__uint(max_entries, 2);
+	__type(key, __u32);
+	__type(value, __u32);
+} sock_map SEC(".maps");
+
 __u64 user_pid = 0;
 __u64 user_tgid = 0;
 __u64 dev = 0;
 __u64 ino = 0;
 
-SEC("tracepoint/syscalls/sys_enter_nanosleep")
-int handler(const void *ctx)
+static void get_pid_tgid(void)
 {
 	struct bpf_pidns_info nsdata;
 
 	if (bpf_get_ns_current_pid_tgid(dev, ino, &nsdata, sizeof(struct bpf_pidns_info)))
-		return 0;
+		return;
 
 	user_pid = nsdata.pid;
 	user_tgid = nsdata.tgid;
+}
 
+SEC("?tracepoint/syscalls/sys_enter_nanosleep")
+int tp_handler(const void *ctx)
+{
+	get_pid_tgid();
 	return 0;
 }
 
+SEC("?cgroup/bind4")
+int cgroup_bind4(struct bpf_sock_addr *ctx)
+{
+	get_pid_tgid();
+	return 1;
+}
+
+SEC("?sk_msg")
+int sk_msg(struct sk_msg_md *msg)
+{
+	get_pid_tgid();
+	return SK_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_perf_skip.c b/tools/testing/selftests/bpf/progs/test_perf_skip.c
new file mode 100644
index 000000000000..7eb8b6de7a57
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_skip.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+uintptr_t ip;
+
+SEC("perf_event")
+int handler(struct bpf_perf_event_data *data)
+{
+	/* Skip events that have the correct ip. */
+	return ip != PT_REGS_IP(&data->regs);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_n.c b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c
new file mode 100644
index 000000000000..8669eb42dbe0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf_n.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2024 Andrea Righi <andrea.righi@canonical.com>
+
+#include <linux/bpf.h>
+#include <sched.h>
+#include <unistd.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define TASK_COMM_LEN 16
+
+struct sample {
+	int pid;
+	long value;
+	char comm[16];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+} ringbuf SEC(".maps");
+
+int pid = 0;
+long value = 0;
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int test_ringbuf_n(void *ctx)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+	struct sample *sample;
+
+	if (cur_pid != pid)
+		return 0;
+
+	sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0);
+	if (!sample)
+		return 0;
+
+	sample->pid = pid;
+	sample->value = value;
+	bpf_get_current_comm(sample->comm, sizeof(sample->comm));
+
+	bpf_ringbuf_submit(sample, 0);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c
index 45e8fc75a739..996b177324ba 100644
--- a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c
+++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c
@@ -24,8 +24,7 @@ struct {
 	__type(value, __u64);
 } socket_storage SEC(".maps");
 
-SEC("sk_msg")
-int prog_msg_verdict(struct sk_msg_md *msg)
+static int prog_msg_verdict_common(struct sk_msg_md *msg)
 {
 	struct task_struct *task = (struct task_struct *)bpf_get_current_task();
 	int verdict = SK_PASS;
@@ -44,4 +43,28 @@ int prog_msg_verdict(struct sk_msg_md *msg)
 	return verdict;
 }
 
+SEC("sk_msg")
+int prog_msg_verdict(struct sk_msg_md *msg)
+{
+	return prog_msg_verdict_common(msg);
+}
+
+SEC("sk_msg")
+int prog_msg_verdict_clone(struct sk_msg_md *msg)
+{
+	return prog_msg_verdict_common(msg);
+}
+
+SEC("sk_msg")
+int prog_msg_verdict_clone2(struct sk_msg_md *msg)
+{
+	return prog_msg_verdict_common(msg);
+}
+
+SEC("sk_skb/stream_verdict")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+	return SK_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c
index f75e531bf36f..196844be349c 100644
--- a/tools/testing/selftests/bpf/progs/test_sock_fields.c
+++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c
@@ -7,7 +7,6 @@
 
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
-#include "bpf_tcp_helpers.h"
 
 enum bpf_linum_array_idx {
 	EGRESS_LINUM_IDX,
@@ -42,6 +41,10 @@ struct {
 	__type(value, struct bpf_spinlock_cnt);
 } sk_pkt_out_cnt10 SEC(".maps");
 
+struct tcp_sock {
+	__u32	lsndtime;
+} __attribute__((preserve_access_index));
+
 struct bpf_tcp_sock listen_tp = {};
 struct sockaddr_in6 srv_sa6 = {};
 struct bpf_tcp_sock cli_tp = {};
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
index 1d86a717a290..69aacc96db36 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
@@ -23,10 +23,25 @@ struct {
 	__type(value, int);
 } sock_map_msg SEC(".maps");
 
-SEC("sk_skb")
+SEC("sk_skb/stream_verdict")
 int prog_skb_verdict(struct __sk_buff *skb)
 {
 	return SK_PASS;
 }
 
+int clone_called;
+
+SEC("sk_skb/stream_verdict")
+int prog_skb_verdict_clone(struct __sk_buff *skb)
+{
+	clone_called = 1;
+	return SK_PASS;
+}
+
+SEC("sk_skb/stream_parser")
+int prog_skb_parser(struct __sk_buff *skb)
+{
+	return SK_PASS;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
index 3c69aa971738..d25b0bb30fc0 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
@@ -9,7 +9,7 @@ struct {
 	__type(value, __u64);
 } sock_map SEC(".maps");
 
-SEC("sk_skb")
+SEC("sk_skb/verdict")
 int prog_skb_verdict(struct __sk_buff *skb)
 {
 	return SK_DROP;
diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
index a3f3f43fc195..6935f32eeb8f 100644
--- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
@@ -1,18 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <stddef.h>
-#include <string.h>
-#include <netinet/in.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
+#include "bpf_tracing_net.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
-#include "bpf_tcp_helpers.h"
 #include "test_tcpbpf.h"
 
 struct tcpbpf_globals global = {};
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 3e436e6f7312..3f5abcf3ff13 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -567,12 +567,18 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
 
+struct local_geneve_opt {
+	struct geneve_opt gopt;
+	int data;
+};
+
 SEC("tc")
 int geneve_set_tunnel(struct __sk_buff *skb)
 {
 	int ret;
 	struct bpf_tunnel_key key;
-	struct geneve_opt gopt;
+	struct local_geneve_opt local_gopt;
+	struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt;
 
 	__builtin_memset(&key, 0x0, sizeof(key));
 	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
@@ -580,14 +586,14 @@ int geneve_set_tunnel(struct __sk_buff *skb)
 	key.tunnel_tos = 0;
 	key.tunnel_ttl = 64;
 
-	__builtin_memset(&gopt, 0x0, sizeof(gopt));
-	gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
-	gopt.type = 0x08;
-	gopt.r1 = 0;
-	gopt.r2 = 0;
-	gopt.r3 = 0;
-	gopt.length = 2; /* 4-byte multiple */
-	*(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef);
+	__builtin_memset(gopt, 0x0, sizeof(local_gopt));
+	gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
+	gopt->type = 0x08;
+	gopt->r1 = 0;
+	gopt->r2 = 0;
+	gopt->r3 = 0;
+	gopt->length = 2; /* 4-byte multiple */
+	*(int *) &gopt->opt_data = bpf_htonl(0xdeadbeef);
 
 	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 				     BPF_F_ZERO_CSUM_TX);
@@ -596,7 +602,7 @@ int geneve_set_tunnel(struct __sk_buff *skb)
 		return TC_ACT_SHOT;
 	}
 
-	ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
+	ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(local_gopt));
 	if (ret < 0) {
 		log_err(ret);
 		return TC_ACT_SHOT;
@@ -631,7 +637,8 @@ SEC("tc")
 int ip6geneve_set_tunnel(struct __sk_buff *skb)
 {
 	struct bpf_tunnel_key key;
-	struct geneve_opt gopt;
+	struct local_geneve_opt local_gopt;
+	struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt;
 	int ret;
 
 	__builtin_memset(&key, 0x0, sizeof(key));
@@ -647,16 +654,16 @@ int ip6geneve_set_tunnel(struct __sk_buff *skb)
 		return TC_ACT_SHOT;
 	}
 
-	__builtin_memset(&gopt, 0x0, sizeof(gopt));
-	gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
-	gopt.type = 0x08;
-	gopt.r1 = 0;
-	gopt.r2 = 0;
-	gopt.r3 = 0;
-	gopt.length = 2; /* 4-byte multiple */
-	*(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef);
+	__builtin_memset(gopt, 0x0, sizeof(local_gopt));
+	gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
+	gopt->type = 0x08;
+	gopt->r1 = 0;
+	gopt->r2 = 0;
+	gopt->r3 = 0;
+	gopt->length = 2; /* 4-byte multiple */
+	*(int *) &gopt->opt_data = bpf_htonl(0xfeedbeef);
 
-	ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
+	ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(gopt));
 	if (ret < 0) {
 		log_err(ret);
 		return TC_ACT_SHOT;
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
index 5c7e4758a0ca..fad94e41cef9 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
@@ -318,6 +318,14 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval,
 	return true;
 }
 
+#ifndef __clang__
+#pragma GCC push_options
+/* GCC optimization collapses functions and increases the number of arguments
+ * beyond the compatible amount supported by BPF.
+ */
+#pragma GCC optimize("-fno-ipa-sra")
+#endif
+
 static __attribute__ ((noinline))
 bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
 	      struct packet_description *pckt,
@@ -372,6 +380,10 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
 	return true;
 }
 
+#ifndef __clang__
+#pragma GCC pop_options
+#endif
+
 static __attribute__ ((noinline))
 int swap_mac_and_send(void *data, void *data_end)
 {
@@ -588,12 +600,13 @@ static void connection_table_lookup(struct real_definition **real,
 __attribute__ ((noinline))
 static int process_l3_headers_v6(struct packet_description *pckt,
 				 __u8 *protocol, __u64 off,
-				 __u16 *pkt_bytes, void *data,
-				 void *data_end)
+				 __u16 *pkt_bytes, void *extra_args[2])
 {
 	struct ipv6hdr *ip6h;
 	__u64 iph_len;
 	int action;
+	void *data = extra_args[0];
+	void *data_end = extra_args[1];
 
 	ip6h = data + off;
 	if (ip6h + 1 > data_end)
@@ -619,11 +632,12 @@ static int process_l3_headers_v6(struct packet_description *pckt,
 __attribute__ ((noinline))
 static int process_l3_headers_v4(struct packet_description *pckt,
 				 __u8 *protocol, __u64 off,
-				 __u16 *pkt_bytes, void *data,
-				 void *data_end)
+				 __u16 *pkt_bytes, void *extra_args[2])
 {
 	struct iphdr *iph;
 	int action;
+	void *data = extra_args[0];
+	void *data_end = extra_args[1];
 
 	iph = data + off;
 	if (iph + 1 > data_end)
@@ -666,13 +680,14 @@ static int process_packet(void *data, __u64 off, void *data_end,
 	__u8 protocol;
 	__u32 vip_num;
 	int action;
+	void *extra_args[2] = { data, data_end };
 
 	if (is_ipv6)
 		action = process_l3_headers_v6(&pckt, &protocol, off,
-					       &pkt_bytes, data, data_end);
+					       &pkt_bytes, extra_args);
 	else
 		action = process_l3_headers_v4(&pckt, &protocol, off,
-					       &pkt_bytes, data, data_end);
+					       &pkt_bytes, extra_args);
 	if (action >= 0)
 		return action;
 	protocol = pckt.flow.proto;
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
index f3ec8086482d..a7588302268d 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
@@ -160,7 +160,7 @@ int  xdp_prognum1(struct xdp_md *ctx)
 
 		/* Modifying VLAN, preserve top 4 bits */
 		vlan_hdr->h_vlan_TCI =
-			bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000)
+			bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000U)
 				  | TO_VLAN);
 	}
 
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index f615da97df26..4c677c001258 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -2,9 +2,10 @@
 /* Copyright (c) 2021 Facebook */
 #include <linux/bpf.h>
 #include <time.h>
+#include <stdbool.h>
 #include <errno.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_tcp_helpers.h"
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 struct hmap_elem {
diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c
index 0996c2486f05..5a2e9dabf1c6 100644
--- a/tools/testing/selftests/bpf/progs/timer_failure.c
+++ b/tools/testing/selftests/bpf/progs/timer_failure.c
@@ -5,8 +5,8 @@
 #include <time.h>
 #include <errno.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
-#include "bpf_tcp_helpers.h"
 
 char _license[] SEC("license") = "GPL";
 
diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c
index 2fee7ab105ef..50ebc3f68522 100644
--- a/tools/testing/selftests/bpf/progs/timer_mim.c
+++ b/tools/testing/selftests/bpf/progs/timer_mim.c
@@ -4,7 +4,7 @@
 #include <time.h>
 #include <errno.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_tcp_helpers.h"
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 struct hmap_elem {
diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c
index 5d648e3d8a41..dd3f1ed6d6e6 100644
--- a/tools/testing/selftests/bpf/progs/timer_mim_reject.c
+++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c
@@ -4,7 +4,7 @@
 #include <time.h>
 #include <errno.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_tcp_helpers.h"
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 struct hmap_elem {
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 5fda43901033..2619ed193c65 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2020 Facebook
-
 #include <linux/bpf.h>
 #include <asm/unistd.h>
 #include <bpf/bpf_helpers.h>
@@ -9,82 +8,126 @@
 
 char _license[] SEC("license") = "GPL";
 
-long hits = 0;
+#define CPU_MASK 255
+#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */
 
-SEC("tp/syscalls/sys_enter_getpgid")
-int bench_trigger_tp(void *ctx)
+/* matches struct counter in bench.h */
+struct counter {
+	long value;
+} __attribute__((aligned(128)));
+
+struct counter hits[MAX_CPUS];
+
+static __always_inline void inc_counter(void)
+{
+	int cpu = bpf_get_smp_processor_id();
+
+	__sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
+}
+
+SEC("?uprobe")
+int bench_trigger_uprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("raw_tp/sys_enter")
-int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
+const volatile int batch_iters = 0;
+
+SEC("?raw_tp")
+int trigger_count(void *ctx)
 {
-	if (id == __NR_getpgid)
-		__sync_add_and_fetch(&hits, 1);
+	int i;
+
+	for (i = 0; i < batch_iters; i++)
+		inc_counter();
+
 	return 0;
 }
 
-SEC("kprobe/" SYS_PREFIX "sys_getpgid")
+SEC("?raw_tp")
+int trigger_driver(void *ctx)
+{
+	int i;
+
+	for (i = 0; i < batch_iters; i++)
+		(void)bpf_get_numa_node_id(); /* attach point for benchmarking */
+
+	return 0;
+}
+
+extern int bpf_modify_return_test_tp(int nonce) __ksym __weak;
+
+SEC("?raw_tp")
+int trigger_driver_kfunc(void *ctx)
+{
+	int i;
+
+	for (i = 0; i < batch_iters; i++)
+		(void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */
+
+	return 0;
+}
+
+SEC("?kprobe/bpf_get_numa_node_id")
 int bench_trigger_kprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("kretprobe/" SYS_PREFIX "sys_getpgid")
+SEC("?kretprobe/bpf_get_numa_node_id")
 int bench_trigger_kretprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid")
+SEC("?kprobe.multi/bpf_get_numa_node_id")
 int bench_trigger_kprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid")
+SEC("?kretprobe.multi/bpf_get_numa_node_id")
 int bench_trigger_kretprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("fentry/" SYS_PREFIX "sys_getpgid")
+SEC("?fentry/bpf_get_numa_node_id")
 int bench_trigger_fentry(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("fexit/" SYS_PREFIX "sys_getpgid")
+SEC("?fexit/bpf_get_numa_node_id")
 int bench_trigger_fexit(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
 
-SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
-int bench_trigger_fentry_sleep(void *ctx)
+SEC("?fmod_ret/bpf_modify_return_test_tp")
+int bench_trigger_fmodret(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
-	return 0;
+	inc_counter();
+	return -22;
 }
 
-SEC("fmod_ret/" SYS_PREFIX "sys_getpgid")
-int bench_trigger_fmodret(void *ctx)
+SEC("?tp/bpf_test_run/bpf_trigger_tp")
+int bench_trigger_tp(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
-	return -22;
+	inc_counter();
+	return 0;
 }
 
-SEC("uprobe")
-int bench_trigger_uprobe(void *ctx)
+SEC("?raw_tp/bpf_trigger_tp")
+int bench_trigger_rawtp(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c
index 5540b05ff9ee..93144ae6df74 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena.c
@@ -12,14 +12,18 @@ struct {
 	__uint(type, BPF_MAP_TYPE_ARENA);
 	__uint(map_flags, BPF_F_MMAPABLE);
 	__uint(max_entries, 2); /* arena of two pages close to 32-bit boundary*/
-	__ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */
+#ifdef __TARGET_ARCH_arm64
+        __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */
+#else
+        __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */
+#endif
 } arena SEC(".maps");
 
 SEC("syscall")
 __success __retval(0)
 int basic_alloc1(void *ctx)
 {
-#if defined(__BPF_FEATURE_ARENA_CAST)
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
 	volatile int __arena *page1, *page2, *no_page, *page3;
 
 	page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
@@ -58,7 +62,7 @@ SEC("syscall")
 __success __retval(0)
 int basic_alloc2(void *ctx)
 {
-#if defined(__BPF_FEATURE_ARENA_CAST)
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
 	volatile char __arena *page1, *page2, *page3, *page4;
 
 	page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
new file mode 100644
index 000000000000..ef66ea460264
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+#define ARENA_SIZE (1ull << 32)
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, ARENA_SIZE / PAGE_SIZE);
+} arena SEC(".maps");
+
+SEC("syscall")
+__success __retval(0)
+int big_alloc1(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	volatile char __arena *page1, *page2, *no_page, *page3;
+	void __arena *base;
+
+	page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page1)
+		return 1;
+	*page1 = 1;
+	page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE,
+				      1, NUMA_NO_NODE, 0);
+	if (!page2)
+		return 2;
+	*page2 = 2;
+	no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE,
+					1, NUMA_NO_NODE, 0);
+	if (no_page)
+		return 3;
+	if (*page1 != 1)
+		return 4;
+	if (*page2 != 2)
+		return 5;
+	bpf_arena_free_pages(&arena, (void __arena *)page1, 1);
+	if (*page2 != 2)
+		return 6;
+	if (*page1 != 0) /* use-after-free should return 0 */
+		return 7;
+	page3 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page3)
+		return 8;
+	*page3 = 3;
+	if (page1 != page3)
+		return 9;
+	if (*page2 != 2)
+		return 10;
+	if (*(page1 + PAGE_SIZE) != 0)
+		return 11;
+	if (*(page1 - PAGE_SIZE) != 0)
+		return 12;
+	if (*(page2 + PAGE_SIZE) != 0)
+		return 13;
+	if (*(page2 - PAGE_SIZE) != 0)
+		return 14;
+#endif
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 960998f16306..a0bb7fb40ea5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -886,6 +886,69 @@ l1_%=:	r0 = 0;						\
 }
 
 SEC("socket")
+__description("bounds check for non const xor src dst")
+__success __log_level(2)
+__msg("5: (af) r0 ^= r6                      ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))")
+__naked void non_const_xor_src_dst(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];                    \
+	r6 = r0;					\
+	call %[bpf_get_prandom_u32];                    \
+	r6 &= 0xaf;					\
+	r0 &= 0x1a0;					\
+	r0 ^= r6;					\
+	exit;						\
+"	:
+	: __imm(bpf_map_lookup_elem),
+	__imm_addr(map_hash_8b),
+	__imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("bounds check for non const or src dst")
+__success __log_level(2)
+__msg("5: (4f) r0 |= r6                      ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))")
+__naked void non_const_or_src_dst(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];                    \
+	r6 = r0;					\
+	call %[bpf_get_prandom_u32];                    \
+	r6 &= 0xaf;					\
+	r0 &= 0x1a0;					\
+	r0 |= r6;					\
+	exit;						\
+"	:
+	: __imm(bpf_map_lookup_elem),
+	__imm_addr(map_hash_8b),
+	__imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("bounds check for non const mul regs")
+__success __log_level(2)
+__msg("5: (2f) r0 *= r6                      ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))")
+__naked void non_const_mul_regs(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];                    \
+	r6 = r0;					\
+	call %[bpf_get_prandom_u32];                    \
+	r6 &= 0xff;					\
+	r0 &= 0x0f;					\
+	r0 *= r6;					\
+	exit;						\
+"	:
+	: __imm(bpf_map_lookup_elem),
+	__imm_addr(map_hash_8b),
+	__imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
 __description("bounds checks after 32-bit truncation. test 1")
 __success __failure_unpriv __msg_unpriv("R0 leaks addr")
 __retval(0)
diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
index baff5ffe9405..a9fc30ed4d73 100644
--- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
+++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
@@ -8,6 +8,13 @@
 #include "xdp_metadata.h"
 #include "bpf_kfuncs.h"
 
+/* The compiler may be able to detect the access to uninitialized
+   memory in the routines performing out of bound memory accesses and
+   emit warnings about it.  This is the case of GCC. */
+#if !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 int arr[1];
 int unkn_idx;
 const volatile bool call_dead_subprog = false;
diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c
index 0ede0ccd090c..059aa716e3d0 100644
--- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c
+++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c
@@ -30,7 +30,7 @@ struct {
 
 SEC("kprobe")
 __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE")
-__failure __msg("unknown func bpf_ktime_get_coarse_ns")
+__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns")
 __naked void in_bpf_prog_type_kprobe_1(void)
 {
 	asm volatile ("					\
@@ -44,7 +44,7 @@ __naked void in_bpf_prog_type_kprobe_1(void)
 
 SEC("tracepoint")
 __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT")
-__failure __msg("unknown func bpf_ktime_get_coarse_ns")
+__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns")
 __naked void in_bpf_prog_type_tracepoint_1(void)
 {
 	asm volatile ("					\
@@ -58,7 +58,7 @@ __naked void in_bpf_prog_type_tracepoint_1(void)
 
 SEC("perf_event")
 __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT")
-__failure __msg("unknown func bpf_ktime_get_coarse_ns")
+__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns")
 __naked void bpf_prog_type_perf_event_1(void)
 {
 	asm volatile ("					\
@@ -72,7 +72,7 @@ __naked void bpf_prog_type_perf_event_1(void)
 
 SEC("raw_tracepoint")
 __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT")
-__failure __msg("unknown func bpf_ktime_get_coarse_ns")
+__failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns")
 __naked void bpf_prog_type_raw_tracepoint_1(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
index 99e561f18f9b..bd676d7e615f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
+++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
@@ -318,7 +318,7 @@ int cond_break1(const void *ctx)
 	unsigned long i;
 	unsigned int sum = 0;
 
-	for (i = zero; i < ARR_SZ; cond_break, i++)
+	for (i = zero; i < ARR_SZ && can_loop; i++)
 		sum += i;
 	for (i = zero; i < ARR_SZ; i++) {
 		barrier_var(i);
@@ -336,12 +336,11 @@ int cond_break2(const void *ctx)
 	int i, j;
 	int sum = 0;
 
-	for (i = zero; i < 1000; cond_break, i++)
+	for (i = zero; i < 1000 && can_loop; i++)
 		for (j = zero; j < 1000; j++) {
 			sum += i + j;
 			cond_break;
-		}
-
+	}
 	return sum;
 }
 
@@ -349,7 +348,7 @@ static __noinline int loop(void)
 {
 	int i, sum = 0;
 
-	for (i = zero; i <= 1000000; i++, cond_break)
+	for (i = zero; i <= 1000000 && can_loop; i++)
 		sum += i;
 
 	return sum;
diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c
new file mode 100644
index 000000000000..cb32b0cfc84b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_misc.h"
+#include "cgrp_kfunc_common.h"
+#include "cpumask_common.h"
+#include "task_kfunc_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+/***************
+ * Task kfuncs *
+ ***************/
+
+static void task_kfunc_load_test(void)
+{
+	struct task_struct *current, *ref_1, *ref_2;
+
+	current = bpf_get_current_task_btf();
+	ref_1 = bpf_task_from_pid(current->pid);
+	if (!ref_1)
+		return;
+
+	ref_2 = bpf_task_acquire(ref_1);
+	if (ref_2)
+		bpf_task_release(ref_2);
+	bpf_task_release(ref_1);
+}
+
+SEC("raw_tp")
+__failure __msg("calling kernel function")
+int BPF_PROG(task_kfunc_raw_tp)
+{
+	task_kfunc_load_test();
+	return 0;
+}
+
+SEC("syscall")
+__success
+int BPF_PROG(task_kfunc_syscall)
+{
+	task_kfunc_load_test();
+	return 0;
+}
+
+/*****************
+ * cgroup kfuncs *
+ *****************/
+
+static void cgrp_kfunc_load_test(void)
+{
+	struct cgroup *cgrp, *ref;
+
+	cgrp = bpf_cgroup_from_id(0);
+	if (!cgrp)
+		return;
+
+	ref = bpf_cgroup_acquire(cgrp);
+	if (!ref) {
+		bpf_cgroup_release(cgrp);
+		return;
+	}
+
+	bpf_cgroup_release(ref);
+	bpf_cgroup_release(cgrp);
+}
+
+SEC("raw_tp")
+__failure __msg("calling kernel function")
+int BPF_PROG(cgrp_kfunc_raw_tp)
+{
+	cgrp_kfunc_load_test();
+	return 0;
+}
+
+SEC("syscall")
+__success
+int BPF_PROG(cgrp_kfunc_syscall)
+{
+	cgrp_kfunc_load_test();
+	return 0;
+}
+
+/******************
+ * cpumask kfuncs *
+ ******************/
+
+static void cpumask_kfunc_load_test(void)
+{
+	struct bpf_cpumask *alloc, *ref;
+
+	alloc = bpf_cpumask_create();
+	if (!alloc)
+		return;
+
+	ref = bpf_cpumask_acquire(alloc);
+	bpf_cpumask_set_cpu(0, alloc);
+	bpf_cpumask_test_cpu(0, (const struct cpumask *)ref);
+
+	bpf_cpumask_release(ref);
+	bpf_cpumask_release(alloc);
+}
+
+SEC("raw_tp")
+__failure __msg("calling kernel function")
+int BPF_PROG(cpumask_kfunc_raw_tp)
+{
+	cpumask_kfunc_load_test();
+	return 0;
+}
+
+SEC("syscall")
+__success
+int BPF_PROG(cpumask_kfunc_syscall)
+{
+	cpumask_kfunc_load_test();
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c
new file mode 100644
index 000000000000..9c31448a0f52
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf_sockopt_helpers.h>
+#include "bpf_misc.h"
+
+SEC("cgroup/recvmsg4")
+__success
+int recvmsg4_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/recvmsg4")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int recvmsg4_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/recvmsg6")
+__success
+int recvmsg6_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/recvmsg6")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/recvmsg_unix")
+__success
+int recvmsg_unix_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/recvmsg_unix")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int recvmsg_unix_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/sendmsg4")
+__success
+int sendmsg4_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/sendmsg4")
+__success
+int sendmsg4_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/sendmsg4")
+__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int sendmsg4_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/sendmsg6")
+__success
+int sendmsg6_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/sendmsg6")
+__success
+int sendmsg6_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/sendmsg6")
+__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int sendmsg6_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/sendmsg_unix")
+__success
+int sendmsg_unix_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/sendmsg_unix")
+__success
+int sendmsg_unix_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/sendmsg_unix")
+__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int sendmsg_unix_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/getpeername4")
+__success
+int getpeername4_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/getpeername4")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int getpeername4_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/getpeername6")
+__success
+int getpeername6_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/getpeername6")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int getpeername6_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/getpeername_unix")
+__success
+int getpeername_unix_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/getpeername_unix")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int getpeername_unix_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/getsockname4")
+__success
+int getsockname4_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/getsockname4")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int getsockname4_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/getsockname6")
+__success
+int getsockname6_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/getsockname6")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int getsockname6_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/getsockname_unix")
+__success
+int getsockname_unix_good_return_code(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/getsockname_unix")
+__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]")
+int getsockname_unix_unix_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/bind4")
+__success
+int bind4_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/bind4")
+__success
+int bind4_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/bind4")
+__success
+int bind4_good_return_code_2(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/bind4")
+__success
+int bind4_good_return_code_3(struct bpf_sock_addr *ctx)
+{
+	return 3;
+}
+
+SEC("cgroup/bind4")
+__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]")
+int bind4_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 4;
+}
+
+SEC("cgroup/bind6")
+__success
+int bind6_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/bind6")
+__success
+int bind6_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/bind6")
+__success
+int bind6_good_return_code_2(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/bind6")
+__success
+int bind6_good_return_code_3(struct bpf_sock_addr *ctx)
+{
+	return 3;
+}
+
+SEC("cgroup/bind6")
+__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]")
+int bind6_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 4;
+}
+
+SEC("cgroup/connect4")
+__success
+int connect4_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/connect4")
+__success
+int connect4_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/connect4")
+__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int connect4_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/connect6")
+__success
+int connect6_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/connect6")
+__success
+int connect6_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/connect6")
+__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int connect6_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+SEC("cgroup/connect_unix")
+__success
+int connect_unix_good_return_code_0(struct bpf_sock_addr *ctx)
+{
+	return 0;
+}
+
+SEC("cgroup/connect_unix")
+__success
+int connect_unix_good_return_code_1(struct bpf_sock_addr *ctx)
+{
+	return 1;
+}
+
+SEC("cgroup/connect_unix")
+__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]")
+int connect_unix_bad_return_code(struct bpf_sock_addr *ctx)
+{
+	return 2;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index 6f5d19665cf6..4a58e0398e72 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -6,6 +6,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
+#include <../../../tools/include/linux/filter.h>
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 
@@ -76,6 +77,94 @@ __naked int subprog_result_precise(void)
 	);
 }
 
+__naked __noinline __used
+static unsigned long fp_leaking_subprog()
+{
+	asm volatile (
+		".8byte %[r0_eq_r10_cast_s8];"
+		"exit;"
+		:: __imm_insn(r0_eq_r10_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_10, 8))
+	);
+}
+
+__naked __noinline __used
+static unsigned long sneaky_fp_leaking_subprog()
+{
+	asm volatile (
+		"r1 = r10;"
+		".8byte %[r0_eq_r1_cast_s8];"
+		"exit;"
+		:: __imm_insn(r0_eq_r1_cast_s8, BPF_MOVSX64_REG(BPF_REG_0, BPF_REG_1, 8))
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("6: (0f) r1 += r0")
+__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4")
+__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3")
+__msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit")
+__msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10")
+__msg("7: R0_w=scalar")
+__naked int fp_precise_subprog_result(void)
+{
+	asm volatile (
+		"call fp_leaking_subprog;"
+		/* use subprog's returned value (which is derived from r10=fp
+		 * register), as index into vals array, forcing all of that to
+		 * be known precisely
+		 */
+		"r0 &= 3;"
+		"r0 *= 4;"
+		"r1 = %[vals];"
+		/* force precision marking */
+		"r1 += r0;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("6: (0f) r1 += r0")
+__msg("mark_precise: frame0: last_idx 6 first_idx 0 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4")
+__msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3")
+__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit")
+__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1")
+/* here r1 is marked precise, even though it's fp register, but that's fine
+ * because by the time we get out of subprogram it has to be derived from r10
+ * anyways, at which point we'll break precision chain
+ */
+__msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10")
+__msg("7: R0_w=scalar")
+__naked int sneaky_fp_precise_subprog_result(void)
+{
+	asm volatile (
+		"call sneaky_fp_leaking_subprog;"
+		/* use subprog's returned value (which is derived from r10=fp
+		 * register), as index into vals array, forcing all of that to
+		 * be known precisely
+		 */
+		"r0 &= 3;"
+		"r0 *= 4;"
+		"r1 = %[vals];"
+		/* force precision marking */
+		"r1 += r0;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common
+	);
+}
+
 SEC("?raw_tp")
 __success __log_level(2)
 __msg("9: (0f) r1 += r0")
diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c
new file mode 100644
index 000000000000..49e712acbf60
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/wq.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Benjamin Tissoires
+ */
+
+#include "bpf_experimental.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "../bpf_testmod/bpf_testmod_kfunc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct hmap_elem {
+	int counter;
+	struct bpf_timer timer; /* unused */
+	struct bpf_spin_lock lock; /* unused */
+	struct bpf_wq work;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1000);
+	__type(key, int);
+	__type(value, struct hmap_elem);
+} hmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__uint(max_entries, 1000);
+	__type(key, int);
+	__type(value, struct hmap_elem);
+} hmap_malloc SEC(".maps");
+
+struct elem {
+	struct bpf_wq w;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 2);
+	__type(key, int);
+	__type(value, struct elem);
+} array SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, 4);
+	__type(key, int);
+	__type(value, struct elem);
+} lru SEC(".maps");
+
+__u32 ok;
+__u32 ok_sleepable;
+
+static int test_elem_callback(void *map, int *key,
+		int (callback_fn)(void *map, int *key, struct bpf_wq *wq))
+{
+	struct elem init = {}, *val;
+	struct bpf_wq *wq;
+
+	if ((ok & (1 << *key) ||
+	    (ok_sleepable & (1 << *key))))
+		return -22;
+
+	if (map == &lru &&
+	    bpf_map_update_elem(map, key, &init, 0))
+		return -1;
+
+	val = bpf_map_lookup_elem(map, key);
+	if (!val)
+		return -2;
+
+	wq = &val->w;
+	if (bpf_wq_init(wq, map, 0) != 0)
+		return -3;
+
+	if (bpf_wq_set_callback(wq, callback_fn, 0))
+		return -4;
+
+	if (bpf_wq_start(wq, 0))
+		return -5;
+
+	return 0;
+}
+
+static int test_hmap_elem_callback(void *map, int *key,
+		int (callback_fn)(void *map, int *key, struct bpf_wq *wq))
+{
+	struct hmap_elem init = {}, *val;
+	struct bpf_wq *wq;
+
+	if ((ok & (1 << *key) ||
+	    (ok_sleepable & (1 << *key))))
+		return -22;
+
+	if (bpf_map_update_elem(map, key, &init, 0))
+		return -1;
+
+	val = bpf_map_lookup_elem(map, key);
+	if (!val)
+		return -2;
+
+	wq = &val->work;
+	if (bpf_wq_init(wq, map, 0) != 0)
+		return -3;
+
+	if (bpf_wq_set_callback(wq, callback_fn, 0))
+		return -4;
+
+	if (bpf_wq_start(wq, 0))
+		return -5;
+
+	return 0;
+}
+
+/* callback for non sleepable workqueue */
+static int wq_callback(void *map, int *key, struct bpf_wq *work)
+{
+	bpf_kfunc_common_test();
+	ok |= (1 << *key);
+	return 0;
+}
+
+/* callback for sleepable workqueue */
+static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work)
+{
+	bpf_kfunc_call_test_sleepable();
+	ok_sleepable |= (1 << *key);
+	return 0;
+}
+
+SEC("tc")
+/* test that workqueues can be used from an array */
+__retval(0)
+long test_call_array_sleepable(void *ctx)
+{
+	int key = 0;
+
+	return test_elem_callback(&array, &key, wq_cb_sleepable);
+}
+
+SEC("syscall")
+/* Same test than above but from a sleepable context. */
+__retval(0)
+long test_syscall_array_sleepable(void *ctx)
+{
+	int key = 1;
+
+	return test_elem_callback(&array, &key, wq_cb_sleepable);
+}
+
+SEC("tc")
+/* test that workqueues can be used from a hashmap */
+__retval(0)
+long test_call_hash_sleepable(void *ctx)
+{
+	int key = 2;
+
+	return test_hmap_elem_callback(&hmap, &key, wq_callback);
+}
+
+SEC("tc")
+/* test that workqueues can be used from a hashmap with NO_PREALLOC. */
+__retval(0)
+long test_call_hash_malloc_sleepable(void *ctx)
+{
+	int key = 3;
+
+	return test_hmap_elem_callback(&hmap_malloc, &key, wq_callback);
+}
+
+SEC("tc")
+/* test that workqueues can be used from a LRU map */
+__retval(0)
+long test_call_lru_sleepable(void *ctx)
+{
+	int key = 4;
+
+	return test_elem_callback(&lru, &key, wq_callback);
+}
diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c
new file mode 100644
index 000000000000..4cbdb425f223
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/wq_failures.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Benjamin Tissoires
+ */
+
+#include "bpf_experimental.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "../bpf_testmod/bpf_testmod_kfunc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct elem {
+	struct bpf_wq w;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 2);
+	__type(key, int);
+	__type(value, struct elem);
+} array SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, 4);
+	__type(key, int);
+	__type(value, struct elem);
+} lru SEC(".maps");
+
+/* callback for non sleepable workqueue */
+static int wq_callback(void *map, int *key, struct bpf_wq *work)
+{
+	bpf_kfunc_common_test();
+	return 0;
+}
+
+/* callback for sleepable workqueue */
+static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work)
+{
+	bpf_kfunc_call_test_sleepable();
+	return 0;
+}
+
+SEC("tc")
+/* test that bpf_wq_init takes a map as a second argument
+ */
+__log_level(2)
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure
+__msg(": (85) call bpf_wq_init#") /* anchor message */
+__msg("pointer in R2 isn't map pointer")
+long test_wq_init_nomap(void *ctx)
+{
+	struct bpf_wq *wq;
+	struct elem *val;
+	int key = 0;
+
+	val = bpf_map_lookup_elem(&array, &key);
+	if (!val)
+		return -1;
+
+	wq = &val->w;
+	if (bpf_wq_init(wq, &key, 0) != 0)
+		return -3;
+
+	return 0;
+}
+
+SEC("tc")
+/* test that the workqueue is part of the map in bpf_wq_init
+ */
+__log_level(2)
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure
+__msg(": (85) call bpf_wq_init#") /* anchor message */
+__msg("workqueue pointer in R1 map_uid=0 doesn't match map pointer in R2 map_uid=0")
+long test_wq_init_wrong_map(void *ctx)
+{
+	struct bpf_wq *wq;
+	struct elem *val;
+	int key = 0;
+
+	val = bpf_map_lookup_elem(&array, &key);
+	if (!val)
+		return -1;
+
+	wq = &val->w;
+	if (bpf_wq_init(wq, &lru, 0) != 0)
+		return -3;
+
+	return 0;
+}
+
+SEC("?tc")
+__log_level(2)
+__failure
+/* check that the first argument of bpf_wq_set_callback()
+ * is a correct bpf_wq pointer.
+ */
+__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */
+__msg("arg#0 doesn't point to a map value")
+long test_wrong_wq_pointer(void *ctx)
+{
+	int key = 0;
+	struct bpf_wq *wq;
+
+	wq = bpf_map_lookup_elem(&array, &key);
+	if (!wq)
+		return 1;
+
+	if (bpf_wq_init(wq, &array, 0))
+		return 2;
+
+	if (bpf_wq_set_callback((void *)&wq, wq_callback, 0))
+		return 3;
+
+	return -22;
+}
+
+SEC("?tc")
+__log_level(2)
+__failure
+/* check that the first argument of bpf_wq_set_callback()
+ * is a correct bpf_wq pointer.
+ */
+__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */
+__msg("off 1 doesn't point to 'struct bpf_wq' that is at 0")
+long test_wrong_wq_pointer_offset(void *ctx)
+{
+	int key = 0;
+	struct bpf_wq *wq;
+
+	wq = bpf_map_lookup_elem(&array, &key);
+	if (!wq)
+		return 1;
+
+	if (bpf_wq_init(wq, &array, 0))
+		return 2;
+
+	if (bpf_wq_set_callback((void *)wq + 1, wq_cb_sleepable, 0))
+		return 3;
+
+	return -22;
+}
diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp
index f4936834f76f..dde0bb16e782 100644
--- a/tools/testing/selftests/bpf/test_cpp.cpp
+++ b/tools/testing/selftests/bpf/test_cpp.cpp
@@ -7,6 +7,7 @@
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include "test_core_extern.skel.h"
+#include "struct_ops_module.skel.h"
 
 template <typename T>
 class Skeleton {
@@ -98,6 +99,7 @@ int main(int argc, char *argv[])
 {
 	struct btf_dump_opts opts = { };
 	struct test_core_extern *skel;
+	struct struct_ops_module *skel2;
 	struct btf *btf;
 	int fd;
 
@@ -118,6 +120,9 @@ int main(int argc, char *argv[])
 	skel = test_core_extern__open_and_load();
 	test_core_extern__destroy(skel);
 
+	skel2 = struct_ops_module__open_and_load();
+	struct_ops_module__destroy(skel2);
+
 	fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
 	if (fd < 0)
 		std::cout << "FAILED to enable stats: " << fd << std::endl;
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
deleted file mode 100644
index 80c42583f597..000000000000
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ /dev/null
@@ -1,1434 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2018 Facebook
-
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <sys/types.h>
-#include <sys/select.h>
-#include <sys/socket.h>
-
-#include <linux/filter.h>
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "cgroup_helpers.h"
-#include "testing_helpers.h"
-#include "bpf_util.h"
-
-#ifndef ENOTSUPP
-# define ENOTSUPP 524
-#endif
-
-#define CG_PATH	"/foo"
-#define CONNECT4_PROG_PATH	"./connect4_prog.bpf.o"
-#define CONNECT6_PROG_PATH	"./connect6_prog.bpf.o"
-#define SENDMSG4_PROG_PATH	"./sendmsg4_prog.bpf.o"
-#define SENDMSG6_PROG_PATH	"./sendmsg6_prog.bpf.o"
-#define RECVMSG4_PROG_PATH	"./recvmsg4_prog.bpf.o"
-#define RECVMSG6_PROG_PATH	"./recvmsg6_prog.bpf.o"
-#define BIND4_PROG_PATH		"./bind4_prog.bpf.o"
-#define BIND6_PROG_PATH		"./bind6_prog.bpf.o"
-
-#define SERV4_IP		"192.168.1.254"
-#define SERV4_REWRITE_IP	"127.0.0.1"
-#define SRC4_IP			"172.16.0.1"
-#define SRC4_REWRITE_IP		"127.0.0.4"
-#define SERV4_PORT		4040
-#define SERV4_REWRITE_PORT	4444
-
-#define SERV6_IP		"face:b00c:1234:5678::abcd"
-#define SERV6_REWRITE_IP	"::1"
-#define SERV6_V4MAPPED_IP	"::ffff:192.168.0.4"
-#define SRC6_IP			"::1"
-#define SRC6_REWRITE_IP		"::6"
-#define WILDCARD6_IP		"::"
-#define SERV6_PORT		6060
-#define SERV6_REWRITE_PORT	6666
-
-#define INET_NTOP_BUF	40
-
-struct sock_addr_test;
-
-typedef int (*load_fn)(const struct sock_addr_test *test);
-typedef int (*info_fn)(int, struct sockaddr *, socklen_t *);
-
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
-
-struct sock_addr_test {
-	const char *descr;
-	/* BPF prog properties */
-	load_fn loadfn;
-	enum bpf_attach_type expected_attach_type;
-	enum bpf_attach_type attach_type;
-	/* Socket properties */
-	int domain;
-	int type;
-	/* IP:port pairs for BPF prog to override */
-	const char *requested_ip;
-	unsigned short requested_port;
-	const char *expected_ip;
-	unsigned short expected_port;
-	const char *expected_src_ip;
-	/* Expected test result */
-	enum {
-		LOAD_REJECT,
-		ATTACH_REJECT,
-		ATTACH_OKAY,
-		SYSCALL_EPERM,
-		SYSCALL_ENOTSUPP,
-		SUCCESS,
-	} expected_result;
-};
-
-static int bind4_prog_load(const struct sock_addr_test *test);
-static int bind6_prog_load(const struct sock_addr_test *test);
-static int connect4_prog_load(const struct sock_addr_test *test);
-static int connect6_prog_load(const struct sock_addr_test *test);
-static int sendmsg_allow_prog_load(const struct sock_addr_test *test);
-static int sendmsg_deny_prog_load(const struct sock_addr_test *test);
-static int recvmsg_allow_prog_load(const struct sock_addr_test *test);
-static int recvmsg_deny_prog_load(const struct sock_addr_test *test);
-static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test);
-static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test);
-static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test);
-static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test);
-static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test);
-static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test);
-static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test);
-static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test);
-
-static struct sock_addr_test tests[] = {
-	/* bind */
-	{
-		"bind4: load prog with wrong expected attach type",
-		bind4_prog_load,
-		BPF_CGROUP_INET6_BIND,
-		BPF_CGROUP_INET4_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"bind4: attach prog with wrong attach type",
-		bind4_prog_load,
-		BPF_CGROUP_INET4_BIND,
-		BPF_CGROUP_INET6_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_REJECT,
-	},
-	{
-		"bind4: rewrite IP & TCP port in",
-		bind4_prog_load,
-		BPF_CGROUP_INET4_BIND,
-		BPF_CGROUP_INET4_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		NULL,
-		SUCCESS,
-	},
-	{
-		"bind4: rewrite IP & UDP port in",
-		bind4_prog_load,
-		BPF_CGROUP_INET4_BIND,
-		BPF_CGROUP_INET4_BIND,
-		AF_INET,
-		SOCK_DGRAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		NULL,
-		SUCCESS,
-	},
-	{
-		"bind6: load prog with wrong expected attach type",
-		bind6_prog_load,
-		BPF_CGROUP_INET4_BIND,
-		BPF_CGROUP_INET6_BIND,
-		AF_INET6,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"bind6: attach prog with wrong attach type",
-		bind6_prog_load,
-		BPF_CGROUP_INET6_BIND,
-		BPF_CGROUP_INET4_BIND,
-		AF_INET,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_REJECT,
-	},
-	{
-		"bind6: rewrite IP & TCP port in",
-		bind6_prog_load,
-		BPF_CGROUP_INET6_BIND,
-		BPF_CGROUP_INET6_BIND,
-		AF_INET6,
-		SOCK_STREAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		NULL,
-		SUCCESS,
-	},
-	{
-		"bind6: rewrite IP & UDP port in",
-		bind6_prog_load,
-		BPF_CGROUP_INET6_BIND,
-		BPF_CGROUP_INET6_BIND,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		NULL,
-		SUCCESS,
-	},
-
-	/* connect */
-	{
-		"connect4: load prog with wrong expected attach type",
-		connect4_prog_load,
-		BPF_CGROUP_INET6_CONNECT,
-		BPF_CGROUP_INET4_CONNECT,
-		AF_INET,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"connect4: attach prog with wrong attach type",
-		connect4_prog_load,
-		BPF_CGROUP_INET4_CONNECT,
-		BPF_CGROUP_INET6_CONNECT,
-		AF_INET,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_REJECT,
-	},
-	{
-		"connect4: rewrite IP & TCP port",
-		connect4_prog_load,
-		BPF_CGROUP_INET4_CONNECT,
-		BPF_CGROUP_INET4_CONNECT,
-		AF_INET,
-		SOCK_STREAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SRC4_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"connect4: rewrite IP & UDP port",
-		connect4_prog_load,
-		BPF_CGROUP_INET4_CONNECT,
-		BPF_CGROUP_INET4_CONNECT,
-		AF_INET,
-		SOCK_DGRAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SRC4_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"connect6: load prog with wrong expected attach type",
-		connect6_prog_load,
-		BPF_CGROUP_INET4_CONNECT,
-		BPF_CGROUP_INET6_CONNECT,
-		AF_INET6,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"connect6: attach prog with wrong attach type",
-		connect6_prog_load,
-		BPF_CGROUP_INET6_CONNECT,
-		BPF_CGROUP_INET4_CONNECT,
-		AF_INET,
-		SOCK_STREAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_REJECT,
-	},
-	{
-		"connect6: rewrite IP & TCP port",
-		connect6_prog_load,
-		BPF_CGROUP_INET6_CONNECT,
-		BPF_CGROUP_INET6_CONNECT,
-		AF_INET6,
-		SOCK_STREAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"connect6: rewrite IP & UDP port",
-		connect6_prog_load,
-		BPF_CGROUP_INET6_CONNECT,
-		BPF_CGROUP_INET6_CONNECT,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SUCCESS,
-	},
-
-	/* sendmsg */
-	{
-		"sendmsg4: load prog with wrong expected attach type",
-		sendmsg4_rw_asm_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP4_SENDMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"sendmsg4: attach prog with wrong attach type",
-		sendmsg4_rw_asm_prog_load,
-		BPF_CGROUP_UDP4_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_REJECT,
-	},
-	{
-		"sendmsg4: rewrite IP & port (asm)",
-		sendmsg4_rw_asm_prog_load,
-		BPF_CGROUP_UDP4_SENDMSG,
-		BPF_CGROUP_UDP4_SENDMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SRC4_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"sendmsg4: rewrite IP & port (C)",
-		sendmsg4_rw_c_prog_load,
-		BPF_CGROUP_UDP4_SENDMSG,
-		BPF_CGROUP_UDP4_SENDMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SRC4_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"sendmsg4: deny call",
-		sendmsg_deny_prog_load,
-		BPF_CGROUP_UDP4_SENDMSG,
-		BPF_CGROUP_UDP4_SENDMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		SERV4_IP,
-		SERV4_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SRC4_REWRITE_IP,
-		SYSCALL_EPERM,
-	},
-	{
-		"sendmsg6: load prog with wrong expected attach type",
-		sendmsg6_rw_asm_prog_load,
-		BPF_CGROUP_UDP4_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"sendmsg6: attach prog with wrong attach type",
-		sendmsg6_rw_asm_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP4_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_REJECT,
-	},
-	{
-		"sendmsg6: rewrite IP & port (asm)",
-		sendmsg6_rw_asm_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"sendmsg6: rewrite IP & port (C)",
-		sendmsg6_rw_c_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"sendmsg6: IPv4-mapped IPv6",
-		sendmsg6_rw_v4mapped_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SYSCALL_ENOTSUPP,
-	},
-	{
-		"sendmsg6: set dst IP = [::] (BSD'ism)",
-		sendmsg6_rw_wildcard_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SUCCESS,
-	},
-	{
-		"sendmsg6: preserve dst IP = [::] (BSD'ism)",
-		sendmsg_allow_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		WILDCARD6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_PORT,
-		SRC6_IP,
-		SUCCESS,
-	},
-	{
-		"sendmsg6: deny call",
-		sendmsg_deny_prog_load,
-		BPF_CGROUP_UDP6_SENDMSG,
-		BPF_CGROUP_UDP6_SENDMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_IP,
-		SERV6_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SRC6_REWRITE_IP,
-		SYSCALL_EPERM,
-	},
-
-	/* recvmsg */
-	{
-		"recvmsg4: return code ok",
-		recvmsg_allow_prog_load,
-		BPF_CGROUP_UDP4_RECVMSG,
-		BPF_CGROUP_UDP4_RECVMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_OKAY,
-	},
-	{
-		"recvmsg4: return code !ok",
-		recvmsg_deny_prog_load,
-		BPF_CGROUP_UDP4_RECVMSG,
-		BPF_CGROUP_UDP4_RECVMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"recvmsg6: return code ok",
-		recvmsg_allow_prog_load,
-		BPF_CGROUP_UDP6_RECVMSG,
-		BPF_CGROUP_UDP6_RECVMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		ATTACH_OKAY,
-	},
-	{
-		"recvmsg6: return code !ok",
-		recvmsg_deny_prog_load,
-		BPF_CGROUP_UDP6_RECVMSG,
-		BPF_CGROUP_UDP6_RECVMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		NULL,
-		0,
-		NULL,
-		0,
-		NULL,
-		LOAD_REJECT,
-	},
-	{
-		"recvmsg4: rewrite IP & port (C)",
-		recvmsg4_rw_c_prog_load,
-		BPF_CGROUP_UDP4_RECVMSG,
-		BPF_CGROUP_UDP4_RECVMSG,
-		AF_INET,
-		SOCK_DGRAM,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SERV4_REWRITE_IP,
-		SERV4_REWRITE_PORT,
-		SERV4_IP,
-		SUCCESS,
-	},
-	{
-		"recvmsg6: rewrite IP & port (C)",
-		recvmsg6_rw_c_prog_load,
-		BPF_CGROUP_UDP6_RECVMSG,
-		BPF_CGROUP_UDP6_RECVMSG,
-		AF_INET6,
-		SOCK_DGRAM,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SERV6_REWRITE_IP,
-		SERV6_REWRITE_PORT,
-		SERV6_IP,
-		SUCCESS,
-	},
-};
-
-static int mk_sockaddr(int domain, const char *ip, unsigned short port,
-		       struct sockaddr *addr, socklen_t addr_len)
-{
-	struct sockaddr_in6 *addr6;
-	struct sockaddr_in *addr4;
-
-	if (domain != AF_INET && domain != AF_INET6) {
-		log_err("Unsupported address family");
-		return -1;
-	}
-
-	memset(addr, 0, addr_len);
-
-	if (domain == AF_INET) {
-		if (addr_len < sizeof(struct sockaddr_in))
-			return -1;
-		addr4 = (struct sockaddr_in *)addr;
-		addr4->sin_family = domain;
-		addr4->sin_port = htons(port);
-		if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) {
-			log_err("Invalid IPv4: %s", ip);
-			return -1;
-		}
-	} else if (domain == AF_INET6) {
-		if (addr_len < sizeof(struct sockaddr_in6))
-			return -1;
-		addr6 = (struct sockaddr_in6 *)addr;
-		addr6->sin6_family = domain;
-		addr6->sin6_port = htons(port);
-		if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) {
-			log_err("Invalid IPv6: %s", ip);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-static int load_insns(const struct sock_addr_test *test,
-		      const struct bpf_insn *insns, size_t insns_cnt)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, opts);
-	int ret;
-
-	opts.expected_attach_type = test->expected_attach_type;
-	opts.log_buf = bpf_log_buf;
-	opts.log_size = BPF_LOG_BUF_SIZE;
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, NULL, "GPL", insns, insns_cnt, &opts);
-	if (ret < 0 && test->expected_result != LOAD_REJECT) {
-		log_err(">>> Loading program error.\n"
-			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
-	}
-
-	return ret;
-}
-
-static int load_path(const struct sock_addr_test *test, const char *path)
-{
-	struct bpf_object *obj;
-	struct bpf_program *prog;
-	int err;
-
-	obj = bpf_object__open_file(path, NULL);
-	err = libbpf_get_error(obj);
-	if (err) {
-		log_err(">>> Opening BPF object (%s) error.\n", path);
-		return -1;
-	}
-
-	prog = bpf_object__next_program(obj, NULL);
-	if (!prog)
-		goto err_out;
-
-	bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR);
-	bpf_program__set_expected_attach_type(prog, test->expected_attach_type);
-	bpf_program__set_flags(prog, testing_prog_flags());
-
-	err = bpf_object__load(obj);
-	if (err) {
-		if (test->expected_result != LOAD_REJECT)
-			log_err(">>> Loading program (%s) error.\n", path);
-		goto err_out;
-	}
-
-	return bpf_program__fd(prog);
-err_out:
-	bpf_object__close(obj);
-	return -1;
-}
-
-static int bind4_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, BIND4_PROG_PATH);
-}
-
-static int bind6_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, BIND6_PROG_PATH);
-}
-
-static int connect4_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, CONNECT4_PROG_PATH);
-}
-
-static int connect6_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, CONNECT6_PROG_PATH);
-}
-
-static int xmsg_ret_only_prog_load(const struct sock_addr_test *test,
-				   int32_t rc)
-{
-	struct bpf_insn insns[] = {
-		/* return rc */
-		BPF_MOV64_IMM(BPF_REG_0, rc),
-		BPF_EXIT_INSN(),
-	};
-	return load_insns(test, insns, ARRAY_SIZE(insns));
-}
-
-static int sendmsg_allow_prog_load(const struct sock_addr_test *test)
-{
-	return xmsg_ret_only_prog_load(test, /*rc*/ 1);
-}
-
-static int sendmsg_deny_prog_load(const struct sock_addr_test *test)
-{
-	return xmsg_ret_only_prog_load(test, /*rc*/ 0);
-}
-
-static int recvmsg_allow_prog_load(const struct sock_addr_test *test)
-{
-	return xmsg_ret_only_prog_load(test, /*rc*/ 1);
-}
-
-static int recvmsg_deny_prog_load(const struct sock_addr_test *test)
-{
-	return xmsg_ret_only_prog_load(test, /*rc*/ 0);
-}
-
-static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test)
-{
-	struct sockaddr_in dst4_rw_addr;
-	struct in_addr src4_rw_ip;
-
-	if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) {
-		log_err("Invalid IPv4: %s", SRC4_REWRITE_IP);
-		return -1;
-	}
-
-	if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT,
-			(struct sockaddr *)&dst4_rw_addr,
-			sizeof(dst4_rw_addr)) == -1)
-		return -1;
-
-	struct bpf_insn insns[] = {
-		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-		/* if (sk.family == AF_INET && */
-		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-			    offsetof(struct bpf_sock_addr, family)),
-		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8),
-
-		/*     sk.type == SOCK_DGRAM)  { */
-		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-			    offsetof(struct bpf_sock_addr, type)),
-		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6),
-
-		/*      msg_src_ip4 = src4_rw_ip */
-		BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr),
-		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
-			    offsetof(struct bpf_sock_addr, msg_src_ip4)),
-
-		/*      user_ip4 = dst4_rw_addr.sin_addr */
-		BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr),
-		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
-			    offsetof(struct bpf_sock_addr, user_ip4)),
-
-		/*      user_port = dst4_rw_addr.sin_port */
-		BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port),
-		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
-			    offsetof(struct bpf_sock_addr, user_port)),
-		/* } */
-
-		/* return 1 */
-		BPF_MOV64_IMM(BPF_REG_0, 1),
-		BPF_EXIT_INSN(),
-	};
-
-	return load_insns(test, insns, ARRAY_SIZE(insns));
-}
-
-static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, RECVMSG4_PROG_PATH);
-}
-
-static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, SENDMSG4_PROG_PATH);
-}
-
-static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test,
-					 const char *rw_dst_ip)
-{
-	struct sockaddr_in6 dst6_rw_addr;
-	struct in6_addr src6_rw_ip;
-
-	if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) {
-		log_err("Invalid IPv6: %s", SRC6_REWRITE_IP);
-		return -1;
-	}
-
-	if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT,
-			(struct sockaddr *)&dst6_rw_addr,
-			sizeof(dst6_rw_addr)) == -1)
-		return -1;
-
-	struct bpf_insn insns[] = {
-		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
-
-		/* if (sk.family == AF_INET6) { */
-		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
-			    offsetof(struct bpf_sock_addr, family)),
-		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18),
-
-#define STORE_IPV6_WORD_N(DST, SRC, N)					       \
-		BPF_MOV32_IMM(BPF_REG_7, SRC[N]),			       \
-		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,		       \
-			    offsetof(struct bpf_sock_addr, DST[N]))
-
-#define STORE_IPV6(DST, SRC)						       \
-		STORE_IPV6_WORD_N(DST, SRC, 0),				       \
-		STORE_IPV6_WORD_N(DST, SRC, 1),				       \
-		STORE_IPV6_WORD_N(DST, SRC, 2),				       \
-		STORE_IPV6_WORD_N(DST, SRC, 3)
-
-		STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32),
-		STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32),
-
-		/*      user_port = dst6_rw_addr.sin6_port */
-		BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port),
-		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
-			    offsetof(struct bpf_sock_addr, user_port)),
-
-		/* } */
-
-		/* return 1 */
-		BPF_MOV64_IMM(BPF_REG_0, 1),
-		BPF_EXIT_INSN(),
-	};
-
-	return load_insns(test, insns, ARRAY_SIZE(insns));
-}
-
-static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test)
-{
-	return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP);
-}
-
-static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, RECVMSG6_PROG_PATH);
-}
-
-static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test)
-{
-	return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP);
-}
-
-static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test)
-{
-	return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP);
-}
-
-static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test)
-{
-	return load_path(test, SENDMSG6_PROG_PATH);
-}
-
-static int cmp_addr(const struct sockaddr_storage *addr1,
-		    const struct sockaddr_storage *addr2, int cmp_port)
-{
-	const struct sockaddr_in *four1, *four2;
-	const struct sockaddr_in6 *six1, *six2;
-
-	if (addr1->ss_family != addr2->ss_family)
-		return -1;
-
-	if (addr1->ss_family == AF_INET) {
-		four1 = (const struct sockaddr_in *)addr1;
-		four2 = (const struct sockaddr_in *)addr2;
-		return !((four1->sin_port == four2->sin_port || !cmp_port) &&
-			 four1->sin_addr.s_addr == four2->sin_addr.s_addr);
-	} else if (addr1->ss_family == AF_INET6) {
-		six1 = (const struct sockaddr_in6 *)addr1;
-		six2 = (const struct sockaddr_in6 *)addr2;
-		return !((six1->sin6_port == six2->sin6_port || !cmp_port) &&
-			 !memcmp(&six1->sin6_addr, &six2->sin6_addr,
-				 sizeof(struct in6_addr)));
-	}
-
-	return -1;
-}
-
-static int cmp_sock_addr(info_fn fn, int sock1,
-			 const struct sockaddr_storage *addr2, int cmp_port)
-{
-	struct sockaddr_storage addr1;
-	socklen_t len1 = sizeof(addr1);
-
-	memset(&addr1, 0, len1);
-	if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0)
-		return -1;
-
-	return cmp_addr(&addr1, addr2, cmp_port);
-}
-
-static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2)
-{
-	return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0);
-}
-
-static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2)
-{
-	return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1);
-}
-
-static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2)
-{
-	return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1);
-}
-
-static int start_server(int type, const struct sockaddr_storage *addr,
-			socklen_t addr_len)
-{
-	int fd;
-
-	fd = socket(addr->ss_family, type, 0);
-	if (fd == -1) {
-		log_err("Failed to create server socket");
-		goto out;
-	}
-
-	if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) {
-		log_err("Failed to bind server socket");
-		goto close_out;
-	}
-
-	if (type == SOCK_STREAM) {
-		if (listen(fd, 128) == -1) {
-			log_err("Failed to listen on server socket");
-			goto close_out;
-		}
-	}
-
-	goto out;
-close_out:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
-
-static int connect_to_server(int type, const struct sockaddr_storage *addr,
-			     socklen_t addr_len)
-{
-	int domain;
-	int fd = -1;
-
-	domain = addr->ss_family;
-
-	if (domain != AF_INET && domain != AF_INET6) {
-		log_err("Unsupported address family");
-		goto err;
-	}
-
-	fd = socket(domain, type, 0);
-	if (fd == -1) {
-		log_err("Failed to create client socket");
-		goto err;
-	}
-
-	if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) {
-		log_err("Fail to connect to server");
-		goto err;
-	}
-
-	goto out;
-err:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
-
-int init_pktinfo(int domain, struct cmsghdr *cmsg)
-{
-	struct in6_pktinfo *pktinfo6;
-	struct in_pktinfo *pktinfo4;
-
-	if (domain == AF_INET) {
-		cmsg->cmsg_level = SOL_IP;
-		cmsg->cmsg_type = IP_PKTINFO;
-		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
-		pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg);
-		memset(pktinfo4, 0, sizeof(struct in_pktinfo));
-		if (inet_pton(domain, SRC4_IP,
-			      (void *)&pktinfo4->ipi_spec_dst) != 1)
-			return -1;
-	} else if (domain == AF_INET6) {
-		cmsg->cmsg_level = SOL_IPV6;
-		cmsg->cmsg_type = IPV6_PKTINFO;
-		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
-		pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg);
-		memset(pktinfo6, 0, sizeof(struct in6_pktinfo));
-		if (inet_pton(domain, SRC6_IP,
-			      (void *)&pktinfo6->ipi6_addr) != 1)
-			return -1;
-	} else {
-		return -1;
-	}
-
-	return 0;
-}
-
-static int sendmsg_to_server(int type, const struct sockaddr_storage *addr,
-			     socklen_t addr_len, int set_cmsg, int flags,
-			     int *syscall_err)
-{
-	union {
-		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
-		struct cmsghdr align;
-	} control6;
-	union {
-		char buf[CMSG_SPACE(sizeof(struct in_pktinfo))];
-		struct cmsghdr align;
-	} control4;
-	struct msghdr hdr;
-	struct iovec iov;
-	char data = 'a';
-	int domain;
-	int fd = -1;
-
-	domain = addr->ss_family;
-
-	if (domain != AF_INET && domain != AF_INET6) {
-		log_err("Unsupported address family");
-		goto err;
-	}
-
-	fd = socket(domain, type, 0);
-	if (fd == -1) {
-		log_err("Failed to create client socket");
-		goto err;
-	}
-
-	memset(&iov, 0, sizeof(iov));
-	iov.iov_base = &data;
-	iov.iov_len = sizeof(data);
-
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.msg_name = (void *)addr;
-	hdr.msg_namelen = addr_len;
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-
-	if (set_cmsg) {
-		if (domain == AF_INET) {
-			hdr.msg_control = &control4;
-			hdr.msg_controllen = sizeof(control4.buf);
-		} else if (domain == AF_INET6) {
-			hdr.msg_control = &control6;
-			hdr.msg_controllen = sizeof(control6.buf);
-		}
-		if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) {
-			log_err("Fail to init pktinfo");
-			goto err;
-		}
-	}
-
-	if (sendmsg(fd, &hdr, flags) != sizeof(data)) {
-		log_err("Fail to send message to server");
-		*syscall_err = errno;
-		goto err;
-	}
-
-	goto out;
-err:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
-
-static int fastconnect_to_server(const struct sockaddr_storage *addr,
-				 socklen_t addr_len)
-{
-	int sendmsg_err;
-
-	return sendmsg_to_server(SOCK_STREAM, addr, addr_len, /*set_cmsg*/0,
-				 MSG_FASTOPEN, &sendmsg_err);
-}
-
-static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr)
-{
-	struct timeval tv;
-	struct msghdr hdr;
-	struct iovec iov;
-	char data[64];
-	fd_set rfds;
-
-	FD_ZERO(&rfds);
-	FD_SET(sockfd, &rfds);
-
-	tv.tv_sec = 2;
-	tv.tv_usec = 0;
-
-	if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 ||
-	    !FD_ISSET(sockfd, &rfds))
-		return -1;
-
-	memset(&iov, 0, sizeof(iov));
-	iov.iov_base = data;
-	iov.iov_len = sizeof(data);
-
-	memset(&hdr, 0, sizeof(hdr));
-	hdr.msg_name = src_addr;
-	hdr.msg_namelen = sizeof(struct sockaddr_storage);
-	hdr.msg_iov = &iov;
-	hdr.msg_iovlen = 1;
-
-	return recvmsg(sockfd, &hdr, 0);
-}
-
-static int init_addrs(const struct sock_addr_test *test,
-		      struct sockaddr_storage *requested_addr,
-		      struct sockaddr_storage *expected_addr,
-		      struct sockaddr_storage *expected_src_addr)
-{
-	socklen_t addr_len = sizeof(struct sockaddr_storage);
-
-	if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port,
-			(struct sockaddr *)expected_addr, addr_len) == -1)
-		goto err;
-
-	if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port,
-			(struct sockaddr *)requested_addr, addr_len) == -1)
-		goto err;
-
-	if (test->expected_src_ip &&
-	    mk_sockaddr(test->domain, test->expected_src_ip, 0,
-			(struct sockaddr *)expected_src_addr, addr_len) == -1)
-		goto err;
-
-	return 0;
-err:
-	return -1;
-}
-
-static int run_bind_test_case(const struct sock_addr_test *test)
-{
-	socklen_t addr_len = sizeof(struct sockaddr_storage);
-	struct sockaddr_storage requested_addr;
-	struct sockaddr_storage expected_addr;
-	int clientfd = -1;
-	int servfd = -1;
-	int err = 0;
-
-	if (init_addrs(test, &requested_addr, &expected_addr, NULL))
-		goto err;
-
-	servfd = start_server(test->type, &requested_addr, addr_len);
-	if (servfd == -1)
-		goto err;
-
-	if (cmp_local_addr(servfd, &expected_addr))
-		goto err;
-
-	/* Try to connect to server just in case */
-	clientfd = connect_to_server(test->type, &expected_addr, addr_len);
-	if (clientfd == -1)
-		goto err;
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(clientfd);
-	close(servfd);
-	return err;
-}
-
-static int run_connect_test_case(const struct sock_addr_test *test)
-{
-	socklen_t addr_len = sizeof(struct sockaddr_storage);
-	struct sockaddr_storage expected_src_addr;
-	struct sockaddr_storage requested_addr;
-	struct sockaddr_storage expected_addr;
-	int clientfd = -1;
-	int servfd = -1;
-	int err = 0;
-
-	if (init_addrs(test, &requested_addr, &expected_addr,
-		       &expected_src_addr))
-		goto err;
-
-	/* Prepare server to connect to */
-	servfd = start_server(test->type, &expected_addr, addr_len);
-	if (servfd == -1)
-		goto err;
-
-	clientfd = connect_to_server(test->type, &requested_addr, addr_len);
-	if (clientfd == -1)
-		goto err;
-
-	/* Make sure src and dst addrs were overridden properly */
-	if (cmp_peer_addr(clientfd, &expected_addr))
-		goto err;
-
-	if (cmp_local_ip(clientfd, &expected_src_addr))
-		goto err;
-
-	if (test->type == SOCK_STREAM) {
-		/* Test TCP Fast Open scenario */
-		clientfd = fastconnect_to_server(&requested_addr, addr_len);
-		if (clientfd == -1)
-			goto err;
-
-		/* Make sure src and dst addrs were overridden properly */
-		if (cmp_peer_addr(clientfd, &expected_addr))
-			goto err;
-
-		if (cmp_local_ip(clientfd, &expected_src_addr))
-			goto err;
-	}
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(clientfd);
-	close(servfd);
-	return err;
-}
-
-static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg)
-{
-	socklen_t addr_len = sizeof(struct sockaddr_storage);
-	struct sockaddr_storage expected_addr;
-	struct sockaddr_storage server_addr;
-	struct sockaddr_storage sendmsg_addr;
-	struct sockaddr_storage recvmsg_addr;
-	int clientfd = -1;
-	int servfd = -1;
-	int set_cmsg;
-	int err = 0;
-
-	if (test->type != SOCK_DGRAM)
-		goto err;
-
-	if (init_addrs(test, &sendmsg_addr, &server_addr, &expected_addr))
-		goto err;
-
-	/* Prepare server to sendmsg to */
-	servfd = start_server(test->type, &server_addr, addr_len);
-	if (servfd == -1)
-		goto err;
-
-	for (set_cmsg = 0; set_cmsg <= max_cmsg; ++set_cmsg) {
-		if (clientfd >= 0)
-			close(clientfd);
-
-		clientfd = sendmsg_to_server(test->type, &sendmsg_addr,
-					     addr_len, set_cmsg, /*flags*/0,
-					     &err);
-		if (err)
-			goto out;
-		else if (clientfd == -1)
-			goto err;
-
-		/* Try to receive message on server instead of using
-		 * getpeername(2) on client socket, to check that client's
-		 * destination address was rewritten properly, since
-		 * getpeername(2) doesn't work with unconnected datagram
-		 * sockets.
-		 *
-		 * Get source address from recvmsg(2) as well to make sure
-		 * source was rewritten properly: getsockname(2) can't be used
-		 * since socket is unconnected and source defined for one
-		 * specific packet may differ from the one used by default and
-		 * returned by getsockname(2).
-		 */
-		if (recvmsg_from_client(servfd, &recvmsg_addr) == -1)
-			goto err;
-
-		if (cmp_addr(&recvmsg_addr, &expected_addr, /*cmp_port*/0))
-			goto err;
-	}
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(clientfd);
-	close(servfd);
-	return err;
-}
-
-static int run_test_case(int cgfd, const struct sock_addr_test *test)
-{
-	int progfd = -1;
-	int err = 0;
-
-	printf("Test case: %s .. ", test->descr);
-
-	progfd = test->loadfn(test);
-	if (test->expected_result == LOAD_REJECT && progfd < 0)
-		goto out;
-	else if (test->expected_result == LOAD_REJECT || progfd < 0)
-		goto err;
-
-	err = bpf_prog_attach(progfd, cgfd, test->attach_type,
-			      BPF_F_ALLOW_OVERRIDE);
-	if (test->expected_result == ATTACH_REJECT && err) {
-		err = 0; /* error was expected, reset it */
-		goto out;
-	} else if (test->expected_result == ATTACH_REJECT || err) {
-		goto err;
-	} else if (test->expected_result == ATTACH_OKAY) {
-		err = 0;
-		goto out;
-	}
-
-	switch (test->attach_type) {
-	case BPF_CGROUP_INET4_BIND:
-	case BPF_CGROUP_INET6_BIND:
-		err = run_bind_test_case(test);
-		break;
-	case BPF_CGROUP_INET4_CONNECT:
-	case BPF_CGROUP_INET6_CONNECT:
-		err = run_connect_test_case(test);
-		break;
-	case BPF_CGROUP_UDP4_SENDMSG:
-	case BPF_CGROUP_UDP6_SENDMSG:
-		err = run_xmsg_test_case(test, 1);
-		break;
-	case BPF_CGROUP_UDP4_RECVMSG:
-	case BPF_CGROUP_UDP6_RECVMSG:
-		err = run_xmsg_test_case(test, 0);
-		break;
-	default:
-		goto err;
-	}
-
-	if (test->expected_result == SYSCALL_EPERM && err == EPERM) {
-		err = 0; /* error was expected, reset it */
-		goto out;
-	}
-
-	if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) {
-		err = 0; /* error was expected, reset it */
-		goto out;
-	}
-
-	if (err || test->expected_result != SUCCESS)
-		goto err;
-
-	goto out;
-err:
-	err = -1;
-out:
-	/* Detaching w/o checking return code: best effort attempt. */
-	if (progfd != -1)
-		bpf_prog_detach(cgfd, test->attach_type);
-	close(progfd);
-	printf("[%s]\n", err ? "FAIL" : "PASS");
-	return err;
-}
-
-static int run_tests(int cgfd)
-{
-	int passes = 0;
-	int fails = 0;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
-		if (run_test_case(cgfd, &tests[i]))
-			++fails;
-		else
-			++passes;
-	}
-	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
-	return fails ? -1 : 0;
-}
-
-int main(int argc, char **argv)
-{
-	int cgfd = -1;
-	int err = 0;
-
-	if (argc < 2) {
-		fprintf(stderr,
-			"%s has to be run via %s.sh. Skip direct run.\n",
-			argv[0], argv[0]);
-		exit(err);
-	}
-
-	cgfd = cgroup_setup_and_join(CG_PATH);
-	if (cgfd < 0)
-		goto err;
-
-	/* Use libbpf 1.0 API mode */
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	if (run_tests(cgfd))
-		goto err;
-
-	goto out;
-err:
-	err = -1;
-out:
-	close(cgfd);
-	cleanup_cgroup_environment();
-	return err;
-}
diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh
deleted file mode 100755
index 3b9fdb8094aa..000000000000
--- a/tools/testing/selftests/bpf/test_sock_addr.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/sh
-
-set -eu
-
-ping_once()
-{
-	type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}"
-	$PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1
-}
-
-wait_for_ip()
-{
-	local _i
-	echo -n "Wait for testing IPv4/IPv6 to become available "
-	for _i in $(seq ${MAX_PING_TRIES}); do
-		echo -n "."
-		if ping_once 4 ${TEST_IPv4} && ping_once 6 ${TEST_IPv6}; then
-			echo " OK"
-			return
-		fi
-	done
-	echo 1>&2 "ERROR: Timeout waiting for test IP to become available."
-	exit 1
-}
-
-setup()
-{
-	# Create testing interfaces not to interfere with current environment.
-	ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER}
-	ip link set ${TEST_IF} up
-	ip link set ${TEST_IF_PEER} up
-
-	ip -4 addr add ${TEST_IPv4} dev ${TEST_IF}
-	ip -6 addr add ${TEST_IPv6} dev ${TEST_IF}
-	wait_for_ip
-}
-
-cleanup()
-{
-	ip link del ${TEST_IF} 2>/dev/null || :
-	ip link del ${TEST_IF_PEER} 2>/dev/null || :
-}
-
-main()
-{
-	trap cleanup EXIT 2 3 6 15
-	setup
-	./test_sock_addr setup_done
-}
-
-BASENAME=$(basename $0 .sh)
-TEST_IF="${BASENAME}1"
-TEST_IF_PEER="${BASENAME}2"
-TEST_IPv4="127.0.0.4/8"
-TEST_IPv6="::6/128"
-MAX_PING_TRIES=5
-
-main
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 024a0faafb3b..92752f5eeded 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1887,10 +1887,13 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt)
 	while (entry) {
 		if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
 		    strstr(opt->map, entry) != 0 ||
-		    strstr(t->title, entry) != 0)
+		    strstr(t->title, entry) != 0) {
+			free(ptr);
 			return 0;
+		}
 		entry = strtok(NULL, ",");
 	}
+	free(ptr);
 	return -EINVAL;
 }
 
@@ -1907,10 +1910,13 @@ static int check_blacklist(struct _test *t, struct sockmap_options *opt)
 	while (entry) {
 		if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
 		    strstr(opt->map, entry) != 0 ||
-		    strstr(t->title, entry) != 0)
+		    strstr(t->title, entry) != 0) {
+			free(ptr);
 			return 0;
+		}
 		entry = strtok(NULL, ",");
 	}
+	free(ptr);
 	return -EINVAL;
 }
 
@@ -2104,9 +2110,9 @@ out:
 		free(options.whitelist);
 	if (options.blacklist)
 		free(options.blacklist);
+	close(cg_fd);
 	if (cg_created)
 		cleanup_cgroup_environment();
-	close(cg_fd);
 	return err;
 }
 
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
index 910044f08908..7989ec608454 100755
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -72,7 +72,6 @@ cleanup() {
 server_listen() {
 	ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" &
 	server_pid=$!
-	sleep 0.2
 }
 
 client_connect() {
@@ -93,6 +92,16 @@ verify_data() {
 	fi
 }
 
+wait_for_port() {
+	for i in $(seq 20); do
+		if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then
+			return 0
+		fi
+		sleep 0.1
+	done
+	return 1
+}
+
 set -e
 
 # no arguments: automated test, run all
@@ -193,6 +202,7 @@ setup
 # basic communication works
 echo "test basic connectivity"
 server_listen
+wait_for_port ${port} ${netcat_opt}
 client_connect
 verify_data
 
@@ -204,6 +214,7 @@ ip netns exec "${ns1}" tc filter add dev veth1 egress \
 	section "encap_${tuntype}_${mac}"
 echo "test bpf encap without decap (expect failure)"
 server_listen
+wait_for_port ${port} ${netcat_opt}
 ! client_connect
 
 if [[ "$tuntype" =~ "udp" ]]; then
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
index 32df93747095..7b5fc98838cd 100644
--- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
+++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
@@ -16,68 +16,7 @@
 #include <bpf/libbpf.h>
 
 #include "cgroup_helpers.h"
-
-static int start_server(const struct sockaddr *addr, socklen_t len, bool dual)
-{
-	int mode = !dual;
-	int fd;
-
-	fd = socket(addr->sa_family, SOCK_STREAM, 0);
-	if (fd == -1) {
-		log_err("Failed to create server socket");
-		goto out;
-	}
-
-	if (addr->sa_family == AF_INET6) {
-		if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&mode,
-			       sizeof(mode)) == -1) {
-			log_err("Failed to set the dual-stack mode");
-			goto close_out;
-		}
-	}
-
-	if (bind(fd, addr, len) == -1) {
-		log_err("Failed to bind server socket");
-		goto close_out;
-	}
-
-	if (listen(fd, 128) == -1) {
-		log_err("Failed to listen on server socket");
-		goto close_out;
-	}
-
-	goto out;
-
-close_out:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
-
-static int connect_to_server(const struct sockaddr *addr, socklen_t len)
-{
-	int fd = -1;
-
-	fd = socket(addr->sa_family, SOCK_STREAM, 0);
-	if (fd == -1) {
-		log_err("Failed to create client socket");
-		goto out;
-	}
-
-	if (connect(fd, (const struct sockaddr *)addr, len) == -1) {
-		log_err("Fail to connect to server");
-		goto close_out;
-	}
-
-	goto out;
-
-close_out:
-	close(fd);
-	fd = -1;
-out:
-	return fd;
-}
+#include "network_helpers.h"
 
 static int get_map_fd_by_prog_id(int prog_id, bool *xdp)
 {
@@ -117,8 +56,7 @@ err:
 	return map_fd;
 }
 
-static int run_test(int server_fd, int results_fd, bool xdp,
-		    const struct sockaddr *addr, socklen_t len)
+static int run_test(int server_fd, int results_fd, bool xdp)
 {
 	int client = -1, srv_client = -1;
 	int ret = 0;
@@ -144,7 +82,7 @@ static int run_test(int server_fd, int results_fd, bool xdp,
 		goto err;
 	}
 
-	client = connect_to_server(addr, len);
+	client = connect_to_fd(server_fd, 0);
 	if (client == -1)
 		goto err;
 
@@ -201,23 +139,23 @@ out:
 	return ret;
 }
 
-static bool get_port(int server_fd, in_port_t *port)
+static int v6only_true(int fd, const struct post_socket_opts *opts)
 {
-	struct sockaddr_in addr;
-	socklen_t len = sizeof(addr);
+	int mode = true;
 
-	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
-		log_err("Failed to get server addr");
-		return false;
-	}
+	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode));
+}
+
+static int v6only_false(int fd, const struct post_socket_opts *opts)
+{
+	int mode = false;
 
-	/* sin_port and sin6_port are located at the same offset. */
-	*port = addr.sin_port;
-	return true;
+	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode));
 }
 
 int main(int argc, char **argv)
 {
+	struct network_helper_opts opts = { 0 };
 	struct sockaddr_in addr4;
 	struct sockaddr_in6 addr6;
 	struct sockaddr_in addr4dual;
@@ -259,31 +197,30 @@ int main(int argc, char **argv)
 	addr6dual.sin6_addr = in6addr_any;
 	addr6dual.sin6_port = 0;
 
-	server = start_server((const struct sockaddr *)&addr4, sizeof(addr4),
-			      false);
-	if (server == -1 || !get_port(server, &addr4.sin_port))
+	server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4,
+				   sizeof(addr4), NULL);
+	if (server == -1)
 		goto err;
 
-	server_v6 = start_server((const struct sockaddr *)&addr6,
-				 sizeof(addr6), false);
-	if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port))
+	opts.post_socket_cb = v6only_true;
+	server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6,
+				      sizeof(addr6), &opts);
+	if (server_v6 == -1)
 		goto err;
 
-	server_dual = start_server((const struct sockaddr *)&addr6dual,
-				   sizeof(addr6dual), true);
-	if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port))
+	opts.post_socket_cb = v6only_false;
+	server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual,
+					sizeof(addr6dual), &opts);
+	if (server_dual == -1)
 		goto err;
 
-	if (run_test(server, results, xdp,
-		     (const struct sockaddr *)&addr4, sizeof(addr4)))
+	if (run_test(server, results, xdp))
 		goto err;
 
-	if (run_test(server_v6, results, xdp,
-		     (const struct sockaddr *)&addr6, sizeof(addr6)))
+	if (run_test(server_v6, results, xdp))
 		goto err;
 
-	if (run_test(server_dual, results, xdp,
-		     (const struct sockaddr *)&addr4dual, sizeof(addr4dual)))
+	if (run_test(server_dual, results, xdp))
 		goto err;
 
 	printf("ok\n");
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 28b6646662af..d5379a0e6da8 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -368,9 +368,23 @@ int delete_module(const char *name, int flags)
 
 int unload_bpf_testmod(bool verbose)
 {
+	int ret, cnt = 0;
+
 	if (kern_sync_rcu())
 		fprintf(stdout, "Failed to trigger kernel-side RCU sync!\n");
-	if (delete_module("bpf_testmod", 0)) {
+
+	for (;;) {
+		ret = delete_module("bpf_testmod", 0);
+		if (!ret || errno != EAGAIN)
+			break;
+		if (++cnt > 10000) {
+			fprintf(stdout, "Unload of bpf_testmod timed out\n");
+			break;
+		}
+		usleep(100);
+	}
+
+	if (ret) {
 		if (errno == ENOENT) {
 			if (verbose)
 				fprintf(stdout, "bpf_testmod.ko is already unloaded.\n");
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 27fd7ed3e4b0..70e29f316fe7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -61,12 +61,7 @@ void free_kallsyms_local(struct ksyms *ksyms)
 	free(ksyms);
 }
 
-static int ksym_cmp(const void *p1, const void *p2)
-{
-	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
-}
-
-struct ksyms *load_kallsyms_local(void)
+static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb)
 {
 	FILE *f;
 	char func[256], buf[256];
@@ -100,7 +95,7 @@ struct ksyms *load_kallsyms_local(void)
 			goto error;
 	}
 	fclose(f);
-	qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), ksym_cmp);
+	qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb);
 	return ksyms;
 
 error:
@@ -109,6 +104,21 @@ error:
 	return NULL;
 }
 
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
+}
+
+struct ksyms *load_kallsyms_local(void)
+{
+	return load_kallsyms_local_common(ksym_cmp);
+}
+
+struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb)
+{
+	return load_kallsyms_local_common(cmp_cb);
+}
+
 int load_kallsyms(void)
 {
 	pthread_mutex_lock(&ksyms_mutex);
@@ -148,6 +158,28 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key)
 	return &ksyms->syms[0];
 }
 
+struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p,
+					  ksym_search_cmp_t cmp_cb)
+{
+	int start = 0, mid, end = ksyms->sym_cnt;
+	struct ksym *ks;
+	int result;
+
+	while (start < end) {
+		mid = start + (end - start) / 2;
+		ks = &ksyms->syms[mid];
+		result = cmp_cb(p, ks);
+		if (result < 0)
+			end = mid;
+		else if (result > 0)
+			start = mid + 1;
+		else
+			return ks;
+	}
+
+	return NULL;
+}
+
 struct ksym *ksym_search(long key)
 {
 	if (!ksyms)
@@ -201,29 +233,6 @@ out:
 	return err;
 }
 
-void read_trace_pipe(void)
-{
-	int trace_fd;
-
-	if (access(TRACEFS_PIPE, F_OK) == 0)
-		trace_fd = open(TRACEFS_PIPE, O_RDONLY, 0);
-	else
-		trace_fd = open(DEBUGFS_PIPE, O_RDONLY, 0);
-	if (trace_fd < 0)
-		return;
-
-	while (1) {
-		static char buf[4096];
-		ssize_t sz;
-
-		sz = read(trace_fd, buf, sizeof(buf) - 1);
-		if (sz > 0) {
-			buf[sz] = 0;
-			puts(buf);
-		}
-	}
-}
-
 ssize_t get_uprobe_offset(const void *addr)
 {
 	size_t start, end, base;
@@ -381,3 +390,43 @@ out:
 	close(fd);
 	return err;
 }
+
+int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter)
+{
+	size_t buflen, n;
+	char *buf = NULL;
+	FILE *fp = NULL;
+
+	if (access(TRACEFS_PIPE, F_OK) == 0)
+		fp = fopen(TRACEFS_PIPE, "r");
+	else
+		fp = fopen(DEBUGFS_PIPE, "r");
+	if (!fp)
+		return -1;
+
+	 /* We do not want to wait forever when iter is specified. */
+	if (iter)
+		fcntl(fileno(fp), F_SETFL, O_NONBLOCK);
+
+	while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) {
+		if (n > 0)
+			cb(buf, data);
+		if (iter && !(--iter))
+			break;
+	}
+
+	free(buf);
+	if (fp)
+		fclose(fp);
+	return 0;
+}
+
+static void trace_pipe_cb(const char *str, void *data)
+{
+	printf("%s", str);
+}
+
+void read_trace_pipe(void)
+{
+	read_trace_pipe_iter(trace_pipe_cb, NULL, 0);
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 04fd1da7079d..2ce873c9f9aa 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -13,6 +13,9 @@ struct ksym {
 };
 struct ksyms;
 
+typedef int (*ksym_cmp_t)(const void *p1, const void *p2);
+typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2);
+
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
 long ksym_get_addr(const char *name);
@@ -22,10 +25,16 @@ struct ksym *ksym_search_local(struct ksyms *ksyms, long key);
 long ksym_get_addr_local(struct ksyms *ksyms, const char *name);
 void free_kallsyms_local(struct ksyms *ksyms);
 
+struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb);
+struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p1,
+					  ksym_search_cmp_t cmp_cb);
+
 /* open kallsyms and find addresses on the fly, faster than load + search. */
 int kallsyms_find(const char *sym, unsigned long long *addr);
 
 void read_trace_pipe(void);
+int read_trace_pipe_iter(void (*cb)(const char *str, void *data),
+			 void *data, int iter);
 
 ssize_t get_uprobe_offset(const void *addr);
 ssize_t get_rel_offset(uintptr_t addr);
diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c
index a61ceab60b68..7ffa563ffeba 100644
--- a/tools/testing/selftests/bpf/uprobe_multi.c
+++ b/tools/testing/selftests/bpf/uprobe_multi.c
@@ -9,7 +9,7 @@
 
 #define NAME(name, idx) PASTE(name, idx)
 
-#define DEF(name, idx)  int NAME(name, idx)(void) { return 0; }
+#define DEF(name, idx) int __attribute__((weak)) NAME(name, idx)(void) { return 0; }
 #define CALL(name, idx) NAME(name, idx)();
 
 #define F(body, name, idx) body(name, idx)
diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 244d4996e06e..b2854238d4a0 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -792,10 +792,13 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs)
 
 	while ((next = strtok_r(state ? NULL : input, ",", &state))) {
 		err = parse_stat(next, specs);
-		if (err)
+		if (err) {
+			free(input);
 			return err;
+		}
 	}
 
+	free(input);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index bdf5d8180067..6f9956eed797 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -495,20 +495,6 @@ peek:
 	return 0;
 }
 
-struct ethtool_channels {
-	__u32	cmd;
-	__u32	max_rx;
-	__u32	max_tx;
-	__u32	max_other;
-	__u32	max_combined;
-	__u32	rx_count;
-	__u32	tx_count;
-	__u32	other_count;
-	__u32	combined_count;
-};
-
-#define ETHTOOL_GCHANNELS	0x0000003c /* Get no of channels */
-
 static int rxq_num(const char *ifname)
 {
 	struct ethtool_channels ch = {
@@ -595,6 +581,8 @@ static void cleanup(void)
 
 	if (bpf_obj)
 		xdp_hw_metadata__destroy(bpf_obj);
+
+	free((void *)saved_hwtstamp_ifname);
 }
 
 static void handle_signal(int sig)
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index b1102ee13faa..2eac0895b0a1 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -81,6 +81,7 @@
 #include <linux/mman.h>
 #include <linux/netdev.h>
 #include <linux/bitmap.h>
+#include <linux/ethtool.h>
 #include <arpa/inet.h>
 #include <net/if.h>
 #include <locale.h>
@@ -105,11 +106,15 @@
 #include "../kselftest.h"
 #include "xsk_xdp_common.h"
 
+#include <network_helpers.h>
+
 static bool opt_verbose;
 static bool opt_print_tests;
 static enum test_mode opt_mode = TEST_MODE_ALL;
 static u32 opt_run_test = RUN_ALL_TESTS;
 
+void test__fail(void) { /* for network_helpers.c */ }
+
 static void __exit_with_error(int error, const char *file, const char *func, int line)
 {
 	ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error,
@@ -239,7 +244,7 @@ static void enable_busy_poll(struct xsk_socket_info *xsk)
 		       (void *)&sock_opt, sizeof(sock_opt)) < 0)
 		exit_with_error(errno);
 
-	sock_opt = BATCH_SIZE;
+	sock_opt = xsk->batch_size;
 	if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET,
 		       (void *)&sock_opt, sizeof(sock_opt)) < 0)
 		exit_with_error(errno);
@@ -409,6 +414,33 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj
 	}
 }
 
+static int set_ring_size(struct ifobject *ifobj)
+{
+	int ret;
+	u32 ctr = 0;
+
+	while (ctr++ < SOCK_RECONF_CTR) {
+		ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring);
+		if (!ret)
+			break;
+
+		/* Retry if it fails */
+		if (ctr >= SOCK_RECONF_CTR || errno != EBUSY)
+			return -errno;
+
+		usleep(USLEEP_MAX);
+	}
+
+	return ret;
+}
+
+static int hw_ring_size_reset(struct ifobject *ifobj)
+{
+	ifobj->ring.tx_pending = ifobj->set_ring.default_tx;
+	ifobj->ring.rx_pending = ifobj->set_ring.default_rx;
+	return set_ring_size(ifobj);
+}
+
 static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
 			     struct ifobject *ifobj_rx)
 {
@@ -439,6 +471,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
 		for (j = 0; j < MAX_SOCKETS; j++) {
 			memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j]));
 			ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+			ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE;
 			if (i == 0)
 				ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default;
 			else
@@ -451,12 +484,16 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
 		}
 	}
 
+	if (ifobj_tx->hw_ring_size_supp)
+		hw_ring_size_reset(ifobj_tx);
+
 	test->ifobj_tx = ifobj_tx;
 	test->ifobj_rx = ifobj_rx;
 	test->current_step = 0;
 	test->total_steps = 1;
 	test->nb_sockets = 1;
 	test->fail = false;
+	test->set_ring = false;
 	test->mtu = MAX_ETH_PKT_SIZE;
 	test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog;
 	test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk;
@@ -1087,7 +1124,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk)
 			return TEST_CONTINUE;
 	}
 
-	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
+	rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx);
 	if (!rcvd)
 		return TEST_CONTINUE;
 
@@ -1239,7 +1276,8 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b
 
 	buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len);
 	/* pkts_in_flight might be negative if many invalid packets are sent */
-	if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) {
+	if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) /
+	    buffer_len)) {
 		ret = kick_tx(xsk);
 		if (ret)
 			return TEST_FAILURE;
@@ -1249,7 +1287,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b
 	fds.fd = xsk_socket__fd(xsk->xsk);
 	fds.events = POLLOUT;
 
-	while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) {
+	while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) {
 		if (use_poll) {
 			ret = poll(&fds, 1, POLL_TMOUT);
 			if (timeout) {
@@ -1269,10 +1307,10 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b
 			}
 		}
 
-		complete_pkts(xsk, BATCH_SIZE);
+		complete_pkts(xsk, xsk->batch_size);
 	}
 
-	for (i = 0; i < BATCH_SIZE; i++) {
+	for (i = 0; i < xsk->batch_size; i++) {
 		struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream);
 		u32 nb_frags_left, nb_frags, bytes_written = 0;
 
@@ -1280,9 +1318,9 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b
 			break;
 
 		nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt);
-		if (nb_frags > BATCH_SIZE - i) {
+		if (nb_frags > xsk->batch_size - i) {
 			pkt_stream_cancel(pkt_stream);
-			xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i);
+			xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i);
 			break;
 		}
 		nb_frags_left = nb_frags;
@@ -1370,7 +1408,7 @@ static int wait_for_tx_completion(struct xsk_socket_info *xsk)
 			return TEST_FAILURE;
 		}
 
-		complete_pkts(xsk, BATCH_SIZE);
+		complete_pkts(xsk, xsk->batch_size);
 	}
 
 	return TEST_PASS;
@@ -1860,6 +1898,14 @@ static int testapp_validate_traffic(struct test_spec *test)
 		return TEST_SKIP;
 	}
 
+	if (test->set_ring) {
+		if (ifobj_tx->hw_ring_size_supp)
+			return set_ring_size(ifobj_tx);
+
+	ksft_test_result_skip("Changing HW ring size not supported.\n");
+	return TEST_SKIP;
+	}
+
 	xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx);
 	return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx);
 }
@@ -2373,6 +2419,50 @@ static int testapp_xdp_metadata_mb(struct test_spec *test)
 	return testapp_xdp_metadata_copy(test);
 }
 
+static int testapp_hw_sw_min_ring_size(struct test_spec *test)
+{
+	int ret;
+
+	test->set_ring = true;
+	test->total_steps = 2;
+	test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE;
+	test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2;
+	test->ifobj_tx->xsk->batch_size = 1;
+	test->ifobj_rx->xsk->batch_size = 1;
+	ret = testapp_validate_traffic(test);
+	if (ret)
+		return ret;
+
+	/* Set batch size to hw_ring_size - 1 */
+	test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1;
+	test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1;
+	return testapp_validate_traffic(test);
+}
+
+static int testapp_hw_sw_max_ring_size(struct test_spec *test)
+{
+	u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2;
+	int ret;
+
+	test->set_ring = true;
+	test->total_steps = 2;
+	test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending;
+	test->ifobj_tx->ring.rx_pending  = test->ifobj_tx->ring.rx_max_pending;
+	test->ifobj_rx->umem->num_frames = max_descs;
+	test->ifobj_rx->xsk->rxqsize = max_descs;
+	test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+	test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+
+	ret = testapp_validate_traffic(test);
+	if (ret)
+		return ret;
+
+	/* Set batch_size to 4095 */
+	test->ifobj_tx->xsk->batch_size = max_descs - 1;
+	test->ifobj_rx->xsk->batch_size = max_descs - 1;
+	return testapp_validate_traffic(test);
+}
+
 static void run_pkt_test(struct test_spec *test)
 {
 	int ret;
@@ -2477,7 +2567,9 @@ static const struct test_spec tests[] = {
 	{.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb},
 	{.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb},
 	{.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags},
-};
+	{.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size},
+	{.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size},
+	};
 
 static void print_tests(void)
 {
@@ -2497,6 +2589,7 @@ int main(int argc, char **argv)
 	int modes = TEST_MODE_SKB + 1;
 	struct test_spec test;
 	bool shared_netdev;
+	int ret;
 
 	/* Use libbpf 1.0 API mode */
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
@@ -2534,6 +2627,13 @@ int main(int argc, char **argv)
 			modes++;
 	}
 
+	ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring);
+	if (!ret) {
+		ifobj_tx->hw_ring_size_supp = true;
+		ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending;
+		ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending;
+	}
+
 	init_iface(ifobj_rx, worker_testapp_validate_rx);
 	init_iface(ifobj_tx, worker_testapp_validate_tx);
 
@@ -2581,6 +2681,9 @@ int main(int argc, char **argv)
 		}
 	}
 
+	if (ifobj_tx->hw_ring_size_supp)
+		hw_ring_size_reset(ifobj_tx);
+
 	pkt_stream_delete(tx_pkt_stream_default);
 	pkt_stream_delete(rx_pkt_stream_default);
 	xsk_unload_xdp_programs(ifobj_tx);
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index f174df2d693f..906de5fab7a3 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -44,7 +44,7 @@
 #define MAX_ETH_JUMBO_SIZE 9000
 #define USLEEP_MAX 10000
 #define SOCK_RECONF_CTR 10
-#define BATCH_SIZE 64
+#define DEFAULT_BATCH_SIZE 64
 #define POLL_TMOUT 1000
 #define THREAD_TMOUT 3
 #define DEFAULT_PKT_CNT (4 * 1024)
@@ -91,6 +91,7 @@ struct xsk_socket_info {
 	struct pkt_stream *pkt_stream;
 	u32 outstanding_tx;
 	u32 rxqsize;
+	u32 batch_size;
 	u8 dst_mac[ETH_ALEN];
 	u8 src_mac[ETH_ALEN];
 };
@@ -113,6 +114,11 @@ struct pkt_stream {
 	bool verbatim;
 };
 
+struct set_hw_ring {
+	u32 default_tx;
+	u32 default_rx;
+};
+
 struct ifobject;
 struct test_spec;
 typedef int (*validation_func_t)(struct ifobject *ifobj);
@@ -129,6 +135,8 @@ struct ifobject {
 	struct xsk_xdp_progs *xdp_progs;
 	struct bpf_map *xskmap;
 	struct bpf_program *xdp_prog;
+	struct ethtool_ringparam ring;
+	struct set_hw_ring set_ring;
 	enum test_mode mode;
 	int ifindex;
 	int mtu;
@@ -145,6 +153,7 @@ struct ifobject {
 	bool unaligned_supp;
 	bool multi_buff_supp;
 	bool multi_buff_zc_supp;
+	bool hw_ring_size_supp;
 };
 
 struct test_spec {
@@ -162,6 +171,7 @@ struct test_spec {
 	u16 current_step;
 	u16 nb_sockets;
 	bool fail;
+	bool set_ring;
 	enum test_mode mode;
 	char name[MAX_TEST_NAME_SIZE];
 };
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c
index 7cde07a5df78..47bad7ddc5bc 100644
--- a/tools/testing/selftests/capabilities/test_execve.c
+++ b/tools/testing/selftests/capabilities/test_execve.c
@@ -82,7 +82,7 @@ static bool create_and_enter_ns(uid_t inner_uid)
 {
 	uid_t outer_uid;
 	gid_t outer_gid;
-	int i;
+	int i, ret;
 	bool have_outer_privilege;
 
 	outer_uid = getuid();
@@ -97,7 +97,10 @@ static bool create_and_enter_ns(uid_t inner_uid)
 			ksft_exit_fail_msg("setresuid - %s\n", strerror(errno));
 
 		// Re-enable effective caps
-		capng_get_caps_process();
+		ret = capng_get_caps_process();
+		if (ret == -1)
+			ksft_exit_fail_msg("capng_get_caps_process failed\n");
+
 		for (i = 0; i < CAP_LAST_CAP; i++)
 			if (capng_have_capability(CAPNG_PERMITTED, i))
 				capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i);
@@ -207,6 +210,7 @@ static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient)
 
 static int do_tests(int uid, const char *our_path)
 {
+	int ret;
 	bool have_outer_privilege = create_and_enter_ns(uid);
 
 	int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY);
@@ -250,7 +254,9 @@ static int do_tests(int uid, const char *our_path)
 			ksft_exit_fail_msg("chmod - %s\n", strerror(errno));
 	}
 
-	capng_get_caps_process();
+	ret = capng_get_caps_process();
+	if (ret == -1)
+		ksft_exit_fail_msg("capng_get_caps_process failed\n");
 
 	/* Make sure that i starts out clear */
 	capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c
index 60b4e7b716a7..65f2a1c89239 100644
--- a/tools/testing/selftests/capabilities/validate_cap.c
+++ b/tools/testing/selftests/capabilities/validate_cap.c
@@ -28,6 +28,7 @@ static bool bool_arg(char **argv, int i)
 int main(int argc, char **argv)
 {
 	const char *atsec = "";
+	int ret;
 
 	/*
 	 * Be careful just in case a setgid or setcapped copy of this
@@ -44,7 +45,11 @@ int main(int argc, char **argv)
 		atsec = " (AT_SECURE is not set)";
 #endif
 
-	capng_get_caps_process();
+	ret = capng_get_caps_process();
+	if (ret == -1) {
+		ksft_print_msg("capng_get_caps_process failed\n");
+		return 1;
+	}
 
 	if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) {
 		ksft_print_msg("Wrong effective state%s\n", atsec);
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
index 47fdaa146443..f0e488ed90d8 100644
--- a/tools/testing/selftests/cgroup/test_zswap.c
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -52,7 +52,7 @@ static int get_zswap_stored_pages(size_t *value)
 
 static int get_cg_wb_count(const char *cg)
 {
-	return cg_read_key_long(cg, "memory.stat", "zswp_wb");
+	return cg_read_key_long(cg, "memory.stat", "zswpwb");
 }
 
 static long get_zswpout(const char *cgroup)
@@ -60,6 +60,27 @@ static long get_zswpout(const char *cgroup)
 	return cg_read_key_long(cgroup, "memory.stat", "zswpout ");
 }
 
+static int allocate_and_read_bytes(const char *cgroup, void *arg)
+{
+	size_t size = (size_t)arg;
+	char *mem = (char *)malloc(size);
+	int ret = 0;
+
+	if (!mem)
+		return -1;
+	for (int i = 0; i < size; i += 4095)
+		mem[i] = 'a';
+
+	/* Go through the allocated memory to (z)swap in and out pages */
+	for (int i = 0; i < size; i += 4095) {
+		if (mem[i] != 'a')
+			ret = -1;
+	}
+
+	free(mem);
+	return ret;
+}
+
 static int allocate_bytes(const char *cgroup, void *arg)
 {
 	size_t size = (size_t)arg;
@@ -100,7 +121,6 @@ static int test_zswap_usage(const char *root)
 	int ret = KSFT_FAIL;
 	char *test_group;
 
-	/* Set up */
 	test_group = cg_name(root, "no_shrink_test");
 	if (!test_group)
 		goto out;
@@ -134,6 +154,101 @@ out:
 }
 
 /*
+ * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for
+ * the cgroup.
+ */
+static int test_swapin_nozswap(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *test_group;
+	long swap_peak, zswpout;
+
+	test_group = cg_name(root, "no_zswap_test");
+	if (!test_group)
+		goto out;
+	if (cg_create(test_group))
+		goto out;
+	if (cg_write(test_group, "memory.max", "8M"))
+		goto out;
+	if (cg_write(test_group, "memory.zswap.max", "0"))
+		goto out;
+
+	/* Allocate and read more than memory.max to trigger swapin */
+	if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+		goto out;
+
+	/* Verify that pages are swapped out, but no zswap happened */
+	swap_peak = cg_read_long(test_group, "memory.swap.peak");
+	if (swap_peak < 0) {
+		ksft_print_msg("failed to get cgroup's swap_peak\n");
+		goto out;
+	}
+
+	if (swap_peak < MB(24)) {
+		ksft_print_msg("at least 24MB of memory should be swapped out\n");
+		goto out;
+	}
+
+	zswpout = get_zswpout(test_group);
+	if (zswpout < 0) {
+		ksft_print_msg("failed to get zswpout\n");
+		goto out;
+	}
+
+	if (zswpout > 0) {
+		ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n");
+		goto out;
+	}
+
+	ret = KSFT_PASS;
+
+out:
+	cg_destroy(test_group);
+	free(test_group);
+	return ret;
+}
+
+/* Simple test to verify the (z)swapin code paths */
+static int test_zswapin(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *test_group;
+	long zswpin;
+
+	test_group = cg_name(root, "zswapin_test");
+	if (!test_group)
+		goto out;
+	if (cg_create(test_group))
+		goto out;
+	if (cg_write(test_group, "memory.max", "8M"))
+		goto out;
+	if (cg_write(test_group, "memory.zswap.max", "max"))
+		goto out;
+
+	/* Allocate and read more than memory.max to trigger (z)swap in */
+	if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+		goto out;
+
+	zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin ");
+	if (zswpin < 0) {
+		ksft_print_msg("failed to get zswpin\n");
+		goto out;
+	}
+
+	if (zswpin < MB(24) / PAGE_SIZE) {
+		ksft_print_msg("at least 24MB should be brought back from zswap\n");
+		goto out;
+	}
+
+	ret = KSFT_PASS;
+
+out:
+	cg_destroy(test_group);
+	free(test_group);
+	return ret;
+}
+
+/*
  * When trying to store a memcg page in zswap, if the memcg hits its memory
  * limit in zswap, writeback should affect only the zswapped pages of that
  * memcg.
@@ -144,7 +259,6 @@ static int test_no_invasive_cgroup_shrink(const char *root)
 	size_t control_allocation_size = MB(10);
 	char *control_allocation, *wb_group = NULL, *control_group = NULL;
 
-	/* Set up */
 	wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
 	if (!wb_group)
 		return KSFT_FAIL;
@@ -309,6 +423,8 @@ struct zswap_test {
 	const char *name;
 } tests[] = {
 	T(test_zswap_usage),
+	T(test_swapin_nozswap),
+	T(test_zswapin),
 	T(test_no_kmem_bypass),
 	T(test_no_invasive_cgroup_shrink),
 };
diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
index 3c9bf0cd82a8..e61f07973ce5 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -95,9 +95,14 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 			getpid(), pid);
 
 	if (waitpid(-1, &status, __WALL) < 0) {
-		ksft_print_msg("Child returned %s\n", strerror(errno));
+		ksft_print_msg("waitpid() returned %s\n", strerror(errno));
 		return -errno;
 	}
+	if (!WIFEXITED(status)) {
+		ksft_print_msg("Child did not exit normally, status 0x%x\n",
+			       status);
+		return EXIT_FAILURE;
+	}
 	if (WEXITSTATUS(status))
 		return WEXITSTATUS(status);
 
diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c
index 54a8b2445be9..ce0426786828 100644
--- a/tools/testing/selftests/clone3/clone3_clear_sighand.c
+++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c
@@ -120,5 +120,5 @@ int main(int argc, char **argv)
 
 	test_clone3_clear_sighand();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c
index ed785afb6077..bfb0da2b4fdd 100644
--- a/tools/testing/selftests/clone3/clone3_set_tid.c
+++ b/tools/testing/selftests/clone3/clone3_set_tid.c
@@ -114,7 +114,8 @@ static int call_clone3_set_tid(pid_t *set_tid,
 	return WEXITSTATUS(status);
 }
 
-static void test_clone3_set_tid(pid_t *set_tid,
+static void test_clone3_set_tid(const char *desc,
+				pid_t *set_tid,
 				size_t set_tid_size,
 				int flags,
 				int expected,
@@ -129,17 +130,13 @@ static void test_clone3_set_tid(pid_t *set_tid,
 	ret = call_clone3_set_tid(set_tid, set_tid_size, flags, expected_pid,
 				  wait_for_it);
 	ksft_print_msg(
-		"[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n",
+		"[%d] clone3() with CLONE_SET_TID %d says: %d - expected %d\n",
 		getpid(), set_tid[0], ret, expected);
-	if (ret != expected)
-		ksft_test_result_fail(
-			"[%d] Result (%d) is different than expected (%d)\n",
-			getpid(), ret, expected);
-	else
-		ksft_test_result_pass(
-			"[%d] Result (%d) matches expectation (%d)\n",
-			getpid(), ret, expected);
+
+	ksft_test_result(ret == expected, "%s with %zu TIDs and flags 0x%x\n",
+			 desc, set_tid_size, flags);
 }
+
 int main(int argc, char *argv[])
 {
 	FILE *f;
@@ -172,73 +169,91 @@ int main(int argc, char *argv[])
 
 	/* Try invalid settings */
 	memset(&set_tid, 0, sizeof(set_tid));
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, 0 TID",
+			    set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0);
 
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, 0 TID",
+			    set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0);
 
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0,
-			-EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, 0 TID",
+			    set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0,
+			    -EINVAL, 0, 0);
 
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, 0 TID",
+			    set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0);
 
 	/*
 	 * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1
 	 * nested PID namespace.
 	 */
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, 0 TID",
+			    set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0);
 
 	memset(&set_tid, 0xff, sizeof(set_tid));
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, TID all 1s",
+			    set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0);
 
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, TID all 1s",
+			    set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0);
 
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0,
-			-EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, TID all 1s",
+			    set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0,
+			    -EINVAL, 0, 0);
 
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, TID all 1s",
+			    set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0);
 
 	/*
 	 * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1
 	 * nested PID namespace.
 	 */
-	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("invalid size, TID all 1s",
+			    set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0);
 
 	memset(&set_tid, 0, sizeof(set_tid));
 	/* Try with an invalid PID */
 	set_tid[0] = 0;
-	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("valid size, 0 TID",
+			    set_tid, 1, 0, -EINVAL, 0, 0);
 
 	set_tid[0] = -1;
-	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("valid size, -1 TID",
+			    set_tid, 1, 0, -EINVAL, 0, 0);
 
 	/* Claim that the set_tid array actually contains 2 elements. */
-	test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("2 TIDs, -1 and 0",
+			    set_tid, 2, 0, -EINVAL, 0, 0);
 
 	/* Try it in a new PID namespace */
 	if (uid == 0)
-		test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
+		test_clone3_set_tid("valid size, -1 TID",
+				    set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
 	else
 		ksft_test_result_skip("Clone3() with set_tid requires root\n");
 
 	/* Try with a valid PID (1) this should return -EEXIST. */
 	set_tid[0] = 1;
 	if (uid == 0)
-		test_clone3_set_tid(set_tid, 1, 0, -EEXIST, 0, 0);
+		test_clone3_set_tid("duplicate PID 1",
+				    set_tid, 1, 0, -EEXIST, 0, 0);
 	else
 		ksft_test_result_skip("Clone3() with set_tid requires root\n");
 
 	/* Try it in a new PID namespace */
 	if (uid == 0)
-		test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, 0, 0, 0);
+		test_clone3_set_tid("duplicate PID 1",
+				    set_tid, 1, CLONE_NEWPID, 0, 0, 0);
 	else
 		ksft_test_result_skip("Clone3() with set_tid requires root\n");
 
 	/* pid_max should fail everywhere */
 	set_tid[0] = pid_max;
-	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("set TID to maximum",
+			    set_tid, 1, 0, -EINVAL, 0, 0);
 
 	if (uid == 0)
-		test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
+		test_clone3_set_tid("set TID to maximum",
+				    set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
 	else
 		ksft_test_result_skip("Clone3() with set_tid requires root\n");
 
@@ -262,10 +277,12 @@ int main(int argc, char *argv[])
 
 	/* After the child has finished, its PID should be free. */
 	set_tid[0] = pid;
-	test_clone3_set_tid(set_tid, 1, 0, 0, 0, 0);
+	test_clone3_set_tid("reallocate child TID",
+			    set_tid, 1, 0, 0, 0, 0);
 
 	/* This should fail as there is no PID 1 in that namespace */
-	test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
+	test_clone3_set_tid("duplicate child TID",
+			    set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
 
 	/*
 	 * Creating a process with PID 1 in the newly created most nested
@@ -274,7 +291,8 @@ int main(int argc, char *argv[])
 	 */
 	set_tid[0] = 1;
 	set_tid[1] = pid;
-	test_clone3_set_tid(set_tid, 2, CLONE_NEWPID, 0, pid, 0);
+	test_clone3_set_tid("create PID 1 in new NS",
+			    set_tid, 2, CLONE_NEWPID, 0, pid, 0);
 
 	ksft_print_msg("unshare PID namespace\n");
 	if (unshare(CLONE_NEWPID) == -1)
@@ -284,7 +302,8 @@ int main(int argc, char *argv[])
 	set_tid[0] = pid;
 
 	/* This should fail as there is no PID 1 in that namespace */
-	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+	test_clone3_set_tid("duplicate PID 1",
+			    set_tid, 1, 0, -EINVAL, 0, 0);
 
 	/* Let's create a PID 1 */
 	ns_pid = fork();
@@ -295,21 +314,25 @@ int main(int argc, char *argv[])
 		 */
 		set_tid[0] = 43;
 		set_tid[1] = -1;
-		test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0);
+		test_clone3_set_tid("check leak on invalid TID -1",
+				    set_tid, 2, 0, -EINVAL, 0, 0);
 
 		set_tid[0] = 43;
 		set_tid[1] = pid;
-		test_clone3_set_tid(set_tid, 2, 0, 0, 43, 0);
+		test_clone3_set_tid("check leak on invalid specific TID",
+				    set_tid, 2, 0, 0, 43, 0);
 
 		ksft_print_msg("Child in PID namespace has PID %d\n", getpid());
 		set_tid[0] = 2;
-		test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0);
+		test_clone3_set_tid("create PID 2 in child NS",
+				    set_tid, 1, 0, 0, 2, 0);
 
 		set_tid[0] = 1;
 		set_tid[1] = -1;
 		set_tid[2] = pid;
 		/* This should fail as there is invalid PID at level '1'. */
-		test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0);
+		test_clone3_set_tid("fail due to invalid TID at level 1",
+				    set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0);
 
 		set_tid[0] = 1;
 		set_tid[1] = 42;
@@ -319,13 +342,15 @@ int main(int argc, char *argv[])
 		 * namespaces. Again assuming this is running in the host's
 		 * PID namespace. Not yet nested.
 		 */
-		test_clone3_set_tid(set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0);
+		test_clone3_set_tid("fail due to too few active PID NSs",
+				    set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0);
 
 		/*
 		 * This should work and from the parent we should see
 		 * something like 'NSpid:	pid	42	1'.
 		 */
-		test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true);
+		test_clone3_set_tid("verify that we have 3 PID NSs",
+				    set_tid, 3, CLONE_NEWPID, 0, 42, true);
 
 		child_exit(ksft_cnt.ksft_fail);
 	}
@@ -380,16 +405,14 @@ int main(int argc, char *argv[])
 	ksft_cnt.ksft_pass += 6 - (ksft_cnt.ksft_fail - WEXITSTATUS(status));
 	ksft_cnt.ksft_fail = WEXITSTATUS(status);
 
-	if (ns3 == pid && ns2 == 42 && ns1 == 1)
-		ksft_test_result_pass(
-			"PIDs in all namespaces as expected (%d,%d,%d)\n",
-			ns3, ns2, ns1);
-	else
-		ksft_test_result_fail(
-			"PIDs in all namespaces not as expected (%d,%d,%d)\n",
-			ns3, ns2, ns1);
+	ksft_print_msg("Expecting PIDs %d, 42, 1\n", pid);
+	ksft_print_msg("Have PIDs in namespaces: %d, %d, %d\n", ns3, ns2, ns1);
+	ksft_test_result(ns3 == pid && ns2 == 42 && ns1 == 1,
+			 "PIDs in all namespaces as expected\n");
 out:
 	ret = 0;
 
-	return !ret ? ksft_exit_pass() : ksft_exit_fail();
+	if (ret)
+		ksft_exit_fail();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index c59e4adb905d..991c473e3859 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -17,6 +17,15 @@
 #include "../kselftest_harness.h"
 #include "../clone3/clone3_selftests.h"
 
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+
+#ifndef F_DUPFD_QUERY
+#define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3)
+#endif
+
 static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
 				  unsigned int flags)
 {
@@ -45,6 +54,15 @@ TEST(core_close_range)
 			SKIP(return, "close_range() syscall not supported");
 	}
 
+	for (i = 0; i < 100; i++) {
+		ret = fcntl(open_fds[i], F_DUPFD_QUERY, open_fds[i + 1]);
+		if (ret < 0) {
+			EXPECT_EQ(errno, EINVAL);
+		} else {
+			EXPECT_EQ(ret, 0);
+		}
+	}
+
 	EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0));
 
 	for (i = 0; i <= 50; i++)
@@ -358,7 +376,7 @@ TEST(close_range_cloexec_unshare)
  */
 TEST(close_range_cloexec_syzbot)
 {
-	int fd1, fd2, fd3, flags, ret, status;
+	int fd1, fd2, fd3, fd4, flags, ret, status;
 	pid_t pid;
 	struct __clone_args args = {
 		.flags = CLONE_FILES,
@@ -372,6 +390,13 @@ TEST(close_range_cloexec_syzbot)
 	fd2 = dup2(fd1, 1000);
 	EXPECT_GT(fd2, 0);
 
+	flags = fcntl(fd1, F_DUPFD_QUERY, fd2);
+	if (flags < 0) {
+		EXPECT_EQ(errno, EINVAL);
+	} else {
+		EXPECT_EQ(flags, 1);
+	}
+
 	pid = sys_clone3(&args, sizeof(args));
 	ASSERT_GE(pid, 0);
 
@@ -396,6 +421,15 @@ TEST(close_range_cloexec_syzbot)
 		fd3 = dup2(fd1, 42);
 		EXPECT_GT(fd3, 0);
 
+		flags = fcntl(fd1, F_DUPFD_QUERY, fd3);
+		if (flags < 0) {
+			EXPECT_EQ(errno, EINVAL);
+		} else {
+			EXPECT_EQ(flags, 1);
+		}
+
+
+
 		/*
 			 * Duplicating the file descriptor must remove the
 			 * FD_CLOEXEC flag.
@@ -426,6 +460,24 @@ TEST(close_range_cloexec_syzbot)
 	fd3 = dup2(fd1, 42);
 	EXPECT_GT(fd3, 0);
 
+	flags = fcntl(fd1, F_DUPFD_QUERY, fd3);
+	if (flags < 0) {
+		EXPECT_EQ(errno, EINVAL);
+	} else {
+		EXPECT_EQ(flags, 1);
+	}
+
+	fd4 = open("/dev/null", O_RDWR);
+	EXPECT_GT(fd4, 0);
+
+	/* Same inode, different file pointers. */
+	flags = fcntl(fd1, F_DUPFD_QUERY, fd4);
+	if (flags < 0) {
+		EXPECT_EQ(errno, EINVAL);
+	} else {
+		EXPECT_EQ(flags, 0);
+	}
+
 	flags = fcntl(fd3, F_GETFD);
 	EXPECT_GT(flags, -1);
 	EXPECT_EQ(flags & FD_CLOEXEC, 0);
@@ -433,6 +485,7 @@ TEST(close_range_cloexec_syzbot)
 	EXPECT_EQ(close(fd1), 0);
 	EXPECT_EQ(close(fd2), 0);
 	EXPECT_EQ(close(fd3), 0);
+	EXPECT_EQ(close(fd4), 0);
 }
 
 /*
diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh
index b583a2fb4504..a8b1dbc0a3a5 100755
--- a/tools/testing/selftests/cpufreq/cpufreq.sh
+++ b/tools/testing/selftests/cpufreq/cpufreq.sh
@@ -178,8 +178,7 @@ cpufreq_basic_tests()
 
 	count=$(count_cpufreq_managed_cpus)
 	if [ $count = 0 ]; then
-		printf "No cpu is managed by cpufreq core, exiting\n"
-		exit;
+		ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting\n"
 	else
 		printf "CPUFreq manages: $count CPUs\n\n"
 	fi
diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh
index 60ce18ed0666..a0eb84cf7167 100755
--- a/tools/testing/selftests/cpufreq/main.sh
+++ b/tools/testing/selftests/cpufreq/main.sh
@@ -7,15 +7,15 @@ source governor.sh
 source module.sh
 source special-tests.sh
 
+DIR="$(dirname $(readlink -f "$0"))"
+source "${DIR}"/../kselftest/ktap_helpers.sh
+
 FUNC=basic	# do basic tests by default
 OUTFILE=cpufreq_selftest
 SYSFS=
 CPUROOT=
 CPUFREQROOT=
 
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
 helpme()
 {
 	printf "Usage: $0 [-h] [-todg args]
@@ -32,7 +32,7 @@ helpme()
 	[-d <driver's module name: only with \"-t modtest>\"]
 	[-g <governor's module name: only with \"-t modtest>\"]
 	\n"
-	exit 2
+	exit "${KSFT_FAIL}"
 }
 
 prerequisite()
@@ -40,8 +40,8 @@ prerequisite()
 	msg="skip all tests:"
 
 	if [ $UID != 0 ]; then
-		echo $msg must be run as root >&2
-		exit $ksft_skip
+		ktap_skip_all "$msg must be run as root"
+		exit "${KSFT_SKIP}"
 	fi
 
 	taskset -p 01 $$
@@ -49,21 +49,21 @@ prerequisite()
 	SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
 
 	if [ ! -d "$SYSFS" ]; then
-		echo $msg sysfs is not mounted >&2
-		exit 2
+		ktap_skip_all "$msg sysfs is not mounted"
+		exit "${KSFT_SKIP}"
 	fi
 
 	CPUROOT=$SYSFS/devices/system/cpu
 	CPUFREQROOT="$CPUROOT/cpufreq"
 
 	if ! ls $CPUROOT/cpu* > /dev/null 2>&1; then
-		echo $msg cpus not available in sysfs >&2
-		exit 2
+		ktap_skip_all "$msg cpus not available in sysfs"
+		exit "${KSFT_SKIP}"
 	fi
 
 	if ! ls $CPUROOT/cpufreq > /dev/null 2>&1; then
-		echo $msg cpufreq directory not available in sysfs >&2
-		exit 2
+		ktap_skip_all "$msg cpufreq directory not available in sysfs"
+		exit "${KSFT_SKIP}"
 	fi
 }
 
@@ -105,8 +105,7 @@ do_test()
 	count=$(count_cpufreq_managed_cpus)
 
 	if [ $count = 0 -a $FUNC != "modtest" ]; then
-		echo "No cpu is managed by cpufreq core, exiting"
-		exit 2;
+		ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting"
 	fi
 
 	case "$FUNC" in
@@ -125,8 +124,7 @@ do_test()
 		"modtest")
 		# Do we have modules in place?
 		if [ -z $DRIVER_MOD ] && [ -z $GOVERNOR_MOD ]; then
-			echo "No driver or governor module passed with -d or -g"
-			exit 2;
+			ktap_exit_fail_msg "No driver or governor module passed with -d or -g"
 		fi
 
 		if [ $DRIVER_MOD ]; then
@@ -137,8 +135,7 @@ do_test()
 			fi
 		else
 			if [ $count = 0 ]; then
-				echo "No cpu is managed by cpufreq core, exiting"
-				exit 2;
+				ktap_exit_fail_msg "No cpu is managed by cpufreq core, exiting"
 			fi
 
 			module_governor_test $GOVERNOR_MOD
@@ -162,7 +159,7 @@ do_test()
 		;;
 
 		*)
-		echo "Invalid [-f] function type"
+		ktap_print_msg "Invalid [-f] function type"
 		helpme
 		;;
 	esac
@@ -186,13 +183,25 @@ dmesg_dumps()
 	dmesg >> $1.dmesg_full.txt
 }
 
+ktap_print_header
+
 # Parse arguments
 parse_arguments $@
 
+ktap_set_plan 1
+
 # Make sure all requirements are met
 prerequisite
 
 # Run requested functions
 clear_dumps $OUTFILE
 do_test | tee -a $OUTFILE.txt
+if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+    exit ${PIPESTATUS[0]};
+fi
 dmesg_dumps $OUTFILE
+
+ktap_test_pass "Completed successfully"
+
+ktap_print_totals
+exit "${KSFT_PASS}"
diff --git a/tools/testing/selftests/cpufreq/module.sh b/tools/testing/selftests/cpufreq/module.sh
index 22563cd122e7..7f2667e0ae2d 100755
--- a/tools/testing/selftests/cpufreq/module.sh
+++ b/tools/testing/selftests/cpufreq/module.sh
@@ -24,16 +24,14 @@ test_basic_insmod_rmmod()
 	# insert module
 	insmod $1
 	if [ $? != 0 ]; then
-		printf "Insmod $1 failed\n"
-		exit;
+		ktap_exit_fail_msg "Insmod $1 failed\n"
 	fi
 
 	printf "Removing $1 module\n"
 	# remove module
 	rmmod $1
 	if [ $? != 0 ]; then
-		printf "rmmod $1 failed\n"
-		exit;
+		ktap_exit_fail_msg "rmmod $1 failed\n"
 	fi
 
 	printf "\n"
diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore
index c6c2965a6607..e65ef9d9cedc 100644
--- a/tools/testing/selftests/damon/.gitignore
+++ b/tools/testing/selftests/damon/.gitignore
@@ -1,2 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 huge_count_read_write
+debugfs_target_ids_read_before_terminate_race
+debugfs_target_ids_pid_leak
+access_memory
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 8a1cc2bf1864..789d6949c247 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -2,6 +2,8 @@
 # Makefile for damon selftests
 
 TEST_GEN_FILES += huge_count_read_write
+TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race
+TEST_GEN_FILES += debugfs_target_ids_pid_leak
 TEST_GEN_FILES += access_memory
 
 TEST_FILES = _chk_dependency.sh _debugfs_common.sh
@@ -9,9 +11,12 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
 TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh
 TEST_PROGS += debugfs_duplicate_context_creation.sh
 TEST_PROGS += debugfs_rm_non_contexts.sh
+TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh
+TEST_PROGS += debugfs_target_ids_pid_leak.sh
 TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh
 TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py
 TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py
+TEST_PROGS += damos_quota.py damos_apply_interval.py
 TEST_PROGS += reclaim.sh lru_sort.sh
 
 include ../lib.mk
diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh
index 0328ac0b5a5e..dda3a87dc00a 100644
--- a/tools/testing/selftests/damon/_chk_dependency.sh
+++ b/tools/testing/selftests/damon/_chk_dependency.sh
@@ -4,7 +4,14 @@
 # Kselftest framework requirement - SKIP code is 4.
 ksft_skip=4
 
-DBGFS=/sys/kernel/debug/damon
+DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}')
+if [ "$DBGFS" = "" ]
+then
+	echo "debugfs not mounted"
+	exit $ksft_skip
+fi
+
+DBGFS+="/damon"
 
 if [ $EUID -ne 0 ];
 then
@@ -18,7 +25,14 @@ then
 	exit $ksft_skip
 fi
 
-for f in attrs target_ids monitor_on
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+	monitor_on_file="monitor_on_DEPRECATED"
+else
+	monitor_on_file="monitor_on"
+fi
+
+for f in attrs target_ids "$monitor_on_file"
 do
 	if [ ! -f "$DBGFS/$f" ]
 	then
@@ -28,7 +42,7 @@ do
 done
 
 permission_error="Operation not permitted"
-for f in attrs target_ids monitor_on
+for f in attrs target_ids "$monitor_on_file"
 do
 	status=$( cat "$DBGFS/$f" 2>&1 )
 	if [ "${status#*$permission_error}" != "$status" ]; then
diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py
index e98cf4b6a4b7..d23d7398a27a 100644
--- a/tools/testing/selftests/damon/_damon_sysfs.py
+++ b/tools/testing/selftests/damon/_damon_sysfs.py
@@ -70,18 +70,65 @@ class DamosAccessPattern:
         if err != None:
             return err
 
+class DamosQuota:
+    sz = None                   # size quota, in bytes
+    ms = None                   # time quota
+    reset_interval_ms = None    # quota reset interval
+    scheme = None               # owner scheme
+
+    def __init__(self, sz=0, ms=0, reset_interval_ms=0):
+        self.sz = sz
+        self.ms = ms
+        self.reset_interval_ms = reset_interval_ms
+
+    def sysfs_dir(self):
+        return os.path.join(self.scheme.sysfs_dir(), 'quotas')
+
+    def stage(self):
+        err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz)
+        if err != None:
+            return err
+        err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms)
+        if err != None:
+            return err
+        err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'),
+                         self.reset_interval_ms)
+        if err != None:
+            return err
+
+class DamosStats:
+    nr_tried = None
+    sz_tried = None
+    nr_applied = None
+    sz_applied = None
+    qt_exceeds = None
+
+    def __init__(self, nr_tried, sz_tried, nr_applied, sz_applied, qt_exceeds):
+        self.nr_tried = nr_tried
+        self.sz_tried = sz_tried
+        self.nr_applied = nr_applied
+        self.sz_applied = sz_applied
+        self.qt_exceeds = qt_exceeds
+
 class Damos:
     action = None
     access_pattern = None
-    # todo: Support quotas, watermarks, stats, tried_regions
+    quota = None
+    apply_interval_us = None
+    # todo: Support watermarks, stats, tried_regions
     idx = None
     context = None
     tried_bytes = None
+    stats = None
 
-    def __init__(self, action='stat', access_pattern=DamosAccessPattern()):
+    def __init__(self, action='stat', access_pattern=DamosAccessPattern(),
+                 quota=DamosQuota(), apply_interval_us=0):
         self.action = action
         self.access_pattern = access_pattern
         self.access_pattern.scheme = self
+        self.quota = quota
+        self.quota.scheme = self
+        self.apply_interval_us = apply_interval_us
 
     def sysfs_dir(self):
         return os.path.join(
@@ -94,13 +141,12 @@ class Damos:
         err = self.access_pattern.stage()
         if err != None:
             return err
-
-        # disable quotas
-        err = write_file(os.path.join(self.sysfs_dir(), 'quotas', 'ms'), '0')
+        err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'),
+                         '%d' % self.apply_interval_us)
         if err != None:
             return err
-        err = write_file(
-                os.path.join(self.sysfs_dir(), 'quotas', 'bytes'), '0')
+
+        err = self.quota.stage()
         if err != None:
             return err
 
@@ -298,6 +344,23 @@ class Kdamond:
                     return err
                 scheme.tried_bytes = int(content)
 
+    def update_schemes_stats(self):
+        err = write_file(os.path.join(self.sysfs_dir(), 'state'),
+                'update_schemes_stats')
+        if err != None:
+            return err
+        for context in self.contexts:
+            for scheme in context.schemes:
+                stat_values = []
+                for stat in ['nr_tried', 'sz_tried', 'nr_applied',
+                             'sz_applied', 'qt_exceeds']:
+                    content, err = read_file(
+                            os.path.join(scheme.sysfs_dir(), 'stats', stat))
+                    if err != None:
+                        return err
+                    stat_values.append(int(content))
+                scheme.stats = DamosStats(*stat_values)
+
 class Kdamonds:
     kdamonds = []
 
diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
index 48989d4813ae..aa995516870b 100644
--- a/tools/testing/selftests/damon/_debugfs_common.sh
+++ b/tools/testing/selftests/damon/_debugfs_common.sh
@@ -45,6 +45,13 @@ test_content() {
 source ./_chk_dependency.sh
 
 damon_onoff="$DBGFS/monitor_on"
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+	damon_onoff="$DBGFS/monitor_on_DEPRECATED"
+else
+	damon_onoff="$DBGFS/monitor_on"
+fi
+
 if [ $(cat "$damon_onoff") = "on" ]
 then
 	echo "monitoring is on"
diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py
new file mode 100644
index 000000000000..f04d43702481
--- /dev/null
+++ b/tools/testing/selftests/damon/damos_apply_interval.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import time
+
+import _damon_sysfs
+
+def main():
+    # access two 10 MiB memory regions, 2 second per each
+    sz_region = 10 * 1024 * 1024
+    proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000'])
+
+    # Set quota up to 1 MiB per 100 ms
+    kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+            contexts=[_damon_sysfs.DamonCtx(
+                ops='vaddr',
+                targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+                schemes=[
+                    _damon_sysfs.Damos(
+                        access_pattern=_damon_sysfs.DamosAccessPattern(
+                            # >= 25% access rate, >= 200ms age
+                            nr_accesses=[5, 20], age=[2, 2**64 - 1]),
+                        # aggregation interval (100 ms) is used
+                        apply_interval_us=0),
+                    # use 10ms apply interval
+                    _damon_sysfs.Damos(
+                        access_pattern=_damon_sysfs.DamosAccessPattern(
+                            # >= 25% access rate, >= 200ms age
+                            nr_accesses=[5, 20], age=[2, 2**64 - 1]),
+                        # explicitly set 10 ms apply interval
+                        apply_interval_us=10 * 1000)
+                    ] # schemes
+                )] # contexts
+            )]) # kdamonds
+
+    err = kdamonds.start()
+    if err != None:
+        print('kdamond start failed: %s' % err)
+        exit(1)
+
+    wss_collected = []
+    nr_quota_exceeds = 0
+    while proc.poll() == None:
+        time.sleep(0.1)
+        err = kdamonds.kdamonds[0].update_schemes_stats()
+        if err != None:
+            print('stats update failed: %s' % err)
+            exit(1)
+    schemes = kdamonds.kdamonds[0].contexts[0].schemes
+    nr_tried_stats = [s.stats.nr_tried for s in schemes]
+    if nr_tried_stats[0] == 0 or nr_tried_stats[1] == 0:
+        print('scheme(s) are not tried')
+        exit(1)
+
+    # Because the second scheme was having the apply interval that is ten times
+    # lower than that of the first scheme, the second scheme should be tried
+    # about ten times more frequently than the first scheme.  For possible
+    # timing errors, check if it was at least nine times more freuqnetly tried.
+    ratio = nr_tried_stats[1] / nr_tried_stats[0]
+    if ratio < 9:
+        print('%d / %d = %f (< 9)' %
+              (nr_tried_stats[1], nr_tried_stats[0], ratio))
+        exit(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py
new file mode 100644
index 000000000000..7d4c6bb2e3cd
--- /dev/null
+++ b/tools/testing/selftests/damon/damos_quota.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import time
+
+import _damon_sysfs
+
+def main():
+    # access two 10 MiB memory regions, 2 second per each
+    sz_region = 10 * 1024 * 1024
+    proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000'])
+
+    # Set quota up to 1 MiB per 100 ms
+    sz_quota = 1024 * 1024 # 1 MiB
+    quota_reset_interval = 100 # 100 ms
+    kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+            contexts=[_damon_sysfs.DamonCtx(
+                ops='vaddr',
+                targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+                schemes=[_damon_sysfs.Damos(
+                    access_pattern=_damon_sysfs.DamosAccessPattern(
+                        # >= 25% access rate, >= 200ms age
+                        nr_accesses=[5, 20], age=[2, 2**64 - 1]),
+                    quota=_damon_sysfs.DamosQuota(
+                        sz=sz_quota, reset_interval_ms=quota_reset_interval)
+                    )] # schemes
+                )] # contexts
+            )]) # kdamonds
+
+    err = kdamonds.start()
+    if err != None:
+        print('kdamond start failed: %s' % err)
+        exit(1)
+
+    wss_collected = []
+    nr_quota_exceeds = 0
+    while proc.poll() == None:
+        time.sleep(0.1)
+        err = kdamonds.kdamonds[0].update_schemes_tried_bytes()
+        if err != None:
+            print('tried bytes update failed: %s' % err)
+            exit(1)
+        err = kdamonds.kdamonds[0].update_schemes_stats()
+        if err != None:
+            print('stats update failed: %s' % err)
+            exit(1)
+
+        scheme = kdamonds.kdamonds[0].contexts[0].schemes[0]
+        wss_collected.append(scheme.tried_bytes)
+        nr_quota_exceeds = scheme.stats.qt_exceeds
+
+    wss_collected.sort()
+    for wss in wss_collected:
+        if wss > sz_quota:
+            print('quota is not kept: %s > %s' % (wss, sz_quota))
+            print('collected samples are as below')
+            print('\n'.join(['%d' % wss for wss in wss_collected]))
+            exit(1)
+
+    if nr_quota_exceeds < len(wss_collected):
+        print('quota is not always exceeded: %d > %d' %
+              (len(wss_collected), nr_quota_exceeds))
+        exit(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh
index 87aff8083822..effbea33dc16 100755
--- a/tools/testing/selftests/damon/debugfs_empty_targets.sh
+++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh
@@ -8,6 +8,14 @@ source _debugfs_common.sh
 
 orig_target_ids=$(cat "$DBGFS/target_ids")
 echo "" > "$DBGFS/target_ids"
-orig_monitor_on=$(cat "$DBGFS/monitor_on")
-test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids"
+
+if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
+then
+	monitor_on_file="$DBGFS/monitor_on_DEPRECATED"
+else
+	monitor_on_file="$DBGFS/monitor_on"
+fi
+
+orig_monitor_on=$(cat "$monitor_on_file")
+test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids"
 echo "$orig_target_ids" > "$DBGFS/target_ids"
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c
new file mode 100644
index 000000000000..0cc2eef7d142
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids"
+
+static void write_targetid_exit(void)
+{
+	int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR);
+	char pid_str[128];
+
+	snprintf(pid_str, sizeof(pid_str), "%d", getpid());
+	write(target_ids_fd, pid_str, sizeof(pid_str));
+	close(target_ids_fd);
+	exit(0);
+}
+
+unsigned long msec_timestamp(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return tv.tv_sec * 1000UL + tv.tv_usec / 1000;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long start_ms;
+	int time_to_run, nr_forks = 0;
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s <msecs to run>\n", argv[0]);
+		exit(1);
+	}
+	time_to_run = atoi(argv[1]);
+
+	start_ms = msec_timestamp();
+	while (true) {
+		int pid = fork();
+
+		if (pid < 0) {
+			fprintf(stderr, "fork() failed\n");
+			exit(1);
+		}
+		if (pid == 0)
+			write_targetid_exit();
+		wait(NULL);
+		nr_forks++;
+
+		if (msec_timestamp() - start_ms > time_to_run)
+			break;
+	}
+	printf("%d\n", nr_forks);
+	return 0;
+}
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh
new file mode 100644
index 000000000000..31fe33c2b032
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+before=$(grep "^pid " /proc/slabinfo | awk '{print $2}')
+
+nr_leaks=$(./debugfs_target_ids_pid_leak 1000)
+expected_after_max=$((before + nr_leaks / 2))
+
+after=$(grep "^pid " /proc/slabinfo | awk '{print $2}')
+
+echo > /sys/kernel/debug/damon/target_ids
+
+echo "tried $nr_leaks pid leak"
+echo "number of active pid slabs: $before -> $after"
+echo "(up to $expected_after_max expected)"
+if [ $after -gt $expected_after_max ]
+then
+	echo "maybe pids are leaking"
+	exit 1
+else
+	exit 0
+fi
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c
new file mode 100644
index 000000000000..b06f52a8ce2d
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#define DBGFS_MONITOR_ON "/sys/kernel/debug/damon/monitor_on_DEPRECATED"
+#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids"
+
+static void turn_damon_on_exit(void)
+{
+	int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR);
+	int monitor_on_fd = open(DBGFS_MONITOR_ON, O_RDWR);
+	char pid_str[128];
+
+	snprintf(pid_str, sizeof(pid_str), "%d", getpid());
+	write(target_ids_fd, pid_str, sizeof(pid_str));
+	write(monitor_on_fd, "on\n", 3);
+	close(target_ids_fd);
+	close(monitor_on_fd);
+	usleep(1000);
+	exit(0);
+}
+
+static void try_race(void)
+{
+	int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR);
+	int pid = fork();
+	int buf[256];
+
+	if (pid < 0) {
+		fprintf(stderr, "fork() failed\n");
+		exit(1);
+	}
+	if (pid == 0)
+		turn_damon_on_exit();
+	while (true) {
+		int status;
+
+		read(target_ids_fd, buf, sizeof(buf));
+		if (waitpid(-1, &status, WNOHANG) == pid)
+			break;
+	}
+	close(target_ids_fd);
+}
+
+static inline uint64_t ts_to_ms(struct timespec *ts)
+{
+	return (uint64_t)ts->tv_sec * 1000 + (uint64_t)ts->tv_nsec / 1000000;
+}
+
+int main(int argc, char *argv[])
+{
+	struct timespec start_time, now;
+	int runtime_ms;
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s <runtime in ms>\n", argv[0]);
+		exit(1);
+	}
+	runtime_ms = atoi(argv[1]);
+	clock_gettime(CLOCK_MONOTONIC, &start_time);
+	while (true) {
+		try_race();
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		if (ts_to_ms(&now) - ts_to_ms(&start_time) > runtime_ms)
+			break;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh
new file mode 100644
index 000000000000..fc793c4c9aea
--- /dev/null
+++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+dmesg -C
+
+./debugfs_target_ids_read_before_terminate_race 5000
+
+if dmesg | grep -q dbgfs_target_ids_read
+then
+	dmesg
+	exit 1
+else
+	exit 0
+fi
diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py
index 8c690ba1a573..28c887a0108f 100644
--- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py
+++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py
@@ -20,7 +20,7 @@ def main():
 
     err = kdamonds.start()
     if err != None:
-        print('kdmaond start failed: %s' % err)
+        print('kdamond start failed: %s' % err)
         exit(1)
 
     while proc.poll() == None:
diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
index cdbf19b442c9..90ad7409a7a6 100644
--- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
+++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
@@ -23,7 +23,7 @@ def main():
 
     err = kdamonds.start()
     if err != None:
-        print('kdmaond start failed: %s' % err)
+        print('kdamond start failed: %s' % err)
         exit(1)
 
     wss_collected = []
diff --git a/tools/testing/selftests/devices/Makefile b/tools/testing/selftests/devices/Makefile
new file mode 100644
index 000000000000..ca29249b30c3
--- /dev/null
+++ b/tools/testing/selftests/devices/Makefile
@@ -0,0 +1,4 @@
+TEST_PROGS := test_discoverable_devices.py
+TEST_FILES := boards ksft.py
+
+include ../lib.mk
diff --git a/tools/testing/selftests/devices/boards/Dell Inc.,XPS 13 9300.yaml b/tools/testing/selftests/devices/boards/Dell Inc.,XPS 13 9300.yaml
new file mode 100644
index 000000000000..ff932eb19f0b
--- /dev/null
+++ b/tools/testing/selftests/devices/boards/Dell Inc.,XPS 13 9300.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# This is the device definition for the XPS 13 9300.
+# The filename "Dell Inc.,XPS 13 9300" was chosen following the format
+# "Vendor,Product", where Vendor comes from
+# /sys/devices/virtual/dmi/id/sys_vendor, and Product comes from
+# /sys/devices/virtual/dmi/id/product_name.
+#
+# See google,spherion.yaml for more information.
+#
+- type: pci-controller
+  # This machine has a single PCI host controller so it's valid to not have any
+  # key to identify the controller. If it had more than one controller, the UID
+  # of the controller from ACPI could be used to distinguish as follows:
+  #acpi-uid: 0
+  devices:
+    - path: 14.0
+      type: usb-controller
+      usb-version: 2
+      devices:
+        - path: 9
+          name: camera
+          interfaces: [0, 1, 2, 3]
+        - path: 10
+          name: bluetooth
+          interfaces: [0, 1]
+    - path: 2.0
+      name: gpu
+    - path: 4.0
+      name: thermal
+    - path: 12.0
+      name: sensors
+    - path: 14.3
+      name: wifi
+    - path: 1d.0/0.0
+      name: ssd
+    - path: 1d.7/0.0
+      name: sdcard-reader
+    - path: 1f.3
+      name: audio
diff --git a/tools/testing/selftests/devices/boards/google,spherion.yaml b/tools/testing/selftests/devices/boards/google,spherion.yaml
new file mode 100644
index 000000000000..17157ecd8c14
--- /dev/null
+++ b/tools/testing/selftests/devices/boards/google,spherion.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# This is the device definition for the Google Spherion Chromebook.
+# The filename "google,spherion" comes from the Devicetree compatible, so this
+# file will be automatically used when the test is run on that machine.
+#
+# The top-level is a list of controllers, either for USB or PCI(e).
+# Every controller needs to have a 'type' key set to either 'usb-controller' or
+# 'pci-controller'.
+# Every controller needs to be uniquely identified on the platform. To achieve
+# this, several optional keys can be used:
+# - dt-mmio: identify the MMIO address of the controller as defined in the
+#   Devicetree.
+# - usb-version: for USB controllers to differentiate between USB3 and USB2
+#   buses sharing the same controller.
+# - acpi-uid: _UID property of the controller as supplied by the ACPI. Useful to
+#   distinguish between multiple PCI host controllers.
+#
+# The 'devices' key defines a list of devices that are accessible under that
+# controller. A device might be a leaf device or another controller (see
+# 'Dell Inc.,XPS 13 9300.yaml').
+#
+# The 'path' key is needed for every child device (that is, not top-level) to
+# define how to reach this device from the parent controller. For USB devices it
+# follows the format \d(.\d)* and denotes the port in the hub at each level in
+# the USB topology. For PCI devices it follows the format \d.\d(/\d.\d)*
+# denoting the device (identified by device-function pair) at each level in the
+# PCI topology.
+#
+# The 'name' key is used in the leaf devices to name the device for clarity in
+# the test output.
+#
+# For USB leaf devices, the 'interfaces' key should contain a list of the
+# interfaces in that device that should be bound to a driver.
+#
+- type: usb-controller
+  dt-mmio: 11200000
+  usb-version: 2
+  devices:
+    - path: 1.4.1
+      interfaces: [0, 1]
+      name: camera
+    - path: 1.4.2
+      interfaces: [0, 1]
+      name: bluetooth
+- type: pci-controller
+  dt-mmio: 11230000
+  devices:
+    - path: 0.0/0.0
+      name: wifi
diff --git a/tools/testing/selftests/devices/ksft.py b/tools/testing/selftests/devices/ksft.py
new file mode 100644
index 000000000000..cd89fb2bc10e
--- /dev/null
+++ b/tools/testing/selftests/devices/ksft.py
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2023 Collabora Ltd
+#
+# Kselftest helpers for outputting in KTAP format. Based on kselftest.h.
+#
+
+import sys
+
+ksft_cnt = {"pass": 0, "fail": 0, "skip": 0}
+ksft_num_tests = 0
+ksft_test_number = 1
+
+KSFT_PASS = 0
+KSFT_FAIL = 1
+KSFT_SKIP = 4
+
+
+def print_header():
+    print("TAP version 13")
+
+
+def set_plan(num_tests):
+    global ksft_num_tests
+    ksft_num_tests = num_tests
+    print("1..{}".format(num_tests))
+
+
+def print_cnts():
+    print(
+        f"# Totals: pass:{ksft_cnt['pass']} fail:{ksft_cnt['fail']} xfail:0 xpass:0 skip:{ksft_cnt['skip']} error:0"
+    )
+
+
+def print_msg(msg):
+    print(f"# {msg}")
+
+
+def _test_print(result, description, directive=None):
+    if directive:
+        directive_str = f"# {directive}"
+    else:
+        directive_str = ""
+
+    global ksft_test_number
+    print(f"{result} {ksft_test_number} {description} {directive_str}")
+    ksft_test_number += 1
+
+
+def test_result_pass(description):
+    _test_print("ok", description)
+    ksft_cnt["pass"] += 1
+
+
+def test_result_fail(description):
+    _test_print("not ok", description)
+    ksft_cnt["fail"] += 1
+
+
+def test_result_skip(description):
+    _test_print("ok", description, "SKIP")
+    ksft_cnt["skip"] += 1
+
+
+def test_result(condition, description=""):
+    if condition:
+        test_result_pass(description)
+    else:
+        test_result_fail(description)
+
+
+def finished():
+    if ksft_cnt["pass"] == ksft_num_tests:
+        exit_code = KSFT_PASS
+    else:
+        exit_code = KSFT_FAIL
+
+    print_cnts()
+
+    sys.exit(exit_code)
+
+
+def exit_fail():
+    print_cnts()
+    sys.exit(KSFT_FAIL)
+
+
+def exit_pass():
+    print_cnts()
+    sys.exit(KSFT_PASS)
diff --git a/tools/testing/selftests/devices/test_discoverable_devices.py b/tools/testing/selftests/devices/test_discoverable_devices.py
new file mode 100755
index 000000000000..fbae8deb593d
--- /dev/null
+++ b/tools/testing/selftests/devices/test_discoverable_devices.py
@@ -0,0 +1,318 @@
+#!/usr/bin/python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2023 Collabora Ltd
+#
+# This script tests for presence and driver binding of devices from discoverable
+# buses (ie USB, PCI).
+#
+# The per-platform YAML file defining the devices to be tested is stored inside
+# the boards/ directory and chosen based on DT compatible or DMI IDs (sys_vendor
+# and product_name).
+#
+# See boards/google,spherion.yaml and boards/'Dell Inc.,XPS 13 9300.yaml' for
+# the description and examples of the file structure and vocabulary.
+#
+
+import glob
+import ksft
+import os
+import re
+import sys
+import yaml
+
+pci_controllers = []
+usb_controllers = []
+
+sysfs_usb_devices = "/sys/bus/usb/devices/"
+
+
+def find_pci_controller_dirs():
+    sysfs_devices = "/sys/devices"
+    pci_controller_sysfs_dir = "pci[0-9a-f]{4}:[0-9a-f]{2}"
+
+    dir_regex = re.compile(pci_controller_sysfs_dir)
+    for path, dirs, _ in os.walk(sysfs_devices):
+        for d in dirs:
+            if dir_regex.match(d):
+                pci_controllers.append(os.path.join(path, d))
+
+
+def find_usb_controller_dirs():
+    usb_controller_sysfs_dir = "usb[\d]+"
+
+    dir_regex = re.compile(usb_controller_sysfs_dir)
+    for d in os.scandir(sysfs_usb_devices):
+        if dir_regex.match(d.name):
+            usb_controllers.append(os.path.realpath(d.path))
+
+
+def get_dt_mmio(sysfs_dev_dir):
+    re_dt_mmio = re.compile("OF_FULLNAME=.*@([0-9a-f]+)")
+    dt_mmio = None
+
+    # PCI controllers' sysfs don't have an of_node, so have to read it from the
+    # parent
+    while not dt_mmio:
+        try:
+            with open(os.path.join(sysfs_dev_dir, "uevent")) as f:
+                dt_mmio = re_dt_mmio.search(f.read()).group(1)
+                return dt_mmio
+        except:
+            pass
+        sysfs_dev_dir = os.path.dirname(sysfs_dev_dir)
+
+
+def get_acpi_uid(sysfs_dev_dir):
+    with open(os.path.join(sysfs_dev_dir, "firmware_node", "uid")) as f:
+        return f.read()
+
+
+def get_usb_version(sysfs_dev_dir):
+    re_usb_version = re.compile("PRODUCT=.*/(\d)/.*")
+    with open(os.path.join(sysfs_dev_dir, "uevent")) as f:
+        return int(re_usb_version.search(f.read()).group(1))
+
+
+def get_usb_busnum(sysfs_dev_dir):
+    re_busnum = re.compile("BUSNUM=(.*)")
+    with open(os.path.join(sysfs_dev_dir, "uevent")) as f:
+        return int(re_busnum.search(f.read()).group(1))
+
+
+def find_controller_in_sysfs(controller, parent_sysfs=None):
+    if controller["type"] == "pci-controller":
+        controllers = pci_controllers
+    elif controller["type"] == "usb-controller":
+        controllers = usb_controllers
+
+    result_controllers = []
+
+    for c in controllers:
+        if parent_sysfs and parent_sysfs not in c:
+            continue
+
+        if controller.get("dt-mmio"):
+            if str(controller["dt-mmio"]) != get_dt_mmio(c):
+                continue
+
+        if controller.get("usb-version"):
+            if controller["usb-version"] != get_usb_version(c):
+                continue
+
+        if controller.get("acpi-uid"):
+            if controller["acpi-uid"] != get_acpi_uid(c):
+                continue
+
+        result_controllers.append(c)
+
+    return result_controllers
+
+
+def is_controller(device):
+    return device.get("type") and "controller" in device.get("type")
+
+
+def path_to_dir(parent_sysfs, dev_type, path):
+    if dev_type == "usb-device":
+        usb_dev_sysfs_fmt = "{}-{}"
+        busnum = get_usb_busnum(parent_sysfs)
+        dirname = os.path.join(
+            sysfs_usb_devices, usb_dev_sysfs_fmt.format(busnum, path)
+        )
+        return [os.path.realpath(dirname)]
+    else:
+        pci_dev_sysfs_fmt = "????:??:{}"
+        path_glob = ""
+        for dev_func in path.split("/"):
+            dev_func = dev_func.zfill(4)
+            path_glob = os.path.join(path_glob, pci_dev_sysfs_fmt.format(dev_func))
+
+        dir_list = glob.glob(os.path.join(parent_sysfs, path_glob))
+
+        return dir_list
+
+
+def find_in_sysfs(device, parent_sysfs=None):
+    if parent_sysfs and device.get("path"):
+        pathdirs = path_to_dir(
+            parent_sysfs, device["meta"]["type"], str(device["path"])
+        )
+        if len(pathdirs) != 1:
+            # Early return to report error
+            return pathdirs
+        pathdir = pathdirs[0]
+        sysfs_path = os.path.join(parent_sysfs, pathdir)
+    else:
+        sysfs_path = parent_sysfs
+
+    if is_controller(device):
+        return find_controller_in_sysfs(device, sysfs_path)
+    else:
+        return [sysfs_path]
+
+
+def check_driver_presence(sysfs_dir, current_node):
+    if current_node["meta"]["type"] == "usb-device":
+        usb_intf_fmt = "*-*:*.{}"
+
+        interfaces = []
+        for i in current_node["interfaces"]:
+            interfaces.append((i, usb_intf_fmt.format(i)))
+
+        for intf_num, intf_dir_fmt in interfaces:
+            test_name = f"{current_node['meta']['pathname']}.{intf_num}.driver"
+
+            intf_dirs = glob.glob(os.path.join(sysfs_dir, intf_dir_fmt))
+            if len(intf_dirs) != 1:
+                ksft.test_result_fail(test_name)
+                continue
+            intf_dir = intf_dirs[0]
+
+            driver_link = os.path.join(sysfs_dir, intf_dir, "driver")
+            ksft.test_result(os.path.isdir(driver_link), test_name)
+    else:
+        driver_link = os.path.join(sysfs_dir, "driver")
+        test_name = current_node["meta"]["pathname"] + ".driver"
+        ksft.test_result(os.path.isdir(driver_link), test_name)
+
+
+def generate_pathname(device):
+    pathname = ""
+
+    if device.get("path"):
+        pathname = str(device["path"])
+
+    if device.get("type"):
+        dev_type = device["type"]
+        if device.get("usb-version"):
+            dev_type = dev_type.replace("usb", "usb" + str(device["usb-version"]))
+        if device.get("acpi-uid") is not None:
+            dev_type = dev_type.replace("pci", "pci" + str(device["acpi-uid"]))
+        pathname = pathname + "/" + dev_type
+
+    if device.get("dt-mmio"):
+        pathname += "@" + str(device["dt-mmio"])
+
+    if device.get("name"):
+        pathname = pathname + "/" + device["name"]
+
+    return pathname
+
+
+def fill_meta_keys(child, parent=None):
+    child["meta"] = {}
+
+    if parent:
+        child["meta"]["type"] = parent["type"].replace("controller", "device")
+
+    pathname = generate_pathname(child)
+    if parent:
+        pathname = parent["meta"]["pathname"] + "/" + pathname
+    child["meta"]["pathname"] = pathname
+
+
+def parse_device_tree_node(current_node, parent_sysfs=None):
+    if not parent_sysfs:
+        fill_meta_keys(current_node)
+
+    sysfs_dirs = find_in_sysfs(current_node, parent_sysfs)
+    if len(sysfs_dirs) != 1:
+        if len(sysfs_dirs) == 0:
+            ksft.test_result_fail(
+                f"Couldn't find in sysfs: {current_node['meta']['pathname']}"
+            )
+        else:
+            ksft.test_result_fail(
+                f"Found multiple sysfs entries for {current_node['meta']['pathname']}: {sysfs_dirs}"
+            )
+        return
+    sysfs_dir = sysfs_dirs[0]
+
+    if not is_controller(current_node):
+        ksft.test_result(
+            os.path.exists(sysfs_dir), current_node["meta"]["pathname"] + ".device"
+        )
+        check_driver_presence(sysfs_dir, current_node)
+    else:
+        for child_device in current_node["devices"]:
+            fill_meta_keys(child_device, current_node)
+            parse_device_tree_node(child_device, sysfs_dir)
+
+
+def count_tests(device_trees):
+    test_count = 0
+
+    def parse_node(device):
+        nonlocal test_count
+        if device.get("devices"):
+            for child in device["devices"]:
+                parse_node(child)
+        else:
+            if device.get("interfaces"):
+                test_count += len(device["interfaces"])
+            else:
+                test_count += 1
+            test_count += 1
+
+    for device_tree in device_trees:
+        parse_node(device_tree)
+
+    return test_count
+
+
+def get_board_filenames():
+    filenames = []
+
+    platform_compatible_file = "/proc/device-tree/compatible"
+    if os.path.exists(platform_compatible_file):
+        with open(platform_compatible_file) as f:
+            for line in f:
+                filenames.extend(line.split("\0"))
+    else:
+        dmi_id_dir = "/sys/devices/virtual/dmi/id"
+        vendor_dmi_file = os.path.join(dmi_id_dir, "sys_vendor")
+        product_dmi_file = os.path.join(dmi_id_dir, "product_name")
+
+        with open(vendor_dmi_file) as f:
+            vendor = f.read().replace("\n", "")
+        with open(product_dmi_file) as f:
+            product = f.read().replace("\n", "")
+
+        filenames = [vendor + "," + product]
+
+    return filenames
+
+
+def run_test(yaml_file):
+    ksft.print_msg(f"Using board file: {yaml_file}")
+
+    with open(yaml_file) as f:
+        device_trees = yaml.safe_load(f)
+
+    ksft.set_plan(count_tests(device_trees))
+
+    for device_tree in device_trees:
+        parse_device_tree_node(device_tree)
+
+
+find_pci_controller_dirs()
+find_usb_controller_dirs()
+
+ksft.print_header()
+
+board_file = ""
+for board_filename in get_board_filenames():
+    full_board_filename = os.path.join("boards", board_filename + ".yaml")
+
+    if os.path.exists(full_board_filename):
+        board_file = full_board_filename
+        break
+
+if not board_file:
+    ksft.print_msg("No matching board file found")
+    ksft.exit_fail()
+
+run_test(board_file)
+
+ksft.finished()
diff --git a/tools/testing/selftests/dmabuf-heaps/config b/tools/testing/selftests/dmabuf-heaps/config
new file mode 100644
index 000000000000..be091f1cdfa0
--- /dev/null
+++ b/tools/testing/selftests/dmabuf-heaps/config
@@ -0,0 +1,3 @@
+CONFIG_DMABUF_HEAPS=y
+CONFIG_DMABUF_HEAPS_SYSTEM=y
+CONFIG_DRM_VGEM=y
diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
index 890a8236a8ba..5f541522364f 100644
--- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
+++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
@@ -15,6 +15,7 @@
 #include <linux/dma-buf.h>
 #include <linux/dma-heap.h>
 #include <drm/drm.h>
+#include "../kselftest.h"
 
 #define DEVPATH "/dev/dma_heap"
 
@@ -90,14 +91,13 @@ static int dmabuf_heap_open(char *name)
 	char buf[256];
 
 	ret = snprintf(buf, 256, "%s/%s", DEVPATH, name);
-	if (ret < 0) {
-		printf("snprintf failed!\n");
-		return ret;
-	}
+	if (ret < 0)
+		ksft_exit_fail_msg("snprintf failed! %d\n", ret);
 
 	fd = open(buf, O_RDWR);
 	if (fd < 0)
-		printf("open %s failed!\n", buf);
+		ksft_exit_fail_msg("open %s failed: %s\n", buf, strerror(errno));
+
 	return fd;
 }
 
@@ -140,7 +140,7 @@ static int dmabuf_sync(int fd, int start_stop)
 
 #define ONE_MEG (1024 * 1024)
 
-static int test_alloc_and_import(char *heap_name)
+static void test_alloc_and_import(char *heap_name)
 {
 	int heap_fd = -1, dmabuf_fd = -1, importer_fd = -1;
 	uint32_t handle = 0;
@@ -148,27 +148,19 @@ static int test_alloc_and_import(char *heap_name)
 	int ret;
 
 	heap_fd = dmabuf_heap_open(heap_name);
-	if (heap_fd < 0)
-		return -1;
 
-	printf("  Testing allocation and importing:  ");
+	ksft_print_msg("Testing allocation and importing:\n");
 	ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0, &dmabuf_fd);
 	if (ret) {
-		printf("FAIL (Allocation Failed!)\n");
-		ret = -1;
-		goto out;
+		ksft_test_result_fail("FAIL (Allocation Failed!) %d\n", ret);
+		return;
 	}
+
 	/* mmap and write a simple pattern */
-	p = mmap(NULL,
-		 ONE_MEG,
-		 PROT_READ | PROT_WRITE,
-		 MAP_SHARED,
-		 dmabuf_fd,
-		 0);
+	p = mmap(NULL, ONE_MEG, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd, 0);
 	if (p == MAP_FAILED) {
-		printf("FAIL (mmap() failed)\n");
-		ret = -1;
-		goto out;
+		ksft_test_result_fail("FAIL (mmap() failed): %s\n", strerror(errno));
+		goto close_and_return;
 	}
 
 	dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START);
@@ -178,71 +170,64 @@ static int test_alloc_and_import(char *heap_name)
 
 	importer_fd = open_vgem();
 	if (importer_fd < 0) {
-		ret = importer_fd;
-		printf("(Could not open vgem - skipping):  ");
+		ksft_test_result_skip("Could not open vgem %d\n", importer_fd);
 	} else {
 		ret = import_vgem_fd(importer_fd, dmabuf_fd, &handle);
-		if (ret < 0) {
-			printf("FAIL (Failed to import buffer)\n");
-			goto out;
-		}
+		ksft_test_result(ret >= 0, "Import buffer %d\n", ret);
 	}
 
 	ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START);
 	if (ret < 0) {
-		printf("FAIL (DMA_BUF_SYNC_START failed!)\n");
+		ksft_print_msg("FAIL (DMA_BUF_SYNC_START failed!) %d\n", ret);
 		goto out;
 	}
 
 	memset(p, 0xff, ONE_MEG);
 	ret = dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_END);
 	if (ret < 0) {
-		printf("FAIL (DMA_BUF_SYNC_END failed!)\n");
+		ksft_print_msg("FAIL (DMA_BUF_SYNC_END failed!) %d\n", ret);
 		goto out;
 	}
 
 	close_handle(importer_fd, handle);
-	ret = 0;
-	printf(" OK\n");
+	ksft_test_result_pass("%s dmabuf sync succeeded\n", __func__);
+	return;
+
 out:
-	if (p)
-		munmap(p, ONE_MEG);
-	if (importer_fd >= 0)
-		close(importer_fd);
-	if (dmabuf_fd >= 0)
-		close(dmabuf_fd);
-	if (heap_fd >= 0)
-		close(heap_fd);
+	ksft_test_result_fail("%s dmabuf sync failed\n", __func__);
+	munmap(p, ONE_MEG);
+	close(importer_fd);
 
-	return ret;
+close_and_return:
+	close(dmabuf_fd);
+	close(heap_fd);
 }
 
-static int test_alloc_zeroed(char *heap_name, size_t size)
+static void test_alloc_zeroed(char *heap_name, size_t size)
 {
 	int heap_fd = -1, dmabuf_fd[32];
-	int i, j, ret;
+	int i, j, k, ret;
 	void *p = NULL;
 	char *c;
 
-	printf("  Testing alloced %ldk buffers are zeroed:  ", size / 1024);
+	ksft_print_msg("Testing alloced %ldk buffers are zeroed:\n", size / 1024);
 	heap_fd = dmabuf_heap_open(heap_name);
-	if (heap_fd < 0)
-		return -1;
 
 	/* Allocate and fill a bunch of buffers */
 	for (i = 0; i < 32; i++) {
 		ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]);
-		if (ret < 0) {
-			printf("FAIL (Allocation (%i) failed)\n", i);
-			goto out;
+		if (ret) {
+			ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret);
+			goto close_and_return;
 		}
+
 		/* mmap and fill with simple pattern */
 		p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0);
 		if (p == MAP_FAILED) {
-			printf("FAIL (mmap() failed!)\n");
-			ret = -1;
-			goto out;
+			ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno));
+			goto close_and_return;
 		}
+
 		dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START);
 		memset(p, 0xff, size);
 		dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END);
@@ -251,48 +236,47 @@ static int test_alloc_zeroed(char *heap_name, size_t size)
 	/* close them all */
 	for (i = 0; i < 32; i++)
 		close(dmabuf_fd[i]);
+	ksft_test_result_pass("Allocate and fill a bunch of buffers\n");
 
 	/* Allocate and validate all buffers are zeroed */
 	for (i = 0; i < 32; i++) {
 		ret = dmabuf_heap_alloc(heap_fd, size, 0, &dmabuf_fd[i]);
 		if (ret < 0) {
-			printf("FAIL (Allocation (%i) failed)\n", i);
-			goto out;
+			ksft_test_result_fail("FAIL (Allocation (%i) failed) %d\n", i, ret);
+			goto close_and_return;
 		}
 
 		/* mmap and validate everything is zero */
 		p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, dmabuf_fd[i], 0);
 		if (p == MAP_FAILED) {
-			printf("FAIL (mmap() failed!)\n");
-			ret = -1;
-			goto out;
+			ksft_test_result_fail("FAIL (mmap() failed!): %s\n", strerror(errno));
+			goto close_and_return;
 		}
+
 		dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_START);
 		c = (char *)p;
 		for (j = 0; j < size; j++) {
 			if (c[j] != 0) {
-				printf("FAIL (Allocated buffer not zeroed @ %i)\n", j);
-				break;
+				ksft_print_msg("FAIL (Allocated buffer not zeroed @ %i)\n", j);
+				dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END);
+				munmap(p, size);
+				goto out;
 			}
 		}
 		dmabuf_sync(dmabuf_fd[i], DMA_BUF_SYNC_END);
 		munmap(p, size);
 	}
-	/* close them all */
-	for (i = 0; i < 32; i++)
-		close(dmabuf_fd[i]);
-
-	close(heap_fd);
-	printf("OK\n");
-	return 0;
 
 out:
-	while (i > 0) {
-		close(dmabuf_fd[i]);
-		i--;
-	}
+	ksft_test_result(i == 32, "Allocate and validate all buffers are zeroed\n");
+
+close_and_return:
+	/* close them all */
+	for (k = 0; k < i; k++)
+		close(dmabuf_fd[k]);
+
 	close(heap_fd);
-	return ret;
+	return;
 }
 
 /* Test the ioctl version compatibility w/ a smaller structure then expected */
@@ -360,126 +344,97 @@ static int dmabuf_heap_alloc_newer(int fd, size_t len, unsigned int flags,
 	return ret;
 }
 
-static int test_alloc_compat(char *heap_name)
+static void test_alloc_compat(char *heap_name)
 {
-	int heap_fd = -1, dmabuf_fd = -1;
-	int ret;
+	int ret, heap_fd = -1, dmabuf_fd = -1;
 
 	heap_fd = dmabuf_heap_open(heap_name);
-	if (heap_fd < 0)
-		return -1;
 
-	printf("  Testing (theoretical)older alloc compat:  ");
+	ksft_print_msg("Testing (theoretical) older alloc compat:\n");
 	ret = dmabuf_heap_alloc_older(heap_fd, ONE_MEG, 0, &dmabuf_fd);
-	if (ret) {
-		printf("FAIL (Older compat allocation failed!)\n");
-		ret = -1;
-		goto out;
-	}
-	close(dmabuf_fd);
-	printf("OK\n");
+	if (dmabuf_fd >= 0)
+		close(dmabuf_fd);
+	ksft_test_result(!ret, "dmabuf_heap_alloc_older\n");
 
-	printf("  Testing (theoretical)newer alloc compat:  ");
+	ksft_print_msg("Testing (theoretical) newer alloc compat:\n");
 	ret = dmabuf_heap_alloc_newer(heap_fd, ONE_MEG, 0, &dmabuf_fd);
-	if (ret) {
-		printf("FAIL (Newer compat allocation failed!)\n");
-		ret = -1;
-		goto out;
-	}
-	printf("OK\n");
-out:
 	if (dmabuf_fd >= 0)
 		close(dmabuf_fd);
-	if (heap_fd >= 0)
-		close(heap_fd);
+	ksft_test_result(!ret, "dmabuf_heap_alloc_newer\n");
 
-	return ret;
+	close(heap_fd);
 }
 
-static int test_alloc_errors(char *heap_name)
+static void test_alloc_errors(char *heap_name)
 {
 	int heap_fd = -1, dmabuf_fd = -1;
 	int ret;
 
 	heap_fd = dmabuf_heap_open(heap_name);
-	if (heap_fd < 0)
-		return -1;
 
-	printf("  Testing expected error cases:  ");
+	ksft_print_msg("Testing expected error cases:\n");
 	ret = dmabuf_heap_alloc(0, ONE_MEG, 0x111111, &dmabuf_fd);
-	if (!ret) {
-		printf("FAIL (Did not see expected error (invalid fd)!)\n");
-		ret = -1;
-		goto out;
-	}
+	ksft_test_result(ret, "Error expected on invalid fd %d\n", ret);
 
 	ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0x111111, &dmabuf_fd);
-	if (!ret) {
-		printf("FAIL (Did not see expected error (invalid heap flags)!)\n");
-		ret = -1;
-		goto out;
-	}
+	ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret);
 
 	ret = dmabuf_heap_alloc_fdflags(heap_fd, ONE_MEG,
 					~(O_RDWR | O_CLOEXEC), 0, &dmabuf_fd);
-	if (!ret) {
-		printf("FAIL (Did not see expected error (invalid fd flags)!)\n");
-		ret = -1;
-		goto out;
-	}
+	ksft_test_result(ret, "Error expected on invalid heap flags %d\n", ret);
 
-	printf("OK\n");
-	ret = 0;
-out:
 	if (dmabuf_fd >= 0)
 		close(dmabuf_fd);
-	if (heap_fd >= 0)
-		close(heap_fd);
+	close(heap_fd);
+}
 
-	return ret;
+static int numer_of_heaps(void)
+{
+	DIR *d = opendir(DEVPATH);
+	struct dirent *dir;
+	int heaps = 0;
+
+	while ((dir = readdir(d))) {
+		if (!strncmp(dir->d_name, ".", 2))
+			continue;
+		if (!strncmp(dir->d_name, "..", 3))
+			continue;
+		heaps++;
+	}
+
+	return heaps;
 }
 
 int main(void)
 {
-	DIR *d;
 	struct dirent *dir;
-	int ret = -1;
+	DIR *d;
+
+	ksft_print_header();
 
 	d = opendir(DEVPATH);
 	if (!d) {
-		printf("No %s directory?\n", DEVPATH);
-		return -1;
+		ksft_print_msg("No %s directory?\n", DEVPATH);
+		return KSFT_SKIP;
 	}
 
-	while ((dir = readdir(d)) != NULL) {
+	ksft_set_plan(11 * numer_of_heaps());
+
+	while ((dir = readdir(d))) {
 		if (!strncmp(dir->d_name, ".", 2))
 			continue;
 		if (!strncmp(dir->d_name, "..", 3))
 			continue;
 
-		printf("Testing heap: %s\n", dir->d_name);
-		printf("=======================================\n");
-		ret = test_alloc_and_import(dir->d_name);
-		if (ret)
-			break;
-
-		ret = test_alloc_zeroed(dir->d_name, 4 * 1024);
-		if (ret)
-			break;
-
-		ret = test_alloc_zeroed(dir->d_name, ONE_MEG);
-		if (ret)
-			break;
-
-		ret = test_alloc_compat(dir->d_name);
-		if (ret)
-			break;
-
-		ret = test_alloc_errors(dir->d_name);
-		if (ret)
-			break;
+		ksft_print_msg("Testing heap: %s\n", dir->d_name);
+		ksft_print_msg("=======================================\n");
+		test_alloc_and_import(dir->d_name);
+		test_alloc_zeroed(dir->d_name, 4 * 1024);
+		test_alloc_zeroed(dir->d_name, ONE_MEG);
+		test_alloc_compat(dir->d_name);
+		test_alloc_errors(dir->d_name);
 	}
 	closedir(d);
 
-	return ret;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
new file mode 100644
index 000000000000..e54f382bcb02
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_INCLUDES := $(wildcard lib/py/*.py)
+
+TEST_PROGS := \
+	ping.py \
+	queues.py \
+	stats.py \
+# end of TEST_PROGS
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst
new file mode 100644
index 000000000000..3b6a29e6564b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/README.rst
@@ -0,0 +1,136 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Running driver tests
+====================
+
+Networking driver tests are executed within kselftest framework like any
+other tests. They support testing both real device drivers and emulated /
+software drivers (latter mostly to test the core parts of the stack).
+
+SW mode
+~~~~~~~
+
+By default, when no extra parameters are set or exported, tests execute
+against software drivers such as netdevsim. No extra preparation is required
+the software devices are created and destroyed as part of the test.
+In this mode the tests are indistinguishable from other selftests and
+(for example) can be run under ``virtme-ng`` like the core networking selftests.
+
+HW mode
+~~~~~~~
+
+Executing tests against a real device requires external preparation.
+The netdevice against which tests will be run must exist, be running
+(in UP state) and be configured with an IP address.
+
+Refer to list of :ref:`Variables` later in this file to set up running
+the tests against a real device.
+
+Both modes required
+~~~~~~~~~~~~~~~~~~~
+
+All tests in drivers/net must support running both against a software device
+and a real device. SW-only tests should instead be placed in net/ or
+drivers/net/netdevsim, HW-only tests in drivers/net/hw.
+
+Variables
+=========
+
+The variables can be set in the environment or by creating a net.config
+file in the same directory as this README file. Example::
+
+  $ NETIF=eth0 ./some_test.sh
+
+or::
+
+  $ cat tools/testing/selftests/drivers/net/net.config
+  # Variable set in a file
+  NETIF=eth0
+
+Local test (which don't require endpoint for sending / receiving traffic)
+need only the ``NETIF`` variable. Remaining variables define the endpoint
+and communication method.
+
+NETIF
+~~~~~
+
+Name of the netdevice against which the test should be executed.
+When empty or not set software devices will be used.
+
+LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Local and remote endpoint IP addresses.
+
+REMOTE_TYPE
+~~~~~~~~~~~
+
+Communication method used to run commands on the remote endpoint.
+Test framework has built-in support for ``netns`` and ``ssh`` channels.
+``netns`` assumes the "remote" interface is part of the same
+host, just moved to the specified netns.
+``ssh`` communicates with remote endpoint over ``ssh`` and ``scp``.
+Using persistent SSH connections is strongly encouraged to avoid
+the latency of SSH connection setup on every command.
+
+Communication methods are defined by classes in ``lib/py/remote_{name}.py``.
+It should be possible to add a new method without modifying any of
+the framework, by simply adding an appropriately named file to ``lib/py``.
+
+REMOTE_ARGS
+~~~~~~~~~~~
+
+Arguments used to construct the communication channel.
+Communication channel dependent::
+
+  for netns - name of the "remote" namespace
+  for ssh - name/address of the remote host
+
+Example
+=======
+
+Build the selftests::
+
+  # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw"
+
+"Install" the tests and copy them over to the target machine::
+
+  # make -C tools/testing/selftests/ TARGETS="drivers/net drivers/net/hw" \
+     install INSTALL_PATH=/tmp/ksft-net-drv
+
+  # rsync -ra --delete /tmp/ksft-net-drv root@192.168.1.1:/root/
+
+On the target machine, running the tests will use netdevsim by default::
+
+  [/root] # ./ksft-net-drv/run_kselftest.sh -t drivers/net:ping.py
+  TAP version 13
+  1..1
+  # timeout set to 45
+  # selftests: drivers/net: ping.py
+  # KTAP version 1
+  # 1..3
+  # ok 1 ping.test_v4
+  # ok 2 ping.test_v6
+  # ok 3 ping.test_tcp
+  # # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0
+  ok 1 selftests: drivers/net: ping.py
+
+Create a config with remote info::
+
+  [/root] # cat > ./ksft-net-drv/drivers/net/net.config <<EOF
+  NETIF=eth0
+  LOCAL_V4=192.168.1.1
+  REMOTE_V4=192.168.1.2
+  REMOTE_TYPE=ssh
+  REMOTE_ARGS=root@192.168.1.2
+  EOF
+
+Run the test::
+
+  [/root] # ./ksft-net-drv/drivers/net/ping.py
+  KTAP version 1
+  1..3
+  ok 1 ping.test_v4
+  ok 2 ping.test_v6 # SKIP Test requires IPv6 connectivity
+  ok 3 ping.test_tcp
+  # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:1 error:0
diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config
new file mode 100644
index 000000000000..f6a58ce8a230
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/config
@@ -0,0 +1,2 @@
+CONFIG_IPV6=y
+CONFIG_NETDEVSIM=m
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
new file mode 100644
index 000000000000..4933d045ab66
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0+ OR MIT
+
+TEST_PROGS = \
+	csum.py \
+	devlink_port_split.py \
+	ethtool.sh \
+	ethtool_extended_state.sh \
+	ethtool_mm.sh \
+	ethtool_rmon.sh \
+	hw_stats_l3.sh \
+	hw_stats_l3_gre.sh \
+	loopback.sh \
+	pp_alloc_fail.py \
+	#
+
+TEST_FILES := \
+	ethtool_lib.sh \
+	#
+
+TEST_INCLUDES := \
+	$(wildcard lib/py/*.py ../lib/py/*.py) \
+	../../../net/lib.sh \
+	../../../net/forwarding/lib.sh \
+	../../../net/forwarding/ipip_lib.sh \
+	../../../net/forwarding/tc_common.sh \
+	#
+
+include ../../../lib.mk
diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py
new file mode 100755
index 000000000000..cb40497faee4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/csum.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""Run the tools/testing/selftests/net/csum testsuite."""
+
+from os import path
+
+from lib.py import ksft_run, ksft_exit, KsftSkipEx
+from lib.py import EthtoolFamily, NetDrvEpEnv
+from lib.py import bkg, cmd, wait_port_listen
+
+def test_receive(cfg, ipv4=False, extra_args=None):
+    """Test local nic checksum receive. Remote host sends crafted packets."""
+    if not cfg.have_rx_csum:
+        raise KsftSkipEx(f"Test requires rx checksum offload on {cfg.ifname}")
+
+    if ipv4:
+        ip_args = f"-4 -S {cfg.remote_v4} -D {cfg.v4}"
+    else:
+        ip_args = f"-6 -S {cfg.remote_v6} -D {cfg.v6}"
+
+    rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}"
+    tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}"
+
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(34000, proto="udp")
+        cmd(tx_cmd, host=cfg.remote)
+
+
+def test_transmit(cfg, ipv4=False, extra_args=None):
+    """Test local nic checksum transmit. Remote host verifies packets."""
+    if (not cfg.have_tx_csum_generic and
+        not (cfg.have_tx_csum_ipv4 and ipv4) and
+        not (cfg.have_tx_csum_ipv6 and not ipv4)):
+        raise KsftSkipEx(f"Test requires tx checksum offload on {cfg.ifname}")
+
+    if ipv4:
+        ip_args = f"-4 -S {cfg.v4} -D {cfg.remote_v4}"
+    else:
+        ip_args = f"-6 -S {cfg.v6} -D {cfg.remote_v6}"
+
+    # Cannot randomize input when calculating zero checksum
+    if extra_args != "-U -Z":
+        extra_args += " -r 1"
+
+    rx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -L 1 -n 100 {ip_args} -R {extra_args}"
+    tx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -L 1 -n 100 {ip_args} -T {extra_args}"
+
+    with bkg(rx_cmd, host=cfg.remote, exit_wait=True):
+        wait_port_listen(34000, proto="udp", host=cfg.remote)
+        cmd(tx_cmd)
+
+
+def test_builder(name, cfg, ipv4=False, tx=False, extra_args=""):
+    """Construct specific tests from the common template.
+
+       Most tests follow the same basic pattern, differing only in
+       Direction of the test and optional flags passed to csum."""
+    def f(cfg):
+        if ipv4:
+            cfg.require_v4()
+        else:
+            cfg.require_v6()
+
+        if tx:
+            test_transmit(cfg, ipv4, extra_args)
+        else:
+            test_receive(cfg, ipv4, extra_args)
+
+    if ipv4:
+        f.__name__ = "ipv4_" + name
+    else:
+        f.__name__ = "ipv6_" + name
+    return f
+
+
+def check_nic_features(cfg) -> None:
+    """Test whether Tx and Rx checksum offload are enabled.
+
+       If the device under test has either off, then skip the relevant tests."""
+    cfg.have_tx_csum_generic = False
+    cfg.have_tx_csum_ipv4 = False
+    cfg.have_tx_csum_ipv6 = False
+    cfg.have_rx_csum = False
+
+    ethnl = EthtoolFamily()
+    features = ethnl.features_get({"header": {"dev-index": cfg.ifindex}})
+    for f in features["active"]["bits"]["bit"]:
+        if f["name"] == "tx-checksum-ip-generic":
+            cfg.have_tx_csum_generic = True
+        elif f["name"] == "tx-checksum-ipv4":
+            cfg.have_tx_csum_ipv4 = True
+        elif f["name"] == "tx-checksum-ipv6":
+            cfg.have_tx_csum_ipv6 = True
+        elif f["name"] == "rx-checksum":
+            cfg.have_rx_csum = True
+
+
+def main() -> None:
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+        check_nic_features(cfg)
+
+        cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../net/lib/csum")
+        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+
+        cases = []
+        for ipv4 in [True, False]:
+            cases.append(test_builder("rx_tcp", cfg, ipv4, False, "-t"))
+            cases.append(test_builder("rx_tcp_invalid", cfg, ipv4, False, "-t -E"))
+
+            cases.append(test_builder("rx_udp", cfg, ipv4, False, ""))
+            cases.append(test_builder("rx_udp_invalid", cfg, ipv4, False, "-E"))
+
+            cases.append(test_builder("tx_udp_csum_offload", cfg, ipv4, True, "-U"))
+            cases.append(test_builder("tx_udp_zero_checksum", cfg, ipv4, True, "-U -Z"))
+
+        ksft_run(cases=cases, args=(cfg, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py
index 2d84c7a0be6b..2d84c7a0be6b 100755
--- a/tools/testing/selftests/net/devlink_port_split.py
+++ b/tools/testing/selftests/drivers/net/hw/devlink_port_split.py
diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/drivers/net/hw/ethtool.sh
index aa2eafb7b243..fa6953de6b6d 100755
--- a/tools/testing/selftests/net/forwarding/ethtool.sh
+++ b/tools/testing/selftests/drivers/net/hw/ethtool.sh
@@ -10,7 +10,8 @@ ALL_TESTS="
 	different_speeds_autoneg_on
 "
 NUM_NETIFS=2
-source lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
 source ethtool_lib.sh
 
 h1_create()
@@ -64,9 +65,8 @@ same_speeds_autoneg_off()
 		setup_wait_dev_with_timeout $h1
 		setup_wait_dev_with_timeout $h2
 		ping_do $h1 192.0.2.2
-		check_err $? "speed $speed autoneg off"
-		log_test "force of same speed autoneg off"
-		log_info "speed = $speed"
+		check_err $? "ping with speed $speed autoneg off"
+		log_test "force speed $speed on both ends"
 	done
 
 	ethtool -s $h2 autoneg on
@@ -111,9 +111,8 @@ combination_of_neg_on_and_off()
 		setup_wait_dev_with_timeout $h1
 		setup_wait_dev_with_timeout $h2
 		ping_do $h1 192.0.2.2
-		check_err $? "h1-speed=$speed autoneg off, h2 autoneg on"
-		log_test "one side with autoneg off and another with autoneg on"
-		log_info "force speed = $speed"
+		check_err $? "ping with h1-speed=$speed autoneg off, h2 autoneg on"
+		log_test "force speed $speed vs. autoneg"
 	done
 
 	ethtool -s $h1 autoneg on
@@ -206,10 +205,9 @@ advertise_subset_of_speeds()
 		setup_wait_dev_with_timeout $h1
 		setup_wait_dev_with_timeout $h2
 		ping_do $h1 192.0.2.2
-		check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)"
+		check_err $? "ping with h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)"
 
-		log_test "advertise subset of speeds"
-		log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise"
+		log_test "advertise $speed_1_to_advertise vs. $speed_2_to_advertise"
 	done
 
 	ethtool -s $h2 autoneg on
@@ -286,8 +284,6 @@ different_speeds_autoneg_on()
 	ethtool -s $h1 autoneg on
 }
 
-skip_on_veth
-
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh
index 17f89c3b7c02..a7584448416e 100755
--- a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
+++ b/tools/testing/selftests/drivers/net/hw/ethtool_extended_state.sh
@@ -8,7 +8,8 @@ ALL_TESTS="
 "
 
 NUM_NETIFS=2
-source lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
 source ethtool_lib.sh
 
 TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms
@@ -108,8 +109,6 @@ no_cable()
 	ip link set dev $swp3 down
 }
 
-skip_on_veth
-
 setup_prepare
 
 tests_run
diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh
index b9bfb45085af..b9bfb45085af 100644
--- a/tools/testing/selftests/net/forwarding/ethtool_lib.sh
+++ b/tools/testing/selftests/drivers/net/hw/ethtool_lib.sh
diff --git a/tools/testing/selftests/net/forwarding/ethtool_mm.sh b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh
index 50d5bfb17ef1..c301e735c8ab 100755
--- a/tools/testing/selftests/net/forwarding/ethtool_mm.sh
+++ b/tools/testing/selftests/drivers/net/hw/ethtool_mm.sh
@@ -14,7 +14,8 @@ ALL_TESTS="
 NUM_NETIFS=2
 REQUIRE_MZ=no
 PREEMPTIBLE_PRIO=0
-source lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
 
 traffic_test()
 {
diff --git a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh
index 41a34a61f763..8f60c1685ad4 100755
--- a/tools/testing/selftests/net/forwarding/ethtool_rmon.sh
+++ b/tools/testing/selftests/drivers/net/hw/ethtool_rmon.sh
@@ -7,7 +7,8 @@ ALL_TESTS="
 "
 
 NUM_NETIFS=2
-source lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
 
 ETH_FCS_LEN=4
 ETH_HLEN=$((6+6+2))
@@ -43,6 +44,7 @@ bucket_test()
 	# Mausezahn does not include FCS bytes in its length - but the
 	# histogram counters do
 	len=$((len - ETH_FCS_LEN))
+	len=$((len > 0 ? len : 0))
 
 	before=$(ethtool --json -S $iface --groups rmon | \
 		jq -r ".[0].rmon[\"${set}-pktsNtoM\"][$bucket].val")
@@ -78,7 +80,7 @@ rmon_histogram()
 
 		for if in $iface $neigh; do
 			if ! ensure_mtu $if ${bucket[0]}; then
-				log_test_skip "$if does not support the required MTU for $step"
+				log_test_xfail "$if does not support the required MTU for $step"
 				return
 			fi
 		done
@@ -93,7 +95,7 @@ rmon_histogram()
 		jq -r ".[0].rmon[\"${set}-pktsNtoM\"][]|[.low, .high]|@tsv" 2>/dev/null)
 
 	if [ $nbuckets -eq 0 ]; then
-		log_test_skip "$iface does not support $set histogram counters"
+		log_test_xfail "$iface does not support $set histogram counters"
 		return
 	fi
 }
diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh
index 48584a51388f..67fafefc80be 100755
--- a/tools/testing/selftests/net/forwarding/hw_stats_l3.sh
+++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3.sh
@@ -48,7 +48,9 @@ ALL_TESTS="
 	test_double_enable
 "
 NUM_NETIFS=4
-source lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
+source "$lib_dir"/../../../net/forwarding/tc_common.sh
 
 h1_create()
 {
@@ -324,17 +326,9 @@ setup_wait
 
 used=$(ip -j stats show dev $rp1.200 group offload subgroup hw_stats_info |
 	   jq '.[].info.l3_stats.used')
-kind=$(ip -j -d link show dev $rp1 |
-	   jq -r '.[].linkinfo.info_kind')
-if [[ $used != true ]]; then
-	if [[ $kind == veth ]]; then
-		log_test_skip "l3_stats not offloaded on veth interface"
-		EXIT_STATUS=$ksft_skip
-	else
-		RET=1 log_test "l3_stats not offloaded"
-	fi
-else
-	tests_run
-fi
+[[ $used = true ]]
+check_err $? "hw_stats_info.used=$used"
+log_test "l3_stats offloaded"
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh
index 7594bbb49029..a94d92e1abce 100755
--- a/tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh
+++ b/tools/testing/selftests/drivers/net/hw/hw_stats_l3_gre.sh
@@ -12,8 +12,10 @@ ALL_TESTS="
 	test_stats_tx
 "
 NUM_NETIFS=6
-source lib.sh
-source ipip_lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
+source "$lib_dir"/../../../net/forwarding/ipip_lib.sh
+source "$lib_dir"/../../../net/forwarding/tc_common.sh
 
 setup_prepare()
 {
@@ -99,8 +101,6 @@ test_stats_rx()
 	test_stats g2a rx
 }
 
-skip_on_veth
-
 trap cleanup EXIT
 
 setup_prepare
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
new file mode 100644
index 000000000000..b582885786f5
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+
+KSFT_DIR = (Path(__file__).parent / "../../../../..").resolve()
+
+try:
+    sys.path.append(KSFT_DIR.as_posix())
+    from net.lib.py import *
+    from drivers.net.lib.py import *
+except ModuleNotFoundError as e:
+    ksft_pr("Failed importing `net` library from kernel sources")
+    ksft_pr(str(e))
+    ktap_result(True, comment="SKIP")
+    sys.exit(4)
diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/drivers/net/hw/loopback.sh
index 8f4057310b5b..5acc3ff820aa 100755
--- a/tools/testing/selftests/net/forwarding/loopback.sh
+++ b/tools/testing/selftests/drivers/net/hw/loopback.sh
@@ -6,8 +6,9 @@ ksft_skip=4
 
 ALL_TESTS="loopback_test"
 NUM_NETIFS=2
-source tc_common.sh
-source lib.sh
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/tc_common.sh
+source "$lib_dir"/../../../net/forwarding/lib.sh
 
 h1_create()
 {
diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
new file mode 100755
index 000000000000..026d98976c35
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+import os
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import KsftSkipEx, KsftFailEx
+from lib.py import NetdevFamily, NlError
+from lib.py import NetDrvEpEnv
+from lib.py import cmd, tool, GenerateTraffic
+
+
+def _write_fail_config(config):
+    for key, value in config.items():
+        with open("/sys/kernel/debug/fail_function/" + key, "w") as fp:
+            fp.write(str(value) + "\n")
+
+
+def _enable_pp_allocation_fail():
+    if not os.path.exists("/sys/kernel/debug/fail_function"):
+        raise KsftSkipEx("Kernel built without function error injection (or DebugFS)")
+
+    if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
+        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
+            fp.write("page_pool_alloc_pages\n")
+
+    _write_fail_config({
+        "verbose": 0,
+        "interval": 511,
+        "probability": 100,
+        "times": -1,
+    })
+
+
+def _disable_pp_allocation_fail():
+    if not os.path.exists("/sys/kernel/debug/fail_function"):
+        return
+
+    if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
+        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
+            fp.write("\n")
+
+    _write_fail_config({
+        "probability": 0,
+        "times": 0,
+    })
+
+
+def test_pp_alloc(cfg, netdevnl):
+    def get_stats():
+        return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
+
+    def check_traffic_flowing():
+        stat1 = get_stats()
+        time.sleep(1)
+        stat2 = get_stats()
+        if stat2['rx-packets'] - stat1['rx-packets'] < 15000:
+            raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets'])
+
+
+    try:
+        stats = get_stats()
+    except NlError as e:
+        if e.nl_msg.error == -95:
+            stats = {}
+        else:
+            raise
+    if 'rx-alloc-fail' not in stats:
+        raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats")
+
+    set_g = False
+    traffic = None
+    try:
+        traffic = GenerateTraffic(cfg)
+
+        check_traffic_flowing()
+
+        _enable_pp_allocation_fail()
+
+        s1 = get_stats()
+        time.sleep(3)
+        s2 = get_stats()
+
+        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1:
+            raise KsftSkipEx("Allocation failures not increasing")
+        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100:
+            raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'],
+                             "packets:", s2['rx-packets'] - s1['rx-packets'])
+
+        # Basic failures are fine, try to wobble some settings to catch extra failures
+        check_traffic_flowing()
+        g = tool("ethtool", "-g " + cfg.ifname, json=True)[0]
+        if 'rx' in g and g["rx"] * 2 <= g["rx-max"]:
+            new_g = g['rx'] * 2
+        elif 'rx' in g:
+            new_g = g['rx'] // 2
+        else:
+            new_g = None
+
+        if new_g:
+            set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0
+            if set_g:
+                ksft_pr("ethtool -G change retval: success")
+            else:
+                ksft_pr("ethtool -G change retval: did not succeed", new_g)
+        else:
+                ksft_pr("ethtool -G change retval: did not try")
+
+        time.sleep(0.1)
+        check_traffic_flowing()
+    finally:
+        _disable_pp_allocation_fail()
+        if traffic:
+            traffic.stop()
+        time.sleep(0.1)
+        if set_g:
+            cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}")
+
+
+def main() -> None:
+    netdevnl = NetdevFamily()
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+
+        ksft_run([test_pp_alloc], args=(cfg, netdevnl, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/drivers/net/hw/settings b/tools/testing/selftests/drivers/net/hw/settings
new file mode 100644
index 000000000000..e7b9417537fb
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py
new file mode 100644
index 000000000000..401e70f7f136
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+
+KSFT_DIR = (Path(__file__).parent / "../../../..").resolve()
+
+try:
+    sys.path.append(KSFT_DIR.as_posix())
+    from net.lib.py import *
+except ModuleNotFoundError as e:
+    ksft_pr("Failed importing `net` library from kernel sources")
+    ksft_pr(str(e))
+    ktap_result(True, comment="SKIP")
+    sys.exit(4)
+
+from .env import *
+from .load import *
+from .remote import Remote
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
new file mode 100644
index 000000000000..edcedd7bffab
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -0,0 +1,224 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import os
+from pathlib import Path
+from lib.py import KsftSkipEx, KsftXfailEx
+from lib.py import cmd, ip
+from lib.py import NetNS, NetdevSimDev
+from .remote import Remote
+
+
+def _load_env_file(src_path):
+    env = os.environ.copy()
+
+    src_dir = Path(src_path).parent.resolve()
+    if not (src_dir / "net.config").exists():
+        return env
+
+    with open((src_dir / "net.config").as_posix(), 'r') as fp:
+        for line in fp.readlines():
+            full_file = line
+            # Strip comments
+            pos = line.find("#")
+            if pos >= 0:
+                line = line[:pos]
+            line = line.strip()
+            if not line:
+                continue
+            pair = line.split('=', maxsplit=1)
+            if len(pair) != 2:
+                raise Exception("Can't parse configuration line:", full_file)
+            env[pair[0]] = pair[1]
+    return env
+
+
+class NetDrvEnv:
+    """
+    Class for a single NIC / host env, with no remote end
+    """
+    def __init__(self, src_path, **kwargs):
+        self._ns = None
+
+        self.env = _load_env_file(src_path)
+
+        if 'NETIF' in self.env:
+            self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0]
+        else:
+            self._ns = NetdevSimDev(**kwargs)
+            self.dev = self._ns.nsims[0].dev
+        self.ifindex = self.dev['ifindex']
+
+    def __enter__(self):
+        ip(f"link set dev {self.dev['ifname']} up")
+
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        """
+        __exit__ gets called at the end of a "with" block.
+        """
+        self.__del__()
+
+    def __del__(self):
+        if self._ns:
+            self._ns.remove()
+            self._ns = None
+
+
+class NetDrvEpEnv:
+    """
+    Class for an environment with a local device and "remote endpoint"
+    which can be used to send traffic in.
+
+    For local testing it creates two network namespaces and a pair
+    of netdevsim devices.
+    """
+
+    # Network prefixes used for local tests
+    nsim_v4_pfx = "192.0.2."
+    nsim_v6_pfx = "2001:db8::"
+
+    def __init__(self, src_path, nsim_test=None):
+
+        self.env = _load_env_file(src_path)
+
+        # Things we try to destroy
+        self.remote = None
+        # These are for local testing state
+        self._netns = None
+        self._ns = None
+        self._ns_peer = None
+
+        if "NETIF" in self.env:
+            if nsim_test is True:
+                raise KsftXfailEx("Test only works on netdevsim")
+            self._check_env()
+
+            self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0]
+
+            self.v4 = self.env.get("LOCAL_V4")
+            self.v6 = self.env.get("LOCAL_V6")
+            self.remote_v4 = self.env.get("REMOTE_V4")
+            self.remote_v6 = self.env.get("REMOTE_V6")
+            kind = self.env["REMOTE_TYPE"]
+            args = self.env["REMOTE_ARGS"]
+        else:
+            if nsim_test is False:
+                raise KsftXfailEx("Test does not work on netdevsim")
+
+            self.create_local()
+
+            self.dev = self._ns.nsims[0].dev
+
+            self.v4 = self.nsim_v4_pfx + "1"
+            self.v6 = self.nsim_v6_pfx + "1"
+            self.remote_v4 = self.nsim_v4_pfx + "2"
+            self.remote_v6 = self.nsim_v6_pfx + "2"
+            kind = "netns"
+            args = self._netns.name
+
+        self.remote = Remote(kind, args, src_path)
+
+        self.addr = self.v6 if self.v6 else self.v4
+        self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4
+
+        self.addr_ipver = "6" if self.v6 else "4"
+        # Bracketed addresses, some commands need IPv6 to be inside []
+        self.baddr = f"[{self.v6}]" if self.v6 else self.v4
+        self.remote_baddr = f"[{self.remote_v6}]" if self.remote_v6 else self.remote_v4
+
+        self.ifname = self.dev['ifname']
+        self.ifindex = self.dev['ifindex']
+
+        self._required_cmd = {}
+
+    def create_local(self):
+        self._netns = NetNS()
+        self._ns = NetdevSimDev()
+        self._ns_peer = NetdevSimDev(ns=self._netns)
+
+        with open("/proc/self/ns/net") as nsfd0, \
+             open("/var/run/netns/" + self._netns.name) as nsfd1:
+            ifi0 = self._ns.nsims[0].ifindex
+            ifi1 = self._ns_peer.nsims[0].ifindex
+            NetdevSimDev.ctrl_write('link_device',
+                                    f'{nsfd0.fileno()}:{ifi0} {nsfd1.fileno()}:{ifi1}')
+
+        ip(f"   addr add dev {self._ns.nsims[0].ifname} {self.nsim_v4_pfx}1/24")
+        ip(f"-6 addr add dev {self._ns.nsims[0].ifname} {self.nsim_v6_pfx}1/64 nodad")
+        ip(f"   link set dev {self._ns.nsims[0].ifname} up")
+
+        ip(f"   addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v4_pfx}2/24", ns=self._netns)
+        ip(f"-6 addr add dev {self._ns_peer.nsims[0].ifname} {self.nsim_v6_pfx}2/64 nodad", ns=self._netns)
+        ip(f"   link set dev {self._ns_peer.nsims[0].ifname} up", ns=self._netns)
+
+    def _check_env(self):
+        vars_needed = [
+            ["LOCAL_V4", "LOCAL_V6"],
+            ["REMOTE_V4", "REMOTE_V6"],
+            ["REMOTE_TYPE"],
+            ["REMOTE_ARGS"]
+        ]
+        missing = []
+
+        for choice in vars_needed:
+            for entry in choice:
+                if entry in self.env:
+                    break
+            else:
+                missing.append(choice)
+        # Make sure v4 / v6 configs are symmetric
+        if ("LOCAL_V6" in self.env) != ("REMOTE_V6" in self.env):
+            missing.append(["LOCAL_V6", "REMOTE_V6"])
+        if ("LOCAL_V4" in self.env) != ("REMOTE_V4" in self.env):
+            missing.append(["LOCAL_V4", "REMOTE_V4"])
+        if missing:
+            raise Exception("Invalid environment, missing configuration:", missing,
+                            "Please see tools/testing/selftests/drivers/net/README.rst")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        """
+        __exit__ gets called at the end of a "with" block.
+        """
+        self.__del__()
+
+    def __del__(self):
+        if self._ns:
+            self._ns.remove()
+            self._ns = None
+        if self._ns_peer:
+            self._ns_peer.remove()
+            self._ns_peer = None
+        if self._netns:
+            del self._netns
+            self._netns = None
+        if self.remote:
+            del self.remote
+            self.remote = None
+
+    def require_v4(self):
+        if not self.v4 or not self.remote_v4:
+            raise KsftSkipEx("Test requires IPv4 connectivity")
+
+    def require_v6(self):
+        if not self.v6 or not self.remote_v6:
+            raise KsftSkipEx("Test requires IPv6 connectivity")
+
+    def _require_cmd(self, comm, key, host=None):
+        cached = self._required_cmd.get(comm, {})
+        if cached.get(key) is None:
+            cached[key] = cmd("command -v -- " + comm, fail=False,
+                              shell=True, host=host).ret == 0
+        self._required_cmd[comm] = cached
+        return cached[key]
+
+    def require_cmd(self, comm, local=True, remote=False):
+        if local:
+            if not self._require_cmd(comm, "local"):
+                raise KsftSkipEx("Test requires command: " + comm)
+        if remote:
+            if not self._require_cmd(comm, "remote"):
+                raise KsftSkipEx("Test requires (remote) command: " + comm)
diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py
new file mode 100644
index 000000000000..abdb677bdb1c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/py/load.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+
+from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen
+
+class GenerateTraffic:
+    def __init__(self, env):
+        env.require_cmd("iperf3", remote=True)
+
+        self.env = env
+
+        port = rand_port()
+        self._iperf_server = cmd(f"iperf3 -s -p {port}", background=True)
+        wait_port_listen(port)
+        time.sleep(0.1)
+        self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400",
+                                 background=True, host=env.remote)
+
+        # Wait for traffic to ramp up
+        pkt = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"]
+        for _ in range(50):
+            time.sleep(0.1)
+            now = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"]
+            if now - pkt > 1000:
+                return
+            pkt = now
+        self.stop(verbose=True)
+        raise Exception("iperf3 traffic did not ramp up")
+
+    def stop(self, verbose=None):
+        self._iperf_client.process(terminate=True)
+        if verbose:
+            ksft_pr(">> Client:")
+            ksft_pr(self._iperf_client.stdout)
+            ksft_pr(self._iperf_client.stderr)
+        self._iperf_server.process(terminate=True)
+        if verbose:
+            ksft_pr(">> Server:")
+            ksft_pr(self._iperf_server.stdout)
+            ksft_pr(self._iperf_server.stderr)
diff --git a/tools/testing/selftests/drivers/net/lib/py/remote.py b/tools/testing/selftests/drivers/net/lib/py/remote.py
new file mode 100644
index 000000000000..b1780b987722
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/py/remote.py
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import os
+import importlib
+
+_modules = {}
+
+def Remote(kind, args, src_path):
+    global _modules
+
+    if kind not in _modules:
+        _modules[kind] = importlib.import_module("..remote_" + kind, __name__)
+
+    dir_path = os.path.abspath(src_path + "/../")
+    return getattr(_modules[kind], "Remote")(args, dir_path)
diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_netns.py b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py
new file mode 100644
index 000000000000..7d5eeb0271bc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/py/remote_netns.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import os
+import subprocess
+
+from lib.py import cmd
+
+
+class Remote:
+    def __init__(self, name, dir_path):
+        self.name = name
+        self.dir_path = dir_path
+
+    def cmd(self, comm):
+        return subprocess.Popen(["ip", "netns", "exec", self.name, "bash", "-c", comm],
+                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    def deploy(self, what):
+        if os.path.isabs(what):
+            return what
+        return os.path.abspath(self.dir_path + "/" + what)
diff --git a/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py
new file mode 100644
index 000000000000..924addde19a3
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/lib/py/remote_ssh.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import os
+import string
+import subprocess
+import random
+
+from lib.py import cmd
+
+
+class Remote:
+    def __init__(self, name, dir_path):
+        self.name = name
+        self.dir_path = dir_path
+        self._tmpdir = None
+
+    def __del__(self):
+        if self._tmpdir:
+            cmd("rm -rf " + self._tmpdir, host=self)
+            self._tmpdir = None
+
+    def cmd(self, comm):
+        return subprocess.Popen(["ssh", "-q", self.name, comm],
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    def _mktmp(self):
+        return ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
+
+    def deploy(self, what):
+        if not self._tmpdir:
+            self._tmpdir = "/tmp/" + self._mktmp()
+            cmd("mkdir " + self._tmpdir, host=self)
+        file_name = self._tmpdir + "/" + self._mktmp() + os.path.basename(what)
+
+        if not os.path.isabs(what):
+            what = os.path.abspath(self.dir_path + "/" + what)
+
+        cmd(f"scp {what} {self.name}:{file_name}")
+        return file_name
diff --git a/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh
new file mode 100755
index 000000000000..82be5d013330
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/microchip/ksz9477_qos.sh
@@ -0,0 +1,668 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+
+# The script is adopted to work with the Microchip KSZ switch driver.
+
+ETH_FCS_LEN=4
+
+WAIT_TIME=1
+NUM_NETIFS=4
+REQUIRE_JQ="yes"
+REQUIRE_MZ="yes"
+STABLE_MAC_ADDRS=yes
+NETIF_CREATE=no
+lib_dir=$(dirname $0)/../../../net/forwarding
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+require_command dcb
+
+h1=${NETIFS[p1]}
+swp1=${NETIFS[p2]}
+swp2=${NETIFS[p3]}
+h2=${NETIFS[p4]}
+
+H1_IPV4="192.0.2.1"
+H2_IPV4="192.0.2.2"
+H1_IPV6="2001:db8:1::1"
+H2_IPV6="2001:db8:1::2"
+
+# On h1_ and h2_create do not set IP addresses to avoid interaction with the
+# system, to keep packet counters clean.
+h1_create()
+{
+	simple_if_init $h1
+	sysctl_set net.ipv6.conf.${h1}.disable_ipv6 1
+	# Get the MAC address of the interface to use it with mausezahn
+	h1_mac=$(ip -j link show dev ${h1} | jq -e '.[].address')
+}
+
+h1_destroy()
+{
+	sysctl_restore net.ipv6.conf.${h1}.disable_ipv6
+	simple_if_fini $h1
+}
+
+h2_create()
+{
+	simple_if_init $h2
+	sysctl_set net.ipv6.conf.${h2}.disable_ipv6 1
+	h2_mac=$(ip -j link show dev ${h2} | jq -e '.[].address')
+}
+
+h2_destroy()
+{
+	sysctl_restore net.ipv6.conf.${h2}.disable_ipv6
+	simple_if_fini $h2
+}
+
+switch_create()
+{
+	ip link set ${swp1} up
+	ip link set ${swp2} up
+	sysctl_set net.ipv6.conf.${swp1}.disable_ipv6 1
+	sysctl_set net.ipv6.conf.${swp2}.disable_ipv6 1
+
+	# Ports should trust VLAN PCP even with vlan_filtering=0
+	ip link add br0 type bridge
+	ip link set ${swp1} master br0
+	ip link set ${swp2} master br0
+	ip link set br0 up
+	sysctl_set net.ipv6.conf.br0.disable_ipv6 1
+}
+
+switch_destroy()
+{
+	sysctl_restore net.ipv6.conf.${swp2}.disable_ipv6
+	sysctl_restore net.ipv6.conf.${swp1}.disable_ipv6
+
+	ip link del br0
+}
+
+setup_prepare()
+{
+	vrf_prepare
+
+	h1_create
+	h2_create
+	switch_create
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	h2_destroy
+	h1_destroy
+	switch_destroy
+
+	vrf_cleanup
+}
+
+set_apptrust_order()
+{
+	local if_name=$1
+	local order=$2
+
+	dcb apptrust set dev ${if_name} order ${order}
+}
+
+# Function to extract a specified field from a given JSON stats string
+extract_network_stat() {
+	local stats_json=$1
+	local field_name=$2
+
+	echo $(echo "$stats_json" | jq -r "$field_name")
+}
+
+run_test()
+{
+	local test_name=$1;
+	local apptrust_order=$2;
+	local port_prio=$3;
+	local dscp_ipv=$4;
+	local dscp=$5;
+	local have_vlan=$6;
+	local pcp_ipv=$7;
+	local vlan_pcp=$8;
+	local ip_v6=$9
+
+	local rx_ipv
+	local tx_ipv
+
+	RET=0
+
+	# Send some packet to populate the switch MAC table
+	$MZ ${h2} -a ${h2_mac} -b ${h1_mac} -p 64 -t icmp echores -c 1
+
+	# Based on the apptrust order, set the expected Internal Priority values
+	# for the RX and TX paths.
+	if [ "${apptrust_order}" == "" ]; then
+		echo "Apptrust order not set."
+		rx_ipv=${port_prio}
+		tx_ipv=${port_prio}
+	elif [ "${apptrust_order}" == "dscp" ]; then
+		echo "Apptrust order is DSCP."
+		rx_ipv=${dscp_ipv}
+		tx_ipv=${dscp_ipv}
+	elif [ "${apptrust_order}" == "pcp" ]; then
+		echo "Apptrust order is PCP."
+		rx_ipv=${pcp_ipv}
+		tx_ipv=${pcp_ipv}
+	elif [ "${apptrust_order}" == "pcp dscp" ]; then
+		echo "Apptrust order is PCP DSCP."
+		if [ ${have_vlan} -eq 1 ]; then
+			rx_ipv=$((dscp_ipv > pcp_ipv ? dscp_ipv : pcp_ipv))
+			tx_ipv=${pcp_ipv}
+		else
+			rx_ipv=${dscp_ipv}
+			tx_ipv=${dscp_ipv}
+		fi
+	else
+		RET=1
+		echo "Error: Unknown apptrust order ${apptrust_order}"
+		log_test "${test_name}"
+		return
+	fi
+
+	# Most/all? of the KSZ switches do not provide per-TC counters. There
+	# are only tx_hi and rx_hi counters, which are used to count packets
+	# which are considered as high priority and most likely not assigned
+	# to the queue 0.
+	# On the ingress path, packets seem to get high priority status
+	# independently of the DSCP or PCP global mapping. On the egress path,
+	# the high priority status is assigned based on the DSCP or PCP global
+	# map configuration.
+	# The thresholds for the high priority status are not documented, but
+	# it seems that the switch considers packets as high priority on the
+	# ingress path if detected Internal Priority is greater than 0. On the
+	# egress path, the switch considers packets as high priority if
+	# detected Internal Priority is greater than 1.
+	if [ ${rx_ipv} -ge 1 ]; then
+		local expect_rx_high_prio=1
+	else
+		local expect_rx_high_prio=0
+	fi
+
+	if [ ${tx_ipv} -ge 2 ]; then
+		local expect_tx_high_prio=1
+	else
+		local expect_tx_high_prio=0
+	fi
+
+	# Use ip tool to get the current switch packet counters. ethool stats
+	# need to be recalculated to get the correct values.
+	local swp1_stats=$(ip -s -j link show dev ${swp1})
+	local swp2_stats=$(ip -s -j link show dev ${swp2})
+	local swp1_rx_packets_before=$(extract_network_stat "$swp1_stats" \
+				       '.[0].stats64.rx.packets')
+	local swp1_rx_bytes_before=$(extract_network_stat "$swp1_stats" \
+				     '.[0].stats64.rx.bytes')
+	local swp2_tx_packets_before=$(extract_network_stat "$swp2_stats" \
+				       '.[0].stats64.tx.packets')
+	local swp2_tx_bytes_before=$(extract_network_stat "$swp2_stats" \
+				     '.[0].stats64.tx.bytes')
+	local swp1_rx_hi_before=$(ethtool_stats_get ${swp1} "rx_hi")
+	local swp2_tx_hi_before=$(ethtool_stats_get ${swp2} "tx_hi")
+
+	# Assamble the mausezahn command based on the test parameters
+	# For the testis with ipv4 or ipv6, use icmp response packets,
+	# to avoid interaction with the system, to keep packet counters
+	# clean.
+	if [ ${ip_v6} -eq 0 ]; then
+		local ip="-a ${h1_mac} -b ${h2_mac} -A ${H1_IPV4} \
+			  -B ${H2_IPV4} -t icmp unreach,code=1,dscp=${dscp}"
+	else
+		local ip="-6 -a ${h1_mac} -b ${h2_mac} -A ${H1_IPV6} \
+			  -B ${H2_IPV6} -t icmp6 type=1,code=0,dscp=${dscp}"
+	fi
+
+	if [ ${have_vlan} -eq 1 ]; then
+		local vlan_pcp_opt="-Q ${vlan_pcp}:0"
+	else
+		local vlan_pcp_opt=""
+	fi
+	$MZ ${h1} ${ip} -c ${PING_COUNT} -d 10msec ${vlan_pcp_opt}
+
+	# Wait until the switch packet counters are updated
+	sleep 6
+
+	local swp1_stats=$(ip -s -j link show dev ${swp1})
+	local swp2_stats=$(ip -s -j link show dev ${swp2})
+
+	local swp1_rx_packets_after=$(extract_network_stat "$swp1_stats" \
+				      '.[0].stats64.rx.packets')
+	local swp1_rx_bytes_after=$(extract_network_stat "$swp1_stats" \
+				    '.[0].stats64.rx.bytes')
+	local swp2_tx_packets_after=$(extract_network_stat "$swp2_stats" \
+				      '.[0].stats64.tx.packets')
+	local swp2_tx_bytes_after=$(extract_network_stat "$swp2_stats" \
+				    '.[0].stats64.tx.bytes')
+
+	local swp1_rx_packets_diff=$((${swp1_rx_packets_after} - \
+				      ${swp1_rx_packets_before}))
+	local swp2_tx_packets_diff=$((${swp2_tx_packets_after} - \
+				      ${swp2_tx_packets_before}))
+
+	local swp1_rx_hi_after=$(ethtool_stats_get ${swp1} "rx_hi")
+	local swp2_tx_hi_after=$(ethtool_stats_get ${swp2} "tx_hi")
+
+	# Test if any packets were received on swp1, we will rx before and after
+	if [ ${swp1_rx_packets_diff} -lt ${PING_COUNT} ]; then
+		echo "Not expected amount of received packets on ${swp1}"
+		echo "before ${swp1_rx_packets_before} after ${swp1_rx_packets_after}"
+		RET=1
+	fi
+
+	# Test if any packets were transmitted on swp2, we will tx before and after
+	if [ ${swp2_tx_packets_diff} -lt ${PING_COUNT} ]; then
+		echo "Not expected amount of transmitted packets on ${swp2}"
+		echo "before ${swp2_tx_packets_before} after ${swp2_tx_packets_after}"
+		RET=1
+	fi
+
+	# tx/rx_hi counted in bytes. So, we need to compare the difference in bytes
+	local swp1_rx_bytes_diff=$(($swp1_rx_bytes_after - $swp1_rx_bytes_before))
+	local swp2_tx_bytes_diff=$(($swp2_tx_bytes_after - $swp2_tx_bytes_before))
+	local swp1_rx_hi_diff=$(($swp1_rx_hi_after - $swp1_rx_hi_before))
+	local swp2_tx_hi_diff=$(($swp2_tx_hi_after - $swp2_tx_hi_before))
+
+	if [ ${expect_rx_high_prio} -eq 1 ]; then
+		swp1_rx_hi_diff=$((${swp1_rx_hi_diff} - \
+				   ${swp1_rx_packets_diff} * ${ETH_FCS_LEN}))
+		if [ ${swp1_rx_hi_diff} -ne ${swp1_rx_bytes_diff} ]; then
+			echo "Not expected amount of high priority packets received on ${swp1}"
+			echo "RX hi diff: ${swp1_rx_hi_diff}, expected RX bytes diff: ${swp1_rx_bytes_diff}"
+			RET=1
+		fi
+	else
+		if [ ${swp1_rx_hi_diff} -ne 0 ]; then
+			echo "Unexpected amount of high priority packets received on ${swp1}"
+			echo "RX hi diff: ${swp1_rx_hi_diff}, expected 0"
+			RET=1
+		fi
+	fi
+
+	if [ ${expect_tx_high_prio} -eq 1 ]; then
+		swp2_tx_hi_diff=$((${swp2_tx_hi_diff} - \
+				   ${swp2_tx_packets_diff} * ${ETH_FCS_LEN}))
+		if [ ${swp2_tx_hi_diff} -ne ${swp2_tx_bytes_diff} ]; then
+			echo "Not expected amount of high priority packets transmitted on ${swp2}"
+			echo "TX hi diff: ${swp2_tx_hi_diff}, expected TX bytes diff: ${swp2_tx_bytes_diff}"
+			RET=1
+		fi
+	else
+		if [ ${swp2_tx_hi_diff} -ne 0 ]; then
+			echo "Unexpected amount of high priority packets transmitted on ${swp2}"
+			echo "TX hi diff: ${swp2_tx_hi_diff}, expected 0"
+			RET=1
+		fi
+	fi
+
+	log_test "${test_name}"
+}
+
+run_test_dscp()
+{
+	# IPv4 test
+	run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 0
+	# IPv6 test
+	run_test "$1" "$2" "$3" "$4" "$5" 0 0 0 1
+}
+
+run_test_dscp_pcp()
+{
+	# IPv4 test
+	run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 0
+	# IPv6 test
+	run_test "$1" "$2" "$3" "$4" "$5" 1 "$6" "$7" 1
+}
+
+port_default_prio_get()
+{
+	local if_name=$1
+	local prio
+
+	prio="$(dcb -j app show dev ${if_name} default-prio | \
+		jq '.default_prio[]')"
+	if [ -z "${prio}" ]; then
+		prio=0
+	fi
+
+	echo ${prio}
+}
+
+test_port_default()
+{
+	local orig_apptrust=$(port_get_default_apptrust ${swp1})
+	local orig_prio=$(port_default_prio_get ${swp1})
+	local apptrust_order=""
+
+	RET=0
+
+	# Make sure no other priority sources will interfere with the test
+	set_apptrust_order ${swp1} "${apptrust_order}"
+
+	for val in $(seq 0 7); do
+		dcb app replace dev ${swp1} default-prio ${val}
+		if [ $val -ne $(port_default_prio_get ${swp1}) ]; then
+			RET=1
+			break
+		fi
+
+		run_test_dscp "Port-default QoS classification, prio: ${val}" \
+			"${apptrust_order}" ${val} 0 0
+	done
+
+	set_apptrust_order ${swp1} "${orig_apptrust}"
+	if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then
+		RET=1
+	fi
+
+	dcb app replace dev ${swp1} default-prio ${orig_prio}
+	if [ $orig_prio -ne $(port_default_prio_get ${swp1}) ]; then
+		RET=1
+	fi
+
+	log_test "Port-default QoS classification"
+}
+
+port_get_default_apptrust()
+{
+	local if_name=$1
+
+	dcb -j apptrust show dev ${if_name} | jq -r '.order[]' | \
+		tr '\n' ' ' | xargs
+}
+
+test_port_apptrust()
+{
+	local original_dscp_prios_swp1=$(get_dscp_prios ${swp1})
+	local orig_apptrust=$(port_get_default_apptrust ${swp1})
+	local orig_port_prio=$(port_default_prio_get ${swp1})
+	local order_variants=("pcp dscp" "dscp" "pcp")
+	local apptrust_order
+	local port_prio
+	local dscp_prio
+	local pcp_prio
+	local dscp
+	local pcp
+
+	RET=0
+
+	# First, test if apptrust configuration as taken by the kernel
+	for order in "${order_variants[@]}"; do
+		set_apptrust_order ${swp1} "${order}"
+		if [[ "$order" != "$(port_get_default_apptrust ${swp1})" ]]; then
+			RET=1
+			break
+		fi
+	done
+
+	log_test "Apptrust, supported variants"
+
+	# To test if the apptrust configuration is working as expected, we need
+	# to set DSCP priorities for the switch port.
+	init_dscp_prios "${swp1}" "${original_dscp_prios_swp1}"
+
+	# Start with a simple test where all apptrust sources are disabled
+	# default port priority is 0, DSCP priority is mapped to 7.
+	# No high priority packets should be received or transmitted.
+	port_prio=0
+	dscp_prio=7
+	dscp=4
+
+	dcb app replace dev ${swp1} default-prio ${port_prio}
+	dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio}
+
+	apptrust_order=""
+	set_apptrust_order ${swp1} "${apptrust_order}"
+	# Test with apptrust sources disabled, Packets should get port default
+	# priority which is 0
+	run_test_dscp "Apptrust, all disabled. DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp}
+
+	apptrust_order="pcp"
+	set_apptrust_order ${swp1} "${apptrust_order}"
+	# If PCP is enabled, packets should get PCP priority, which is not
+	# set in this test (no VLAN tags are present in the packet). No high
+	# priority packets should be received or transmitted.
+	run_test_dscp "Apptrust, PCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp}
+
+	apptrust_order="dscp"
+	set_apptrust_order ${swp1} "${apptrust_order}"
+	# If DSCP is enabled, packets should get DSCP priority which is set to 7
+	# in this test. High priority packets should be received and transmitted.
+	run_test_dscp "Apptrust, DSCP enabled. DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp}
+
+	apptrust_order="pcp dscp"
+	set_apptrust_order ${swp1} "${apptrust_order}"
+	# If PCP and DSCP are enabled, PCP would have higher apptrust priority
+	# so packets should get PCP priority. But in this test VLAN PCP is not
+	# set, so it should get DSCP priority which is set to 7. High priority
+	# packets should be received and transmitted.
+	run_test_dscp "Apptrust, PCP and DSCP are enabled. DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp}
+
+	# If VLAN PCP is set, it should have higher apptrust priority than DSCP
+	# so packets should get VLAN PCP priority. Send packets with VLAN PCP
+	# set to 0, DSCP set to 7. Packets should get VLAN PCP priority.
+	# No high priority packets should be transmitted. Due to nature of the
+	# switch, high priority packets will be received.
+	pcp_prio=0
+	pcp=0
+	run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp}
+
+	# If VLAN PCP is set to 7, it should have higher apptrust priority than
+	# DSCP so packets should get VLAN PCP priority. Send packets with VLAN
+	# PCP set to 7, DSCP set to 7. Packets should get VLAN PCP priority.
+	# High priority packets should be received and transmitted.
+	pcp_prio=7
+	pcp=7
+	run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp}
+	# Now make sure that the switch is able to handle the case where DSCP
+	# priority is set to 0 and PCP priority is set to 7. Packets should get
+	# PCP priority. High priority packets should be received and transmitted.
+	dscp_prio=0
+	dcb app replace dev ${swp1} dscp-prio ${dscp}:${dscp_prio}
+	run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp}
+	# If both VLAN PCP and DSCP are set to 0, packets should get 0 priority.
+	# No high priority packets should be received or transmitted.
+	pcp_prio=0
+	pcp=0
+	run_test_dscp_pcp "Apptrust, PCP and DSCP are enabled. PCP ${pcp_prio}, DSCP-prio ${dscp}:${dscp_prio}" \
+		"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp} ${pcp_prio} ${pcp}
+
+	# Restore original priorities
+	if ! restore_priorities "${swp1}" "${original_dscp_prios_swp1}"; then
+		RET=1
+	fi
+
+	set_apptrust_order ${swp1} "${orig_apptrust}"
+	if [ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]; then
+		RET=1
+	fi
+
+	dcb app replace dev ${swp1} default-prio ${orig_port_prio}
+	if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then
+		RET=1
+	fi
+
+	log_test "Apptrust, restore original settings"
+}
+
+# Function to get current DSCP priorities
+get_dscp_prios() {
+	local if_name=$1
+	dcb -j app show dev ${if_name} | jq -c '.dscp_prio'
+}
+
+# Function to set a specific DSCP priority on a device
+replace_dscp_prio() {
+	local if_name=$1
+	local dscp=$2
+	local prio=$3
+	dcb app replace dev ${if_name} dscp-prio ${dscp}:${prio}
+}
+
+# Function to compare DSCP maps
+compare_dscp_maps() {
+	local old_json=$1
+	local new_json=$2
+	local dscp=$3
+	local prio=$4
+
+	# Create a modified old_json with the expected change for comparison
+	local modified_old_json=$(echo "$old_json" |
+		jq --argjson dscp $dscp --argjson prio $prio \
+			'map(if .[0] == $dscp then [$dscp, $prio] else . end)' |
+		tr -d " \n")
+
+	# Compare new_json with the modified_old_json
+	if [[ "$modified_old_json" == "$new_json" ]]; then
+		return 0
+	else
+		return 1
+	fi
+}
+
+# Function to set DSCP priorities
+set_and_verify_dscp() {
+	local port=$1
+	local dscp=$2
+	local new_prio=$3
+
+	local old_prios=$(get_dscp_prios $port)
+
+	replace_dscp_prio "$port" $dscp $new_prio
+
+	# Fetch current settings and compare
+	local current_prios=$(get_dscp_prios $port)
+	if ! compare_dscp_maps "$old_prios" "$current_prios" $dscp $new_prio; then
+		echo "Error: Unintended changes detected in DSCP map for $port after setting DSCP $dscp to $new_prio."
+		return 1
+	fi
+	return 0
+}
+
+# Function to restore original priorities
+restore_priorities() {
+	local port=$1
+	local original_prios=$2
+
+	echo "Removing test artifacts for $port"
+	local current_prios=$(get_dscp_prios $port)
+	local prio_str=$(echo "$current_prios" |
+		jq -r 'map("\(.[0]):\(.[1])") | join(" ")')
+	dcb app del dev $port dscp-prio $prio_str
+
+	echo "Restoring original DSCP priorities for $port"
+	local restore_str=$(echo "$original_prios" |
+		jq -r 'map("\(.[0]):\(.[1])") | join(" ")')
+	dcb app add dev $port dscp-prio $restore_str
+
+	local current_prios=$(get_dscp_prios $port)
+	if [[ "$original_prios" != "$current_prios" ]]; then
+		echo "Error: Failed to restore original DSCP priorities for $port"
+		return 1
+	fi
+	return 0
+}
+
+# Initialize DSCP priorities. Set them to predictable values for testing.
+init_dscp_prios() {
+	local port=$1
+	local original_prios=$2
+
+	echo "Removing any existing DSCP priority mappins for $port"
+	local prio_str=$(echo "$original_prios" |
+		jq -r 'map("\(.[0]):\(.[1])") | join(" ")')
+	dcb app del dev $port dscp-prio $prio_str
+
+	# Initialize DSCP priorities list
+	local dscp_prios=""
+	for dscp in {0..63}; do
+		dscp_prios+=("$dscp:0")
+	done
+
+	echo "Setting initial DSCP priorities map to 0 for $port"
+	dcb app add dev $port dscp-prio ${dscp_prios[@]}
+}
+
+# Main function to test global DSCP map across specified ports
+test_global_dscp_map() {
+	local ports=("$swp1" "$swp2")
+	local original_dscp_prios_port0=$(get_dscp_prios ${ports[0]})
+	local orig_apptrust=$(port_get_default_apptrust ${swp1})
+	local orig_port_prio=$(port_default_prio_get ${swp1})
+	local apptrust_order="dscp"
+	local port_prio=0
+	local dscp_prio
+	local dscp
+
+	RET=0
+
+	set_apptrust_order ${swp1} "${apptrust_order}"
+	dcb app replace dev ${swp1} default-prio ${port_prio}
+
+	# Initialize DSCP priorities
+	init_dscp_prios "${ports[0]}" "$original_dscp_prios_port0"
+
+	# Loop over each DSCP index
+	for dscp in {0..63}; do
+		# and test each Internal Priority value
+		for dscp_prio in {0..7}; do
+			# do it for each port. This is to test if the global DSCP map
+			# is accessible from all ports.
+			for port in "${ports[@]}"; do
+				if ! set_and_verify_dscp "$port" $dscp $dscp_prio; then
+					RET=1
+				fi
+			done
+
+			# Test if the DSCP priority is correctly applied to the packets
+			run_test_dscp "DSCP (${dscp}) QoS classification, prio: ${dscp_prio}" \
+				"${apptrust_order}" ${port_prio} ${dscp_prio} ${dscp}
+			if [ ${RET} -eq 1 ]; then
+				break
+			fi
+		done
+	done
+
+	# Restore original priorities
+	if ! restore_priorities "${ports[0]}" "${original_dscp_prios_port0}"; then
+		RET=1
+	fi
+
+	set_apptrust_order ${swp1} "${orig_apptrust}"
+	if [[ "$orig_apptrust" != "$(port_get_default_apptrust ${swp1})" ]]; then
+		RET=1
+	fi
+
+	dcb app replace dev ${swp1} default-prio ${orig_port_prio}
+	if [ $orig_port_prio -ne $(port_default_prio_get ${swp1}) ]; then
+		RET=1
+	fi
+
+	log_test "DSCP global map"
+}
+
+trap cleanup EXIT
+
+ALL_TESTS="
+	test_port_default
+	test_port_apptrust
+	test_global_dscp_map
+"
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh
index 91891b9418d7..877cd6df94a1 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh
@@ -24,8 +24,8 @@ setup_prepare()
 	busywait "$TIMEOUT" wait_for_port_up ethtool $swp2
 	check_err $? "ports did not come up"
 
-	local lanes_exist=$(ethtool $swp1 | grep 'Lanes:')
-	if [[ -z $lanes_exist ]]; then
+	busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:"
+	if [[ $? -ne 0 ]]; then
 		log_test "SKIP: driver does not support lanes setting"
 		exit 1
 	fi
@@ -122,8 +122,9 @@ autoneg()
 			ethtool_set $swp1 speed $max_speed lanes $lanes
 			ip link set dev $swp1 up
 			ip link set dev $swp2 up
-			busywait "$TIMEOUT" wait_for_port_up ethtool $swp2
-			check_err $? "ports did not come up"
+
+			busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:"
+			check_err $? "Lanes parameter is not presented on time"
 
 			check_lanes $swp1 $lanes $max_speed
 			log_test "$lanes lanes is autonegotiated"
@@ -160,8 +161,9 @@ autoneg_force_mode()
 			ethtool_set $swp2 speed $max_speed lanes $lanes autoneg off
 			ip link set dev $swp1 up
 			ip link set dev $swp2 up
-			busywait "$TIMEOUT" wait_for_port_up ethtool $swp2
-			check_err $? "ports did not come up"
+
+			busywait $TIMEOUT sh -c "ethtool $swp1 | grep -q Lanes:"
+			check_err $? "Lanes parameter is not presented on time"
 
 			check_lanes $swp1 $lanes $max_speed
 			log_test "Autoneg off, $lanes lanes detected during force mode"
diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh
index 6369927e9c37..48395cfd4f95 100644
--- a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh
@@ -42,7 +42,7 @@ __mlxsw_only_on_spectrum()
 	local src=$1; shift
 
 	if ! mlxsw_on_spectrum "$rev"; then
-		log_test_skip $src:$caller "(Spectrum-$rev only)"
+		log_test_xfail $src:$caller "(Spectrum-$rev only)"
 		return 1
 	fi
 }
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh
index a88d8a8c85f2..899b6892603f 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh
@@ -47,7 +47,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do
 		RET=0
 		target=$(${current_test}_get_target "$should_fail")
 		if ((target == 0)); then
-			log_test_skip "'$current_test' should_fail=$should_fail test"
 			continue
 		fi
 
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
index f981c957f097..482ebb744eba 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
@@ -52,7 +52,6 @@ for current_test in ${TESTS:-$ALL_TESTS}; do
 			RET=0
 			target=$(${current_test}_get_target "$should_fail")
 			if ((target == 0)); then
-				log_test_skip "'$current_test' [$profile] should_fail=$should_fail test"
 				continue
 			fi
 			${current_test}_setup_prepare
diff --git a/tools/testing/selftests/drivers/net/netdevsim/settings b/tools/testing/selftests/drivers/net/netdevsim/settings
new file mode 100644
index 000000000000..a62d2fa1275c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/settings
@@ -0,0 +1 @@
+timeout=600
diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py
new file mode 100755
index 000000000000..eb83e7b48797
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/ping.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_run, ksft_exit
+from lib.py import ksft_eq
+from lib.py import NetDrvEpEnv
+from lib.py import bkg, cmd, wait_port_listen, rand_port
+
+
+def test_v4(cfg) -> None:
+    cfg.require_v4()
+
+    cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}")
+    cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote)
+
+
+def test_v6(cfg) -> None:
+    cfg.require_v6()
+
+    cmd(f"ping -c 1 -W0.5 {cfg.remote_v6}")
+    cmd(f"ping -c 1 -W0.5 {cfg.v6}", host=cfg.remote)
+
+
+def test_tcp(cfg) -> None:
+    cfg.require_cmd("socat", remote=True)
+
+    port = rand_port()
+    listen_cmd = f"socat -{cfg.addr_ipver} -t 2 -u TCP-LISTEN:{port},reuseport STDOUT"
+
+    with bkg(listen_cmd, exit_wait=True) as nc:
+        wait_port_listen(port)
+
+        cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.baddr}:{port}",
+            shell=True, host=cfg.remote)
+    ksft_eq(nc.stdout.strip(), "ping")
+
+    with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc:
+        wait_port_listen(port, host=cfg.remote)
+
+        cmd(f"echo ping | socat -t 2 -u STDIN TCP:{cfg.remote_baddr}:{port}", shell=True)
+    ksft_eq(nc.stdout.strip(), "ping")
+
+
+def main() -> None:
+    with NetDrvEpEnv(__file__) as cfg:
+        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py
new file mode 100755
index 000000000000..30f29096e27c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/queues.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx
+from lib.py import EthtoolFamily, NetdevFamily
+from lib.py import NetDrvEnv
+from lib.py import cmd
+import glob
+
+
+def sys_get_queues(ifname) -> int:
+    folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*')
+    return len(folders)
+
+
+def nl_get_queues(cfg, nl):
+    queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True)
+    if queues:
+        return len([q for q in queues if q['type'] == 'rx'])
+    return None
+
+
+def get_queues(cfg, nl) -> None:
+    queues = nl_get_queues(cfg, nl)
+    if not queues:
+        raise KsftSkipEx('queue-get not supported by device')
+
+    expected = sys_get_queues(cfg.dev['ifname'])
+    ksft_eq(queues, expected)
+
+
+def addremove_queues(cfg, nl) -> None:
+    queues = nl_get_queues(cfg, nl)
+    if not queues:
+        raise KsftSkipEx('queue-get not supported by device')
+
+    curr_queues = sys_get_queues(cfg.dev['ifname'])
+    if curr_queues == 1:
+        raise KsftSkipEx('cannot decrement queue: already at 1')
+
+    netnl = EthtoolFamily()
+    channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    if channels['combined-count'] == 0:
+        rx_type = 'rx'
+    else:
+        rx_type = 'combined'
+
+    expected = curr_queues - 1
+    cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10)
+    queues = nl_get_queues(cfg, nl)
+    ksft_eq(queues, expected)
+
+    expected = curr_queues
+    cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10)
+    queues = nl_get_queues(cfg, nl)
+    ksft_eq(queues, expected)
+
+
+def main() -> None:
+    with NetDrvEnv(__file__, queue_count=3) as cfg:
+        ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily()))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py
new file mode 100755
index 000000000000..7a7b16b180e2
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/stats.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import ksft_ge, ksft_eq, ksft_in, ksft_true, ksft_raises, KsftSkipEx, KsftXfailEx
+from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError
+from lib.py import NetDrvEnv
+
+ethnl = EthtoolFamily()
+netfam = NetdevFamily()
+rtnl = RtnlFamily()
+
+
+def check_pause(cfg) -> None:
+    global ethnl
+
+    try:
+        ethnl.pause_get({"header": {"dev-index": cfg.ifindex}})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftXfailEx("pause not supported by the device")
+        raise
+
+    data = ethnl.pause_get({"header": {"dev-index": cfg.ifindex,
+                                       "flags": {'stats'}}})
+    ksft_true(data['stats'], "driver does not report stats")
+
+
+def check_fec(cfg) -> None:
+    global ethnl
+
+    try:
+        ethnl.fec_get({"header": {"dev-index": cfg.ifindex}})
+    except NlError as e:
+        if e.error == 95:
+            raise KsftXfailEx("FEC not supported by the device")
+        raise
+
+    data = ethnl.fec_get({"header": {"dev-index": cfg.ifindex,
+                                     "flags": {'stats'}}})
+    ksft_true(data['stats'], "driver does not report stats")
+
+
+def pkt_byte_sum(cfg) -> None:
+    global netfam, rtnl
+
+    def get_qstat(test):
+        global netfam
+        stats = netfam.qstats_get({}, dump=True)
+        if stats:
+            for qs in stats:
+                if qs["ifindex"]== test.ifindex:
+                    return qs
+
+    qstat = get_qstat(cfg)
+    if qstat is None:
+        raise KsftSkipEx("qstats not supported by the device")
+
+    for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']:
+        ksft_in(key, qstat, "Drivers should always report basic keys")
+
+    # Compare stats, rtnl stats and qstats must match,
+    # but the interface may be up, so do a series of dumps
+    # each time the more "recent" stats must be higher or same.
+    def stat_cmp(rstat, qstat):
+        for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']:
+            if rstat[key] != qstat[key]:
+                return rstat[key] - qstat[key]
+        return 0
+
+    for _ in range(10):
+        rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats']
+        if stat_cmp(rtstat, qstat) < 0:
+            raise Exception("RTNL stats are lower, fetched later")
+        qstat = get_qstat(cfg)
+        if stat_cmp(rtstat, qstat) > 0:
+            raise Exception("Qstats are lower, fetched later")
+
+
+def qstat_by_ifindex(cfg) -> None:
+    global netfam
+    global rtnl
+
+    # Construct a map ifindex -> [dump, by-index, dump]
+    ifindexes = {}
+    stats = netfam.qstats_get({}, dump=True)
+    for entry in stats:
+        ifindexes[entry['ifindex']] = [entry, None, None]
+
+    for ifindex in ifindexes.keys():
+        entry = netfam.qstats_get({"ifindex": ifindex}, dump=True)
+        ksft_eq(len(entry), 1)
+        ifindexes[entry[0]['ifindex']][1] = entry[0]
+
+    stats = netfam.qstats_get({}, dump=True)
+    for entry in stats:
+        ifindexes[entry['ifindex']][2] = entry
+
+    if len(ifindexes) == 0:
+        raise KsftSkipEx("No ifindex supports qstats")
+
+    # Now make sure the stats match/make sense
+    for ifindex, triple in ifindexes.items():
+        all_keys = triple[0].keys() | triple[1].keys() | triple[2].keys()
+
+        for key in all_keys:
+            ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key)
+            ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key)
+
+    # Test invalid dumps
+    # 0 is invalid
+    with ksft_raises(NlError) as cm:
+        netfam.qstats_get({"ifindex": 0}, dump=True)
+    ksft_eq(cm.exception.nl_msg.error, -34)
+    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')
+
+    # loopback has no stats
+    with ksft_raises(NlError) as cm:
+        netfam.qstats_get({"ifindex": 1}, dump=True)
+    ksft_eq(cm.exception.nl_msg.error, -95)
+    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')
+
+    # Try to get stats for lowest unused ifindex but not 0
+    devs = rtnl.getlink({}, dump=True)
+    all_ifindexes = set([dev["ifi-index"] for dev in devs])
+    lowest = 2
+    while lowest in all_ifindexes:
+        lowest += 1
+
+    with ksft_raises(NlError) as cm:
+        netfam.qstats_get({"ifindex": lowest}, dump=True)
+    ksft_eq(cm.exception.nl_msg.error, -19)
+    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')
+
+
+def main() -> None:
+    with NetDrvEnv(__file__) as cfg:
+        ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex],
+                 args=(cfg, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/drivers/net/virtio_net/Makefile b/tools/testing/selftests/drivers/net/virtio_net/Makefile
new file mode 100644
index 000000000000..7ec7cd3ab2cc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/virtio_net/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0+ OR MIT
+
+TEST_PROGS = basic_features.sh \
+        #
+
+TEST_FILES = \
+        virtio_net_common.sh \
+        #
+
+TEST_INCLUDES = \
+        ../../../net/forwarding/lib.sh \
+        ../../../net/lib.sh \
+        #
+
+include ../../../lib.mk
diff --git a/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh
new file mode 100755
index 000000000000..cf8cf816ed48
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/virtio_net/basic_features.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# See virtio_net_common.sh comments for more details about assumed setup
+
+ALL_TESTS="
+	initial_ping_test
+	f_mac_test
+"
+
+source virtio_net_common.sh
+
+lib_dir=$(dirname "$0")
+source "$lib_dir"/../../../net/forwarding/lib.sh
+
+h1=${NETIFS[p1]}
+h2=${NETIFS[p2]}
+
+h1_create()
+{
+	simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+h2_destroy()
+{
+	simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+initial_ping_test()
+{
+	setup_cleanup
+	setup_prepare
+	ping_test $h1 $H2_IPV4 " simple"
+}
+
+f_mac_test()
+{
+	RET=0
+	local test_name="mac feature filtered"
+
+	virtio_feature_present $h1 $VIRTIO_NET_F_MAC
+	if [ $? -ne 0 ]; then
+		log_test_skip "$test_name" "Device $h1 is missing feature $VIRTIO_NET_F_MAC."
+		return 0
+	fi
+	virtio_feature_present $h1 $VIRTIO_NET_F_MAC
+	if [ $? -ne 0 ]; then
+		log_test_skip "$test_name" "Device $h2 is missing feature $VIRTIO_NET_F_MAC."
+		return 0
+	fi
+
+	setup_cleanup
+	setup_prepare
+
+	grep -q 0 /sys/class/net/$h1/addr_assign_type
+	check_err $? "Permanent address assign type for $h1 is not set"
+	grep -q 0 /sys/class/net/$h2/addr_assign_type
+	check_err $? "Permanent address assign type for $h2 is not set"
+
+	setup_cleanup
+	virtio_filter_feature_add $h1 $VIRTIO_NET_F_MAC
+	virtio_filter_feature_add $h2 $VIRTIO_NET_F_MAC
+	setup_prepare
+
+	grep -q 0 /sys/class/net/$h1/addr_assign_type
+	check_fail $? "Permanent address assign type for $h1 is set when F_MAC feature is filtered"
+	grep -q 0 /sys/class/net/$h2/addr_assign_type
+	check_fail $? "Permanent address assign type for $h2 is set when F_MAC feature is filtered"
+
+	ping_do $h1 $H2_IPV4
+	check_err $? "Ping failed"
+
+	log_test "$test_name"
+}
+
+setup_prepare()
+{
+	virtio_device_rebind $h1
+	virtio_device_rebind $h2
+	wait_for_dev $h1
+	wait_for_dev $h2
+
+	vrf_prepare
+
+	h1_create
+	h2_create
+}
+
+setup_cleanup()
+{
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+
+	virtio_filter_features_clear $h1
+	virtio_filter_features_clear $h2
+	virtio_device_rebind $h1
+	virtio_device_rebind $h2
+	wait_for_dev $h1
+	wait_for_dev $h2
+}
+
+cleanup()
+{
+	pre_cleanup
+	setup_cleanup
+}
+
+check_driver $h1 "virtio_net"
+check_driver $h2 "virtio_net"
+check_virtio_debugfs $h1
+check_virtio_debugfs $h2
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit "$EXIT_STATUS"
diff --git a/tools/testing/selftests/drivers/net/virtio_net/config b/tools/testing/selftests/drivers/net/virtio_net/config
new file mode 100644
index 000000000000..f35de0542b60
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/virtio_net/config
@@ -0,0 +1,2 @@
+CONFIG_VIRTIO_NET=y
+CONFIG_VIRTIO_DEBUG=y
diff --git a/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh
new file mode 100644
index 000000000000..57bd8055e2e5
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/virtio_net/virtio_net_common.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This assumes running on a host with two virtio interfaces connected
+# back to back. Example script to do such wire-up of tap devices would
+# look like this:
+#
+# =======================================================================================================
+# #!/bin/bash
+#
+# DEV1="$1"
+# DEV2="$2"
+#
+# sudo tc qdisc add dev $DEV1 clsact
+# sudo tc qdisc add dev $DEV2 clsact
+# sudo tc filter add dev $DEV1 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV2
+# sudo tc filter add dev $DEV2 ingress protocol all pref 1 matchall action mirred egress redirect dev $DEV1
+# sudo ip link set $DEV1 up
+# sudo ip link set $DEV2 up
+# =======================================================================================================
+
+REQUIRE_MZ="no"
+NETIF_CREATE="no"
+NETIF_FIND_DRIVER="virtio_net"
+NUM_NETIFS=2
+
+H1_IPV4="192.0.2.1"
+H2_IPV4="192.0.2.2"
+H1_IPV6="2001:db8:1::1"
+H2_IPV6="2001:db8:1::2"
+
+VIRTIO_NET_F_MAC=5
+
+virtio_device_get()
+{
+	local dev=$1; shift
+	local device_path="/sys/class/net/$dev/device/"
+
+	basename `realpath $device_path`
+}
+
+virtio_device_rebind()
+{
+	local dev=$1; shift
+	local device=`virtio_device_get $dev`
+
+	echo "$device" > /sys/bus/virtio/drivers/virtio_net/unbind
+	echo "$device" > /sys/bus/virtio/drivers/virtio_net/bind
+}
+
+virtio_debugfs_get()
+{
+	local dev=$1; shift
+	local device=`virtio_device_get $dev`
+
+	echo /sys/kernel/debug/virtio/$device/
+}
+
+check_virtio_debugfs()
+{
+	local dev=$1; shift
+	local debugfs=`virtio_debugfs_get $dev`
+
+	if [ ! -f "$debugfs/device_features" ] ||
+	   [ ! -f "$debugfs/filter_feature_add"  ] ||
+	   [ ! -f "$debugfs/filter_feature_del"  ] ||
+	   [ ! -f "$debugfs/filter_features"  ] ||
+	   [ ! -f "$debugfs/filter_features_clear"  ]; then
+		echo "SKIP: not possible to access debugfs for $dev"
+		exit $ksft_skip
+	fi
+}
+
+virtio_feature_present()
+{
+	local dev=$1; shift
+	local feature=$1; shift
+	local debugfs=`virtio_debugfs_get $dev`
+
+	cat $debugfs/device_features |grep "^$feature$" &> /dev/null
+	return $?
+}
+
+virtio_filter_features_clear()
+{
+	local dev=$1; shift
+	local debugfs=`virtio_debugfs_get $dev`
+
+	echo "1" > $debugfs/filter_features_clear
+}
+
+virtio_filter_feature_add()
+{
+	local dev=$1; shift
+	local feature=$1; shift
+	local debugfs=`virtio_debugfs_get $dev`
+
+	echo "$feature" > $debugfs/filter_feature_add
+}
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index a0b8688b0836..fb4472ddffd8 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -19,8 +19,8 @@ include ../lib.mk
 
 $(OUTPUT)/subdir:
 	mkdir -p $@
-$(OUTPUT)/script:
-	echo '#!/bin/sh' > $@
+$(OUTPUT)/script: Makefile
+	echo '#!/bin/bash' > $@
 	echo 'exit $$*' >> $@
 	chmod +x $@
 $(OUTPUT)/execveat.symlink: $(OUTPUT)/execveat
diff --git a/tools/testing/selftests/exec/binfmt_script.py b/tools/testing/selftests/exec/binfmt_script.py
index 05f94a741c7a..2c575a2c0eab 100755
--- a/tools/testing/selftests/exec/binfmt_script.py
+++ b/tools/testing/selftests/exec/binfmt_script.py
@@ -16,6 +16,8 @@ SIZE=256
 NAME_MAX=int(subprocess.check_output(["getconf", "NAME_MAX", "."]))
 
 test_num=0
+pass_num=0
+fail_num=0
 
 code='''#!/usr/bin/perl
 print "Executed interpreter! Args:\n";
@@ -42,7 +44,7 @@ foreach my $a (@ARGV) {
 # ...
 def test(name, size, good=True, leading="", root="./", target="/perl",
                      fill="A", arg="", newline="\n", hashbang="#!"):
-    global test_num, tests, NAME_MAX
+    global test_num, pass_num, fail_num, tests, NAME_MAX
     test_num += 1
     if test_num > tests:
         raise ValueError("more binfmt_script tests than expected! (want %d, expected %d)"
@@ -80,16 +82,20 @@ def test(name, size, good=True, leading="", root="./", target="/perl",
         if good:
             print("ok %d - binfmt_script %s (successful good exec)"
                   % (test_num, name))
+            pass_num += 1
         else:
             print("not ok %d - binfmt_script %s succeeded when it should have failed"
                   % (test_num, name))
+            fail_num = 1
     else:
         if good:
             print("not ok %d - binfmt_script %s failed when it should have succeeded (rc:%d)"
                   % (test_num, name, proc.returncode))
+            fail_num = 1
         else:
             print("ok %d - binfmt_script %s (correctly failed bad exec)"
                   % (test_num, name))
+            pass_num += 1
 
     # Clean up crazy binaries
     os.unlink(script)
@@ -166,6 +172,8 @@ test(name="two-under-trunc-arg", size=int(SIZE/2), arg=" ")
 test(name="two-under-leading",   size=int(SIZE/2), leading=" ")
 test(name="two-under-lead-trunc-arg", size=int(SIZE/2), leading=" ", arg=" ")
 
+print("# Totals: pass:%d fail:%d xfail:0 xpass:0 skip:0 error:0" % (pass_num, fail_num))
+
 if test_num != tests:
     raise ValueError("fewer binfmt_script tests than expected! (ran %d, expected %d"
                      % (test_num, tests))
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c
index 0546ca24f2b2..6418ded40bdd 100644
--- a/tools/testing/selftests/exec/execveat.c
+++ b/tools/testing/selftests/exec/execveat.c
@@ -98,10 +98,9 @@ static int check_execveat_invoked_rc(int fd, const char *path, int flags,
 	if (child == 0) {
 		/* Child: do execveat(). */
 		rc = execveat_(fd, path, argv, envp, flags);
-		ksft_print_msg("execveat() failed, rc=%d errno=%d (%s)\n",
+		ksft_print_msg("child execveat() failed, rc=%d errno=%d (%s)\n",
 			       rc, errno, strerror(errno));
-		ksft_test_result_fail("%s\n", test_name);
-		exit(1);  /* should not reach here */
+		exit(errno);
 	}
 	/* Parent: wait for & check child's exit status. */
 	rc = waitpid(child, &status, 0);
@@ -226,11 +225,14 @@ static int check_execveat_pathmax(int root_dfd, const char *src, int is_script)
 	 * "If the command name is found, but it is not an executable utility,
 	 * the exit status shall be 126."), so allow either.
 	 */
-	if (is_script)
+	if (is_script) {
+		ksft_print_msg("Invoke script via root_dfd and relative filename\n");
 		fail += check_execveat_invoked_rc(root_dfd, longpath + 1, 0,
 						  127, 126);
-	else
+	} else {
+		ksft_print_msg("Invoke exec via root_dfd and relative filename\n");
 		fail += check_execveat(root_dfd, longpath + 1, 0);
+	}
 
 	return fail;
 }
diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c
index d487c2f6a615..17e3207d34ae 100644
--- a/tools/testing/selftests/exec/load_address.c
+++ b/tools/testing/selftests/exec/load_address.c
@@ -5,6 +5,7 @@
 #include <link.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include "../kselftest.h"
 
 struct Statistics {
 	unsigned long long load_address;
@@ -41,28 +42,23 @@ int main(int argc, char **argv)
 	unsigned long long misalign;
 	int ret;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	ret = dl_iterate_phdr(ExtractStatistics, &extracted);
-	if (ret != 1) {
-		fprintf(stderr, "FAILED\n");
-		return 1;
-	}
+	if (ret != 1)
+		ksft_exit_fail_msg("FAILED: dl_iterate_phdr\n");
 
-	if (extracted.alignment == 0) {
-		fprintf(stderr, "No alignment found\n");
-		return 1;
-	} else if (extracted.alignment & (extracted.alignment - 1)) {
-		fprintf(stderr, "Alignment is not a power of 2\n");
-		return 1;
-	}
+	if (extracted.alignment == 0)
+		ksft_exit_fail_msg("FAILED: No alignment found\n");
+	else if (extracted.alignment & (extracted.alignment - 1))
+		ksft_exit_fail_msg("FAILED: Alignment is not a power of 2\n");
 
 	misalign = extracted.load_address & (extracted.alignment - 1);
-	if (misalign) {
-		printf("alignment = %llu, load_address = %llu\n",
-			extracted.alignment, extracted.load_address);
-		fprintf(stderr, "FAILED\n");
-		return 1;
-	}
+	if (misalign)
+		ksft_exit_fail_msg("FAILED: alignment = %llu, load_address = %llu\n",
+				   extracted.alignment, extracted.load_address);
 
-	fprintf(stderr, "PASS\n");
-	return 0;
+	ksft_test_result_pass("Completed\n");
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c
index 2dbd5bc45b3e..438c8ff2fd26 100644
--- a/tools/testing/selftests/exec/recursion-depth.c
+++ b/tools/testing/selftests/exec/recursion-depth.c
@@ -23,45 +23,44 @@
 #include <fcntl.h>
 #include <sys/mount.h>
 #include <unistd.h>
+#include "../kselftest.h"
 
 int main(void)
 {
+	int fd, rv;
+
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	if (unshare(CLONE_NEWNS) == -1) {
 		if (errno == ENOSYS || errno == EPERM) {
-			fprintf(stderr, "error: unshare, errno %d\n", errno);
-			return 4;
+			ksft_test_result_skip("error: unshare, errno %d\n", errno);
+			ksft_finished();
 		}
-		fprintf(stderr, "error: unshare, errno %d\n", errno);
-		return 1;
-	}
-	if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
-		fprintf(stderr, "error: mount '/', errno %d\n", errno);
-		return 1;
+		ksft_exit_fail_perror("error: unshare");
 	}
+
+	if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL) == -1)
+		ksft_exit_fail_perror("error: mount '/'");
+
 	/* Require "exec" filesystem. */
-	if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) {
-		fprintf(stderr, "error: mount ramfs, errno %d\n", errno);
-		return 1;
-	}
+	if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1)
+		ksft_exit_fail_perror("error: mount ramfs");
 
 #define FILENAME "/tmp/1"
 
-	int fd = creat(FILENAME, 0700);
-	if (fd == -1) {
-		fprintf(stderr, "error: creat, errno %d\n", errno);
-		return 1;
-	}
+	fd = creat(FILENAME, 0700);
+	if (fd == -1)
+		ksft_exit_fail_perror("error: creat");
+
 #define S "#!" FILENAME "\n"
-	if (write(fd, S, strlen(S)) != strlen(S)) {
-		fprintf(stderr, "error: write, errno %d\n", errno);
-		return 1;
-	}
+	if (write(fd, S, strlen(S)) != strlen(S))
+		ksft_exit_fail_perror("error: write");
+
 	close(fd);
 
-	int rv = execve(FILENAME, NULL, NULL);
-	if (rv == -1 && errno == ELOOP) {
-		return 0;
-	}
-	fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno);
-	return 1;
+	rv = execve(FILENAME, NULL, NULL);
+	ksft_test_result(rv == -1 && errno == ELOOP,
+			 "execve failed as expected (ret %d, errno %d)\n", rv, errno);
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile
index c2f7cef919c0..eb4c3b411934 100644
--- a/tools/testing/selftests/filesystems/binderfs/Makefile
+++ b/tools/testing/selftests/filesystems/binderfs/Makefile
@@ -3,6 +3,4 @@
 CFLAGS += $(KHDR_INCLUDES) -pthread
 TEST_GEN_PROGS := binderfs_test
 
-binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h
-
 include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/eventfd/.gitignore b/tools/testing/selftests/filesystems/eventfd/.gitignore
new file mode 100644
index 000000000000..483faf59fe4a
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+eventfd_test
diff --git a/tools/testing/selftests/filesystems/eventfd/Makefile b/tools/testing/selftests/filesystems/eventfd/Makefile
new file mode 100644
index 000000000000..0a8e3910df15
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lpthread
+TEST_GEN_PROGS := eventfd_test
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
new file mode 100644
index 000000000000..f142a137526c
--- /dev/null
+++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <linux/time_types.h>
+#include <unistd.h>
+#include <assert.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include "../../kselftest_harness.h"
+
+struct error {
+	int  code;
+	char msg[512];
+};
+
+static int error_set(struct error *err, int code, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	if (code == 0 || !err || err->code != 0)
+		return code;
+
+	err->code = code;
+	va_start(args, fmt);
+	r = vsnprintf(err->msg, sizeof(err->msg), fmt, args);
+	assert((size_t)r < sizeof(err->msg));
+	va_end(args);
+
+	return code;
+}
+
+static inline int sys_eventfd2(unsigned int count, int flags)
+{
+	return syscall(__NR_eventfd2, count, flags);
+}
+
+TEST(eventfd01)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, 0);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFL);
+	// since the kernel automatically added O_RDWR.
+	EXPECT_EQ(flags, O_RDWR);
+
+	close(fd);
+}
+
+TEST(eventfd02)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, EFD_CLOEXEC);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFD);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags, FD_CLOEXEC);
+
+	close(fd);
+}
+
+TEST(eventfd03)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, EFD_NONBLOCK);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFL);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK);
+	EXPECT_EQ(flags & O_RDWR, O_RDWR);
+
+	close(fd);
+}
+
+TEST(eventfd04)
+{
+	int fd, flags;
+
+	fd = sys_eventfd2(0, EFD_CLOEXEC|EFD_NONBLOCK);
+	ASSERT_GE(fd, 0);
+
+	flags = fcntl(fd, F_GETFL);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags & EFD_NONBLOCK, EFD_NONBLOCK);
+	EXPECT_EQ(flags & O_RDWR, O_RDWR);
+
+	flags = fcntl(fd, F_GETFD);
+	ASSERT_GT(flags, -1);
+	EXPECT_EQ(flags, FD_CLOEXEC);
+
+	close(fd);
+}
+
+static inline void trim_newline(char *str)
+{
+	char *pos = strrchr(str, '\n');
+
+	if (pos)
+		*pos = '\0';
+}
+
+static int verify_fdinfo(int fd, struct error *err, const char *prefix,
+		size_t prefix_len, const char *expect, ...)
+{
+	char buffer[512] = {0, };
+	char path[512] = {0, };
+	va_list args;
+	FILE *f;
+	char *line = NULL;
+	size_t n = 0;
+	int found = 0;
+	int r;
+
+	va_start(args, expect);
+	r = vsnprintf(buffer, sizeof(buffer), expect, args);
+	assert((size_t)r < sizeof(buffer));
+	va_end(args);
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
+	f = fopen(path, "re");
+	if (!f)
+		return error_set(err, -1, "fdinfo open failed for %d", fd);
+
+	while (getline(&line, &n, f) != -1) {
+		char *val;
+
+		if (strncmp(line, prefix, prefix_len))
+			continue;
+
+		found = 1;
+
+		val = line + prefix_len;
+		r = strcmp(val, buffer);
+		if (r != 0) {
+			trim_newline(line);
+			trim_newline(buffer);
+			error_set(err, -1, "%s '%s' != '%s'",
+				  prefix, val, buffer);
+		}
+		break;
+	}
+
+	free(line);
+	fclose(f);
+
+	if (found == 0)
+		return error_set(err, -1, "%s not found for fd %d",
+				 prefix, fd);
+
+	return 0;
+}
+
+TEST(eventfd05)
+{
+	struct error err = {0};
+	int fd, ret;
+
+	fd = sys_eventfd2(0, EFD_SEMAPHORE);
+	ASSERT_GE(fd, 0);
+
+	ret = fcntl(fd, F_GETFL);
+	ASSERT_GT(ret, -1);
+	EXPECT_EQ(ret & O_RDWR, O_RDWR);
+
+	// The semaphore could only be obtained from fdinfo.
+	ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n");
+	if (ret != 0)
+		ksft_print_msg("eventfd-semaphore check failed, msg: %s\n",
+				err.msg);
+	EXPECT_EQ(ret, 0);
+
+	close(fd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index 3eafd7da58e2..e6d7c4f1c85b 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -3,6 +3,7 @@
 #define _GNU_SOURCE
 
 #include <assert.h>
+#include <stddef.h>
 #include <stdint.h>
 #include <sched.h>
 #include <fcntl.h>
diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 25d4e0fca385..cce72f8b03dc 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -255,7 +255,13 @@ prlog() { # messages
   [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE
 }
 catlog() { #file
-  cat $1
+  if [ "${KTAP}" = "1" ]; then
+    cat $1 | while read line ; do
+      echo "# $line"
+    done
+  else
+    cat $1
+  fi
   [ "$LOG_FILE" ] && cat $1 | strip_esc >> $LOG_FILE
 }
 prlog "=== Ftrace unit tests ==="
diff --git a/tools/testing/selftests/ftrace/ftracetest-ktap b/tools/testing/selftests/ftrace/ftracetest-ktap
index b3284679ef3a..14e62ef3f3b9 100755
--- a/tools/testing/selftests/ftrace/ftracetest-ktap
+++ b/tools/testing/selftests/ftrace/ftracetest-ktap
@@ -5,4 +5,4 @@
 #
 # Copyright (C) Arm Ltd., 2023
 
-./ftracetest -K
+./ftracetest -K -v
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc
index b9c21a81d248..c0cdad4c400e 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_btfarg.tc
@@ -53,7 +53,7 @@ fi
 
 echo > dynamic_events
 
-if [ "$FIELDS" ] ; then
+if [ "$FIELDS" -a "$FPROBES" ] ; then
 echo "t:tpevent ${TP2} obj_size=s->object_size" >> dynamic_events
 echo "f:fpevent ${TP3}%return path=\$retval->name:string" >> dynamic_events
 echo "t:tpevent2 ${TP4} p->se.group_node.next->prev" >> dynamic_events
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc
new file mode 100644
index 000000000000..1e251ce2998e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_entry_arg.tc
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Function return probe entry argument access
+# requires: dynamic_events 'f[:[<group>/][<event>]] <func-name>':README 'kernel return probes support:':README
+
+echo 'f:tests/myevent1 vfs_open arg=$arg1' >> dynamic_events
+echo 'f:tests/myevent2 vfs_open%return arg=$arg1' >> dynamic_events
+
+echo 1 > events/tests/enable
+
+echo > trace
+cat trace > /dev/null
+
+streq() {
+	test $1 = $2
+}
+
+streq `grep -A 1 -m 1 myevent1 trace | sed -r 's/^.*(arg=.*)/\1/' `
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
index 20e42c030095..61877d166451 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
@@ -34,7 +34,9 @@ check_error 'f vfs_read ^$stack10000'	# BAD_STACK_NUM
 
 check_error 'f vfs_read ^$arg10000'	# BAD_ARG_NUM
 
+if !grep -q 'kernel return probes support:' README; then
 check_error 'f vfs_read $retval ^$arg1' # BAD_VAR
+fi
 check_error 'f vfs_read ^$none_var'	# BAD_VAR
 check_error 'f vfs_read ^'$REG		# BAD_VAR
 
@@ -99,7 +101,9 @@ if grep -q "<argname>" README; then
 check_error 'f vfs_read args=^$arg*'		# BAD_VAR_ARGS
 check_error 'f vfs_read +0(^$arg*)'		# BAD_VAR_ARGS
 check_error 'f vfs_read $arg* ^$arg*'		# DOUBLE_ARGS
+if !grep -q 'kernel return probes support:' README; then
 check_error 'f vfs_read%return ^$arg*'		# NOFENTRY_ARGS
+fi
 check_error 'f vfs_read ^hoge'			# NO_BTFARG
 check_error 'f kfree ^$arg10'			# NO_BTFARG (exceed the number of parameters)
 check_error 'f kfree%return ^$retval'		# NO_RETVAL
diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc
index b1ede6249866..b7c8f29c09a9 100644
--- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc
+++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc
@@ -18,7 +18,7 @@ echo 'sched:*' > set_event
 
 yield
 
-count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
+count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
 if [ $count -lt 3 ]; then
     fail "at least fork, exec and exit events should be recorded"
 fi
@@ -29,7 +29,7 @@ echo 1 > events/sched/enable
 
 yield
 
-count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
+count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
 if [ $count -lt 3 ]; then
     fail "at least fork, exec and exit events should be recorded"
 fi
@@ -40,7 +40,7 @@ echo 0 > events/sched/enable
 
 yield
 
-count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
+count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
 if [ $count -ne 0 ]; then
     fail "any of scheduler events should not be recorded"
 fi
diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
index 2de7c61d1ae3..3f74c09c56b6 100644
--- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
+++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
@@ -24,7 +24,7 @@ echo 0 > events/enable
 echo "Get the most frequently calling function"
 sample_events
 
-target_func=`cut -d: -f3 trace | sed 's/call_site=\([^+]*\)+0x.*/\1/' | sort | uniq -c | sort | tail -n 1 | sed 's/^[ 0-9]*//'`
+target_func=`cat trace | grep -o 'call_site=\([^+]*\)' | sed 's/call_site=//' | sort | uniq -c | sort | tail -n 1 | sed 's/^[ 0-9]*//'`
 if [ -z "$target_func" ]; then
     exit_fail
 fi
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
index 25432b8cd5bd..073a748b9380 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
@@ -19,7 +19,7 @@ fail() { # mesg
 
 FILTER=set_ftrace_filter
 FUNC1="schedule"
-FUNC2="scheduler_tick"
+FUNC2="sched_tick"
 
 ALL_FUNCS="#### all functions enabled ####"
 
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
index 65fbb26fd58c..a16c6a6f6055 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
@@ -108,7 +108,9 @@ if grep -q "<argname>" README; then
 check_error 'p vfs_read args=^$arg*'		# BAD_VAR_ARGS
 check_error 'p vfs_read +0(^$arg*)'		# BAD_VAR_ARGS
 check_error 'p vfs_read $arg* ^$arg*'		# DOUBLE_ARGS
+if !grep -q 'kernel return probes support:' README; then
 check_error 'r vfs_read ^$arg*'			# NOFENTRY_ARGS
+fi
 check_error 'p vfs_read+8 ^$arg*'		# NOFENTRY_ARGS
 check_error 'p vfs_read ^hoge'			# NO_BTFARG
 check_error 'p kfree ^$arg10'			# NO_BTFARG (exceed the number of parameters)
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc
new file mode 100644
index 000000000000..e50470b53164
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_entry_arg.tc
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kretprobe entry argument access
+# requires: kprobe_events 'kernel return probes support:':README
+
+echo 'p:myevent1 vfs_open arg=$arg1' >> kprobe_events
+echo 'r:myevent2 vfs_open arg=$arg1' >> kprobe_events
+
+echo 1 > events/kprobes/enable
+
+echo > trace
+cat trace > /dev/null
+
+streq() {
+	test $1 = $2
+}
+
+streq `grep -A 1 -m 1 myevent1 trace | sed -r 's/^.*(arg=.*)/\1/' `
diff --git a/tools/testing/selftests/gpio/gpio-mockup.sh b/tools/testing/selftests/gpio/gpio-mockup.sh
index 0d6c5f7f95d2..fc2dd4c24d06 100755
--- a/tools/testing/selftests/gpio/gpio-mockup.sh
+++ b/tools/testing/selftests/gpio/gpio-mockup.sh
@@ -377,13 +377,10 @@ if [ "$full_test" ]; then
 	insmod_test "0,32,32,44,-1,22,-1,31" 32 12 22 31
 fi
 echo "2.  Module load error tests"
-echo "2.1 gpio overflow"
-# Currently: The max number of gpio(1024) is defined in arm architecture.
-insmod_test "-1,1024"
+echo "2.1 no lines defined"
+insmod_test "0,0"
 if [ "$full_test" ]; then
-	echo "2.2 no lines defined"
-	insmod_test "0,0"
-	echo "2.3 ignore range overlap"
+	echo "2.2 ignore range overlap"
 	insmod_test "0,32,0,1" 32
 	insmod_test "0,32,1,5" 32
 	insmod_test "0,32,30,35" 32
diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common
index 0f456dbab62f..45b5570441ce 100644
--- a/tools/testing/selftests/hid/config.common
+++ b/tools/testing/selftests/hid/config.common
@@ -238,3 +238,4 @@ CONFIG_VLAN_8021Q=y
 CONFIG_XFRM_SUB_POLICY=y
 CONFIG_XFRM_USER=y
 CONFIG_ZEROPLUS_FF=y
+CONFIG_KASAN=y
diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index 2cf96f818f25..f825623e3edc 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -16,6 +16,11 @@
 
 #define SHOW_UHID_DEBUG 0
 
+#define min(a, b) \
+	({ __typeof__(a) _a = (a); \
+	__typeof__(b) _b = (b); \
+	_a < _b ? _a : _b; })
+
 static unsigned char rdesc[] = {
 	0x06, 0x00, 0xff,	/* Usage Page (Vendor Defined Page 1) */
 	0x09, 0x21,		/* Usage (Vendor Usage 0x21) */
@@ -111,6 +116,10 @@ struct hid_hw_request_syscall_args {
 static pthread_mutex_t uhid_started_mtx = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t uhid_started = PTHREAD_COND_INITIALIZER;
 
+static pthread_mutex_t uhid_output_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t uhid_output_cond = PTHREAD_COND_INITIALIZER;
+static unsigned char output_report[10];
+
 /* no need to protect uhid_stopped, only one thread accesses it */
 static bool uhid_stopped;
 
@@ -205,6 +214,13 @@ static int uhid_event(struct __test_metadata *_metadata, int fd)
 		break;
 	case UHID_OUTPUT:
 		UHID_LOG("UHID_OUTPUT from uhid-dev");
+
+		pthread_mutex_lock(&uhid_output_mtx);
+		memcpy(output_report,
+		       ev.u.output.data,
+		       min(ev.u.output.size, sizeof(output_report)));
+		pthread_cond_signal(&uhid_output_cond);
+		pthread_mutex_unlock(&uhid_output_mtx);
 		break;
 	case UHID_GET_REPORT:
 		UHID_LOG("UHID_GET_REPORT from uhid-dev");
@@ -734,8 +750,100 @@ TEST_F(hid_bpf, test_hid_change_report)
 }
 
 /*
- * Attach hid_user_raw_request to the given uhid device,
- * call the bpf program from userspace
+ * Call hid_bpf_input_report against the given uhid device,
+ * check that the program is called and does the expected.
+ */
+TEST_F(hid_bpf, test_hid_user_input_report_call)
+{
+	struct hid_hw_request_syscall_args args = {
+		.retval = -1,
+		.size = 10,
+	};
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs,
+			    .ctx_in = &args,
+			    .ctx_size_in = sizeof(args),
+	);
+	__u8 buf[10] = {0};
+	int err, prog_fd;
+
+	LOAD_BPF;
+
+	args.hid = self->hid_id;
+	args.data[0] = 1; /* report ID */
+	args.data[1] = 2; /* report ID */
+	args.data[2] = 42; /* report ID */
+
+	prog_fd = bpf_program__fd(self->skel->progs.hid_user_input_report);
+
+	/* check that there is no data to read from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, -1) TH_LOG("read_hidraw");
+
+	err = bpf_prog_test_run_opts(prog_fd, &tattrs);
+
+	ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts");
+
+	ASSERT_EQ(args.retval, 0);
+
+	/* read the data from hidraw */
+	memset(buf, 0, sizeof(buf));
+	err = read(self->hidraw_fd, buf, sizeof(buf));
+	ASSERT_EQ(err, 6) TH_LOG("read_hidraw");
+	ASSERT_EQ(buf[0], 1);
+	ASSERT_EQ(buf[1], 2);
+	ASSERT_EQ(buf[2], 42);
+}
+
+/*
+ * Call hid_bpf_hw_output_report against the given uhid device,
+ * check that the program is called and does the expected.
+ */
+TEST_F(hid_bpf, test_hid_user_output_report_call)
+{
+	struct hid_hw_request_syscall_args args = {
+		.retval = -1,
+		.size = 10,
+	};
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, tattrs,
+			    .ctx_in = &args,
+			    .ctx_size_in = sizeof(args),
+	);
+	int err, cond_err, prog_fd;
+	struct timespec time_to_wait;
+
+	LOAD_BPF;
+
+	args.hid = self->hid_id;
+	args.data[0] = 1; /* report ID */
+	args.data[1] = 2; /* report ID */
+	args.data[2] = 42; /* report ID */
+
+	prog_fd = bpf_program__fd(self->skel->progs.hid_user_output_report);
+
+	pthread_mutex_lock(&uhid_output_mtx);
+
+	memset(output_report, 0, sizeof(output_report));
+	clock_gettime(CLOCK_REALTIME, &time_to_wait);
+	time_to_wait.tv_sec += 2;
+
+	err = bpf_prog_test_run_opts(prog_fd, &tattrs);
+	cond_err = pthread_cond_timedwait(&uhid_output_cond, &uhid_output_mtx, &time_to_wait);
+
+	ASSERT_OK(err) TH_LOG("error while calling bpf_prog_test_run_opts");
+	ASSERT_OK(cond_err) TH_LOG("error while calling waiting for the condition");
+
+	ASSERT_EQ(args.retval, 3);
+
+	ASSERT_EQ(output_report[0], 1);
+	ASSERT_EQ(output_report[1], 2);
+	ASSERT_EQ(output_report[2], 42);
+
+	pthread_mutex_unlock(&uhid_output_mtx);
+}
+
+/*
+ * Call hid_hw_raw_request against the given uhid device,
  * check that the program is called and does the expected.
  */
 TEST_F(hid_bpf, test_hid_user_raw_request_call)
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index 1e558826b809..f67d35def142 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -101,6 +101,52 @@ int hid_user_raw_request(struct hid_hw_request_syscall_args *args)
 	return 0;
 }
 
+SEC("syscall")
+int hid_user_output_report(struct hid_hw_request_syscall_args *args)
+{
+	struct hid_bpf_ctx *ctx;
+	const size_t size = args->size;
+	int i, ret = 0;
+
+	if (size > sizeof(args->data))
+		return -7; /* -E2BIG */
+
+	ctx = hid_bpf_allocate_context(args->hid);
+	if (!ctx)
+		return -1; /* EPERM check */
+
+	ret = hid_bpf_hw_output_report(ctx,
+				       args->data,
+				       size);
+	args->retval = ret;
+
+	hid_bpf_release_context(ctx);
+
+	return 0;
+}
+
+SEC("syscall")
+int hid_user_input_report(struct hid_hw_request_syscall_args *args)
+{
+	struct hid_bpf_ctx *ctx;
+	const size_t size = args->size;
+	int i, ret = 0;
+
+	if (size > sizeof(args->data))
+		return -7; /* -E2BIG */
+
+	ctx = hid_bpf_allocate_context(args->hid);
+	if (!ctx)
+		return -1; /* EPERM check */
+
+	ret = hid_bpf_input_report(ctx, HID_INPUT_REPORT, args->data, size);
+	args->retval = ret;
+
+	hid_bpf_release_context(ctx);
+
+	return 0;
+}
+
 static const __u8 rdesc[] = {
 	0x05, 0x01,				/* USAGE_PAGE (Generic Desktop) */
 	0x09, 0x32,				/* USAGE (Z) */
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
index 65e657ac1198..9cd56821d0f1 100644
--- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -94,5 +94,11 @@ extern int hid_bpf_hw_request(struct hid_bpf_ctx *ctx,
 			      size_t buf__sz,
 			      enum hid_report_type type,
 			      enum hid_class_request reqtype) __ksym;
+extern int hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx,
+				    __u8 *buf, size_t buf__sz) __ksym;
+extern int hid_bpf_input_report(struct hid_bpf_ctx *ctx,
+				enum hid_report_type type,
+				__u8 *data,
+				size_t buf__sz) __ksym;
 
 #endif /* __HID_BPF_HELPERS_H */
diff --git a/tools/testing/selftests/hid/tests/base.py b/tools/testing/selftests/hid/tests/base.py
index 51433063b227..3a465768e507 100644
--- a/tools/testing/selftests/hid/tests/base.py
+++ b/tools/testing/selftests/hid/tests/base.py
@@ -8,11 +8,13 @@
 import libevdev
 import os
 import pytest
+import shutil
+import subprocess
 import time
 
 import logging
 
-from hidtools.device.base_device import BaseDevice, EvdevMatch, SysfsFile
+from .base_device import BaseDevice, EvdevMatch, SysfsFile
 from pathlib import Path
 from typing import Final, List, Tuple
 
@@ -157,6 +159,17 @@ class BaseTestCase:
         # for example ("playstation", "hid-playstation")
         kernel_modules: List[Tuple[str, str]] = []
 
+        # List of in kernel HID-BPF object files to load
+        # before starting the test
+        # Any existing pre-loaded HID-BPF module will be removed
+        # before the ones in this list will be manually loaded.
+        # Each Element is a tuple '(hid_bpf_object, rdesc_fixup_present)',
+        # for example '("xppen-ArtistPro16Gen2.bpf.o", True)'
+        # If 'rdesc_fixup_present' is True, the test needs to wait
+        # for one unbind and rebind before it can be sure the kernel is
+        # ready
+        hid_bpfs: List[Tuple[str, bool]] = []
+
         def assertInputEventsIn(self, expected_events, effective_events):
             effective_events = effective_events.copy()
             for ev in expected_events:
@@ -211,8 +224,6 @@ class BaseTestCase:
                 # we don't know beforehand the name of the module from modinfo
                 sysfs_path = Path("/sys/module") / kernel_module.replace("-", "_")
             if not sysfs_path.exists():
-                import subprocess
-
                 ret = subprocess.run(["/usr/sbin/modprobe", kernel_module])
                 if ret.returncode != 0:
                     pytest.skip(
@@ -225,6 +236,64 @@ class BaseTestCase:
                 self._load_kernel_module(kernel_driver, kernel_module)
             yield
 
+        def load_hid_bpfs(self):
+            script_dir = Path(os.path.dirname(os.path.realpath(__file__)))
+            root_dir = (script_dir / "../../../../..").resolve()
+            bpf_dir = root_dir / "drivers/hid/bpf/progs"
+
+            udev_hid_bpf = shutil.which("udev-hid-bpf")
+            if not udev_hid_bpf:
+                pytest.skip("udev-hid-bpf not found in $PATH, skipping")
+
+            wait = False
+            for _, rdesc_fixup in self.hid_bpfs:
+                if rdesc_fixup:
+                    wait = True
+
+            for hid_bpf, _ in self.hid_bpfs:
+                # We need to start `udev-hid-bpf` in the background
+                # and dispatch uhid events in case the kernel needs
+                # to fetch features on the device
+                process = subprocess.Popen(
+                    [
+                        "udev-hid-bpf",
+                        "--verbose",
+                        "add",
+                        str(self.uhdev.sys_path),
+                        str(bpf_dir / hid_bpf),
+                    ],
+                )
+                while process.poll() is None:
+                    self.uhdev.dispatch(1)
+
+                if process.poll() != 0:
+                    pytest.fail(
+                        f"Couldn't insert hid-bpf program '{hid_bpf}', marking the test as failed"
+                    )
+
+            if wait:
+                # the HID-BPF program exports a rdesc fixup, so it needs to be
+                # unbound by the kernel and then rebound.
+                # Ensure we get the bound event exactly 2 times (one for the normal
+                # uhid loading, and then the reload from HID-BPF)
+                now = time.time()
+                while self.uhdev.kernel_ready_count < 2 and time.time() - now < 2:
+                    self.uhdev.dispatch(1)
+
+                if self.uhdev.kernel_ready_count < 2:
+                    pytest.fail(
+                        f"Couldn't insert hid-bpf programs, marking the test as failed"
+                    )
+
+        def unload_hid_bpfs(self):
+            ret = subprocess.run(
+                ["udev-hid-bpf", "--verbose", "remove", str(self.uhdev.sys_path)],
+            )
+            if ret.returncode != 0:
+                pytest.fail(
+                    f"Couldn't unload hid-bpf programs, marking the test as failed"
+                )
+
         @pytest.fixture()
         def new_uhdev(self, load_kernel_module):
             return self.create_device()
@@ -248,12 +317,18 @@ class BaseTestCase:
                         now = time.time()
                         while not self.uhdev.is_ready() and time.time() - now < 5:
                             self.uhdev.dispatch(1)
+
+                        if self.hid_bpfs:
+                            self.load_hid_bpfs()
+
                         if self.uhdev.get_evdev() is None:
                             logger.warning(
                                 f"available list of input nodes: (default application is '{self.uhdev.application}')"
                             )
                             logger.warning(self.uhdev.input_nodes)
                         yield
+                        if self.hid_bpfs:
+                            self.unload_hid_bpfs()
                         self.uhdev = None
             except PermissionError:
                 pytest.skip("Insufficient permissions, run me as root")
@@ -313,8 +388,6 @@ class HIDTestUdevRule(object):
             self.reload_udev_rules()
 
     def reload_udev_rules(self):
-        import subprocess
-
         subprocess.run("udevadm control --reload-rules".split())
         subprocess.run("systemd-hwdb update".split())
 
@@ -330,10 +403,11 @@ class HIDTestUdevRule(object):
             delete=False,
         ) as f:
             f.write(
-                'KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1"\n'
-            )
-            f.write(
-                'KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1"\n'
+                """
+KERNELS=="*input*", ATTRS{name}=="*uhid test *", ENV{LIBINPUT_IGNORE_DEVICE}="1"
+KERNELS=="*hid*", ENV{HID_NAME}=="*uhid test *", ENV{HID_BPF_IGNORE_DEVICE}="1"
+KERNELS=="*input*", ATTRS{name}=="*uhid test * System Multi Axis", ENV{ID_INPUT_TOUCHSCREEN}="", ENV{ID_INPUT_SYSTEM_MULTIAXIS}="1"
+"""
             )
             self.rulesfile = f
 
diff --git a/tools/testing/selftests/hid/tests/base_device.py b/tools/testing/selftests/hid/tests/base_device.py
new file mode 100644
index 000000000000..e0515be97f83
--- /dev/null
+++ b/tools/testing/selftests/hid/tests/base_device.py
@@ -0,0 +1,421 @@
+#!/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2017 Benjamin Tissoires <benjamin.tissoires@gmail.com>
+# Copyright (c) 2017 Red Hat, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import fcntl
+import functools
+import libevdev
+import os
+
+try:
+    import pyudev
+except ImportError:
+    raise ImportError("UHID is not supported due to missing pyudev dependency")
+
+import logging
+
+import hidtools.hid as hid
+from hidtools.uhid import UHIDDevice
+from hidtools.util import BusType
+
+from pathlib import Path
+from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, Union
+
+logger = logging.getLogger("hidtools.device.base_device")
+
+
+class SysfsFile(object):
+    def __init__(self, path):
+        self.path = path
+
+    def __set_value(self, value):
+        with open(self.path, "w") as f:
+            return f.write(f"{value}\n")
+
+    def __get_value(self):
+        with open(self.path) as f:
+            return f.read().strip()
+
+    @property
+    def int_value(self) -> int:
+        return int(self.__get_value())
+
+    @int_value.setter
+    def int_value(self, v: int) -> None:
+        self.__set_value(v)
+
+    @property
+    def str_value(self) -> str:
+        return self.__get_value()
+
+    @str_value.setter
+    def str_value(self, v: str) -> None:
+        self.__set_value(v)
+
+
+class LED(object):
+    def __init__(self, sys_path):
+        self.max_brightness = SysfsFile(sys_path / "max_brightness").int_value
+        self.__brightness = SysfsFile(sys_path / "brightness")
+
+    @property
+    def brightness(self) -> int:
+        return self.__brightness.int_value
+
+    @brightness.setter
+    def brightness(self, value: int) -> None:
+        self.__brightness.int_value = value
+
+
+class PowerSupply(object):
+    """Represents Linux power_supply_class sysfs nodes."""
+
+    def __init__(self, sys_path):
+        self._capacity = SysfsFile(sys_path / "capacity")
+        self._status = SysfsFile(sys_path / "status")
+        self._type = SysfsFile(sys_path / "type")
+
+    @property
+    def capacity(self) -> int:
+        return self._capacity.int_value
+
+    @property
+    def status(self) -> str:
+        return self._status.str_value
+
+    @property
+    def type(self) -> str:
+        return self._type.str_value
+
+
+class HIDIsReady(object):
+    """
+    Companion class that binds to a kernel mechanism
+    and that allows to know when a uhid device is ready or not.
+
+    See :meth:`is_ready` for details.
+    """
+
+    def __init__(self: "HIDIsReady", uhid: UHIDDevice) -> None:
+        self.uhid = uhid
+
+    def is_ready(self: "HIDIsReady") -> bool:
+        """
+        Overwrite in subclasses: should return True or False whether
+        the attached uhid device is ready or not.
+        """
+        return False
+
+
+class UdevHIDIsReady(HIDIsReady):
+    _pyudev_context: ClassVar[Optional[pyudev.Context]] = None
+    _pyudev_monitor: ClassVar[Optional[pyudev.Monitor]] = None
+    _uhid_devices: ClassVar[Dict[int, Tuple[bool, int]]] = {}
+
+    def __init__(self: "UdevHIDIsReady", uhid: UHIDDevice) -> None:
+        super().__init__(uhid)
+        self._init_pyudev()
+
+    @classmethod
+    def _init_pyudev(cls: Type["UdevHIDIsReady"]) -> None:
+        if cls._pyudev_context is None:
+            cls._pyudev_context = pyudev.Context()
+            cls._pyudev_monitor = pyudev.Monitor.from_netlink(cls._pyudev_context)
+            cls._pyudev_monitor.filter_by("hid")
+            cls._pyudev_monitor.start()
+
+            UHIDDevice._append_fd_to_poll(
+                cls._pyudev_monitor.fileno(), cls._cls_udev_event_callback
+            )
+
+    @classmethod
+    def _cls_udev_event_callback(cls: Type["UdevHIDIsReady"]) -> None:
+        if cls._pyudev_monitor is None:
+            return
+        event: pyudev.Device
+        for event in iter(functools.partial(cls._pyudev_monitor.poll, 0.02), None):
+            if event.action not in ["bind", "remove", "unbind"]:
+                return
+
+            logger.debug(f"udev event: {event.action} -> {event}")
+
+            id = int(event.sys_path.strip().split(".")[-1], 16)
+
+            device_ready, count = cls._uhid_devices.get(id, (False, 0))
+
+            ready = event.action == "bind"
+            if not device_ready and ready:
+                count += 1
+            cls._uhid_devices[id] = (ready, count)
+
+    def is_ready(self: "UdevHIDIsReady") -> Tuple[bool, int]:
+        try:
+            return self._uhid_devices[self.uhid.hid_id]
+        except KeyError:
+            return (False, 0)
+
+
+class EvdevMatch(object):
+    def __init__(
+        self: "EvdevMatch",
+        *,
+        requires: List[Any] = [],
+        excludes: List[Any] = [],
+        req_properties: List[Any] = [],
+        excl_properties: List[Any] = [],
+    ) -> None:
+        self.requires = requires
+        self.excludes = excludes
+        self.req_properties = req_properties
+        self.excl_properties = excl_properties
+
+    def is_a_match(self: "EvdevMatch", evdev: libevdev.Device) -> bool:
+        for m in self.requires:
+            if not evdev.has(m):
+                return False
+        for m in self.excludes:
+            if evdev.has(m):
+                return False
+        for p in self.req_properties:
+            if not evdev.has_property(p):
+                return False
+        for p in self.excl_properties:
+            if evdev.has_property(p):
+                return False
+        return True
+
+
+class EvdevDevice(object):
+    """
+    Represents an Evdev node and its properties.
+    This is a stub for the libevdev devices, as they are relying on
+    uevent to get the data, saving us some ioctls to fetch the names
+    and properties.
+    """
+
+    def __init__(self: "EvdevDevice", sysfs: Path) -> None:
+        self.sysfs = sysfs
+        self.event_node: Any = None
+        self.libevdev: Optional[libevdev.Device] = None
+
+        self.uevents = {}
+        # all of the interesting properties are stored in the input uevent, so in the parent
+        # so convert the uevent file of the parent input node into a dict
+        with open(sysfs.parent / "uevent") as f:
+            for line in f.readlines():
+                key, value = line.strip().split("=")
+                self.uevents[key] = value.strip('"')
+
+        # we open all evdev nodes in order to not miss any event
+        self.open()
+
+    @property
+    def name(self: "EvdevDevice") -> str:
+        assert "NAME" in self.uevents
+
+        return self.uevents["NAME"]
+
+    @property
+    def evdev(self: "EvdevDevice") -> Path:
+        return Path("/dev/input") / self.sysfs.name
+
+    def matches_application(
+        self: "EvdevDevice", application: str, matches: Dict[str, EvdevMatch]
+    ) -> bool:
+        if self.libevdev is None:
+            return False
+
+        if application in matches:
+            return matches[application].is_a_match(self.libevdev)
+
+        logger.error(
+            f"application '{application}' is unknown, please update/fix hid-tools"
+        )
+        assert False  # hid-tools likely needs an update
+
+    def open(self: "EvdevDevice") -> libevdev.Device:
+        self.event_node = open(self.evdev, "rb")
+        self.libevdev = libevdev.Device(self.event_node)
+
+        assert self.libevdev.fd is not None
+
+        fd = self.libevdev.fd.fileno()
+        flag = fcntl.fcntl(fd, fcntl.F_GETFD)
+        fcntl.fcntl(fd, fcntl.F_SETFL, flag | os.O_NONBLOCK)
+
+        return self.libevdev
+
+    def close(self: "EvdevDevice") -> None:
+        if self.libevdev is not None and self.libevdev.fd is not None:
+            self.libevdev.fd.close()
+            self.libevdev = None
+        if self.event_node is not None:
+            self.event_node.close()
+            self.event_node = None
+
+
+class BaseDevice(UHIDDevice):
+    # default _application_matches that matches nothing. This needs
+    # to be set in the subclasses to have get_evdev() working
+    _application_matches: Dict[str, EvdevMatch] = {}
+
+    def __init__(
+        self,
+        name,
+        application,
+        rdesc_str: Optional[str] = None,
+        rdesc: Optional[Union[hid.ReportDescriptor, str, bytes]] = None,
+        input_info=None,
+    ) -> None:
+        self._kernel_is_ready: HIDIsReady = UdevHIDIsReady(self)
+        if rdesc_str is None and rdesc is None:
+            raise Exception("Please provide at least a rdesc or rdesc_str")
+        super().__init__()
+        if name is None:
+            name = f"uhid gamepad test {self.__class__.__name__}"
+        if input_info is None:
+            input_info = (BusType.USB, 1, 2)
+        self.name = name
+        self.info = input_info
+        self.default_reportID = None
+        self.opened = False
+        self.started = False
+        self.application = application
+        self._input_nodes: Optional[list[EvdevDevice]] = None
+        if rdesc is None:
+            assert rdesc_str is not None
+            self.rdesc = hid.ReportDescriptor.from_human_descr(rdesc_str)  # type: ignore
+        else:
+            self.rdesc = rdesc  # type: ignore
+
+    @property
+    def power_supply_class(self: "BaseDevice") -> Optional[PowerSupply]:
+        ps = self.walk_sysfs("power_supply", "power_supply/*")
+        if ps is None or len(ps) < 1:
+            return None
+
+        return PowerSupply(ps[0])
+
+    @property
+    def led_classes(self: "BaseDevice") -> List[LED]:
+        leds = self.walk_sysfs("led", "**/max_brightness")
+        if leds is None:
+            return []
+
+        return [LED(led.parent) for led in leds]
+
+    @property
+    def kernel_is_ready(self: "BaseDevice") -> bool:
+        return self._kernel_is_ready.is_ready()[0] and self.started
+
+    @property
+    def kernel_ready_count(self: "BaseDevice") -> int:
+        return self._kernel_is_ready.is_ready()[1]
+
+    @property
+    def input_nodes(self: "BaseDevice") -> List[EvdevDevice]:
+        if self._input_nodes is not None:
+            return self._input_nodes
+
+        if not self.kernel_is_ready or not self.started:
+            return []
+
+        self._input_nodes = [
+            EvdevDevice(path)
+            for path in self.walk_sysfs("input", "input/input*/event*")
+        ]
+        return self._input_nodes
+
+    def match_evdev_rule(self, application, evdev):
+        """Replace this in subclasses if the device has multiple reports
+        of the same type and we need to filter based on the actual evdev
+        node.
+
+        returning True will append the corresponding report to
+        `self.input_nodes[type]`
+        returning False  will ignore this report / type combination
+        for the device.
+        """
+        return True
+
+    def open(self):
+        self.opened = True
+
+    def _close_all_opened_evdev(self):
+        if self._input_nodes is not None:
+            for e in self._input_nodes:
+                e.close()
+
+    def __del__(self):
+        self._close_all_opened_evdev()
+
+    def close(self):
+        self.opened = False
+
+    def start(self, flags):
+        self.started = True
+
+    def stop(self):
+        self.started = False
+        self._close_all_opened_evdev()
+
+    def next_sync_events(self, application=None):
+        evdev = self.get_evdev(application)
+        if evdev is not None:
+            return list(evdev.events())
+        return []
+
+    @property
+    def application_matches(self: "BaseDevice") -> Dict[str, EvdevMatch]:
+        return self._application_matches
+
+    @application_matches.setter
+    def application_matches(self: "BaseDevice", data: Dict[str, EvdevMatch]) -> None:
+        self._application_matches = data
+
+    def get_evdev(self, application=None):
+        if application is None:
+            application = self.application
+
+        if len(self.input_nodes) == 0:
+            return None
+
+        assert self._input_nodes is not None
+
+        if len(self._input_nodes) == 1:
+            evdev = self._input_nodes[0]
+            if self.match_evdev_rule(application, evdev.libevdev):
+                return evdev.libevdev
+        else:
+            for _evdev in self._input_nodes:
+                if _evdev.matches_application(application, self.application_matches):
+                    if self.match_evdev_rule(application, _evdev.libevdev):
+                        return _evdev.libevdev
+
+    def is_ready(self):
+        """Returns whether a UHID device is ready. Can be overwritten in
+        subclasses to add extra conditions on when to consider a UHID
+        device ready. This can be:
+
+        - we need to wait on different types of input devices to be ready
+          (Touch Screen and Pen for example)
+        - we need to have at least 4 LEDs present
+          (len(self.uhdev.leds_classes) == 4)
+        - or any other combinations"""
+        return self.kernel_is_ready
diff --git a/tools/testing/selftests/hid/tests/base_gamepad.py b/tools/testing/selftests/hid/tests/base_gamepad.py
new file mode 100644
index 000000000000..ec74d75767a2
--- /dev/null
+++ b/tools/testing/selftests/hid/tests/base_gamepad.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: GPL-2.0
+import libevdev
+
+from .base_device import BaseDevice
+from hidtools.util import BusType
+
+
+class InvalidHIDCommunication(Exception):
+    pass
+
+
+class GamepadData(object):
+    pass
+
+
+class AxisMapping(object):
+    """Represents a mapping between a HID type
+    and an evdev event"""
+
+    def __init__(self, hid, evdev=None):
+        self.hid = hid.lower()
+
+        if evdev is None:
+            evdev = f"ABS_{hid.upper()}"
+
+        self.evdev = libevdev.evbit("EV_ABS", evdev)
+
+
+class BaseGamepad(BaseDevice):
+    buttons_map = {
+        1: "BTN_SOUTH",
+        2: "BTN_EAST",
+        3: "BTN_C",
+        4: "BTN_NORTH",
+        5: "BTN_WEST",
+        6: "BTN_Z",
+        7: "BTN_TL",
+        8: "BTN_TR",
+        9: "BTN_TL2",
+        10: "BTN_TR2",
+        11: "BTN_SELECT",
+        12: "BTN_START",
+        13: "BTN_MODE",
+        14: "BTN_THUMBL",
+        15: "BTN_THUMBR",
+    }
+
+    axes_map = {
+        "left_stick": {
+            "x": AxisMapping("x"),
+            "y": AxisMapping("y"),
+        },
+        "right_stick": {
+            "x": AxisMapping("z"),
+            "y": AxisMapping("Rz"),
+        },
+    }
+
+    def __init__(self, rdesc, application="Game Pad", name=None, input_info=None):
+        assert rdesc is not None
+        super().__init__(name, application, input_info=input_info, rdesc=rdesc)
+        self.buttons = (1, 2, 3)
+        self._buttons = {}
+        self.left = (127, 127)
+        self.right = (127, 127)
+        self.hat_switch = 15
+        assert self.parsed_rdesc is not None
+
+        self.fields = []
+        for r in self.parsed_rdesc.input_reports.values():
+            if r.application_name == self.application:
+                self.fields.extend([f.usage_name for f in r])
+
+    def store_axes(self, which, gamepad, data):
+        amap = self.axes_map[which]
+        x, y = data
+        setattr(gamepad, amap["x"].hid, x)
+        setattr(gamepad, amap["y"].hid, y)
+
+    def create_report(
+        self,
+        *,
+        left=(None, None),
+        right=(None, None),
+        hat_switch=None,
+        buttons=None,
+        reportID=None,
+        application="Game Pad",
+    ):
+        """
+        Return an input report for this device.
+
+        :param left: a tuple of absolute (x, y) value of the left joypad
+            where ``None`` is "leave unchanged"
+        :param right: a tuple of absolute (x, y) value of the right joypad
+            where ``None`` is "leave unchanged"
+        :param hat_switch: an absolute angular value of the hat switch
+            (expressed in 1/8 of circle, 0 being North, 2 East)
+            where ``None`` is "leave unchanged"
+        :param buttons: a dict of index/bool for the button states,
+            where ``None`` is "leave unchanged"
+        :param reportID: the numeric report ID for this report, if needed
+        :param application: the application used to report the values
+        """
+        if buttons is not None:
+            for i, b in buttons.items():
+                if i not in self.buttons:
+                    raise InvalidHIDCommunication(
+                        f"button {i} is not part of this {self.application}"
+                    )
+                if b is not None:
+                    self._buttons[i] = b
+
+        def replace_none_in_tuple(item, default):
+            if item is None:
+                item = (None, None)
+
+            if None in item:
+                if item[0] is None:
+                    item = (default[0], item[1])
+                if item[1] is None:
+                    item = (item[0], default[1])
+
+            return item
+
+        right = replace_none_in_tuple(right, self.right)
+        self.right = right
+        left = replace_none_in_tuple(left, self.left)
+        self.left = left
+
+        if hat_switch is None:
+            hat_switch = self.hat_switch
+        else:
+            self.hat_switch = hat_switch
+
+        reportID = reportID or self.default_reportID
+
+        gamepad = GamepadData()
+        for i, b in self._buttons.items():
+            gamepad.__setattr__(f"b{i}", int(b) if b is not None else 0)
+
+        self.store_axes("left_stick", gamepad, left)
+        self.store_axes("right_stick", gamepad, right)
+        gamepad.hatswitch = hat_switch  # type: ignore  ### gamepad is by default empty
+        return super().create_report(
+            gamepad, reportID=reportID, application=application
+        )
+
+    def event(
+        self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None
+    ):
+        """
+        Send an input event on the default report ID.
+
+        :param left: a tuple of absolute (x, y) value of the left joypad
+            where ``None`` is "leave unchanged"
+        :param right: a tuple of absolute (x, y) value of the right joypad
+            where ``None`` is "leave unchanged"
+        :param hat_switch: an absolute angular value of the hat switch
+            where ``None`` is "leave unchanged"
+        :param buttons: a dict of index/bool for the button states,
+            where ``None`` is "leave unchanged"
+        """
+        r = self.create_report(
+            left=left, right=right, hat_switch=hat_switch, buttons=buttons
+        )
+        self.call_input_event(r)
+        return [r]
+
+
+class JoystickGamepad(BaseGamepad):
+    buttons_map = {
+        1: "BTN_TRIGGER",
+        2: "BTN_THUMB",
+        3: "BTN_THUMB2",
+        4: "BTN_TOP",
+        5: "BTN_TOP2",
+        6: "BTN_PINKIE",
+        7: "BTN_BASE",
+        8: "BTN_BASE2",
+        9: "BTN_BASE3",
+        10: "BTN_BASE4",
+        11: "BTN_BASE5",
+        12: "BTN_BASE6",
+        13: "BTN_DEAD",
+    }
+
+    axes_map = {
+        "left_stick": {
+            "x": AxisMapping("x"),
+            "y": AxisMapping("y"),
+        },
+        "right_stick": {
+            "x": AxisMapping("rudder"),
+            "y": AxisMapping("throttle"),
+        },
+    }
+
+    def __init__(self, rdesc, application="Joystick", name=None, input_info=None):
+        super().__init__(rdesc, application, name, input_info)
+
+    def create_report(
+        self,
+        *,
+        left=(None, None),
+        right=(None, None),
+        hat_switch=None,
+        buttons=None,
+        reportID=None,
+        application=None,
+    ):
+        """
+        Return an input report for this device.
+
+        :param left: a tuple of absolute (x, y) value of the left joypad
+            where ``None`` is "leave unchanged"
+        :param right: a tuple of absolute (x, y) value of the right joypad
+            where ``None`` is "leave unchanged"
+        :param hat_switch: an absolute angular value of the hat switch
+            where ``None`` is "leave unchanged"
+        :param buttons: a dict of index/bool for the button states,
+            where ``None`` is "leave unchanged"
+        :param reportID: the numeric report ID for this report, if needed
+        :param application: the application for this report, if needed
+        """
+        if application is None:
+            application = "Joystick"
+        return super().create_report(
+            left=left,
+            right=right,
+            hat_switch=hat_switch,
+            buttons=buttons,
+            reportID=reportID,
+            application=application,
+        )
+
+    def store_right_joystick(self, gamepad, data):
+        gamepad.rudder, gamepad.throttle = data
diff --git a/tools/testing/selftests/hid/tests/test_gamepad.py b/tools/testing/selftests/hid/tests/test_gamepad.py
index 26c74040b796..8d5b5ffdae49 100644
--- a/tools/testing/selftests/hid/tests/test_gamepad.py
+++ b/tools/testing/selftests/hid/tests/test_gamepad.py
@@ -10,7 +10,8 @@ from . import base
 import libevdev
 import pytest
 
-from hidtools.device.base_gamepad import AsusGamepad, SaitekGamepad
+from .base_gamepad import BaseGamepad, JoystickGamepad, AxisMapping
+from hidtools.util import BusType
 
 import logging
 
@@ -199,6 +200,449 @@ class BaseTest:
             )
 
 
+class SaitekGamepad(JoystickGamepad):
+    # fmt: off
+    report_descriptor = [
+        0x05, 0x01,                    # Usage Page (Generic Desktop)        0
+        0x09, 0x04,                    # Usage (Joystick)                    2
+        0xa1, 0x01,                    # Collection (Application)            4
+        0x09, 0x01,                    # .Usage (Pointer)                    6
+        0xa1, 0x00,                    # .Collection (Physical)              8
+        0x85, 0x01,                    # ..Report ID (1)                     10
+        0x09, 0x30,                    # ..Usage (X)                         12
+        0x15, 0x00,                    # ..Logical Minimum (0)               14
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             16
+        0x35, 0x00,                    # ..Physical Minimum (0)              19
+        0x46, 0xff, 0x00,              # ..Physical Maximum (255)            21
+        0x75, 0x08,                    # ..Report Size (8)                   24
+        0x95, 0x01,                    # ..Report Count (1)                  26
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              28
+        0x09, 0x31,                    # ..Usage (Y)                         30
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              32
+        0x05, 0x02,                    # ..Usage Page (Simulation Controls)  34
+        0x09, 0xba,                    # ..Usage (Rudder)                    36
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              38
+        0x09, 0xbb,                    # ..Usage (Throttle)                  40
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              42
+        0x05, 0x09,                    # ..Usage Page (Button)               44
+        0x19, 0x01,                    # ..Usage Minimum (1)                 46
+        0x29, 0x0c,                    # ..Usage Maximum (12)                48
+        0x25, 0x01,                    # ..Logical Maximum (1)               50
+        0x45, 0x01,                    # ..Physical Maximum (1)              52
+        0x75, 0x01,                    # ..Report Size (1)                   54
+        0x95, 0x0c,                    # ..Report Count (12)                 56
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              58
+        0x95, 0x01,                    # ..Report Count (1)                  60
+        0x75, 0x00,                    # ..Report Size (0)                   62
+        0x81, 0x03,                    # ..Input (Cnst,Var,Abs)              64
+        0x05, 0x01,                    # ..Usage Page (Generic Desktop)      66
+        0x09, 0x39,                    # ..Usage (Hat switch)                68
+        0x25, 0x07,                    # ..Logical Maximum (7)               70
+        0x46, 0x3b, 0x01,              # ..Physical Maximum (315)            72
+        0x55, 0x00,                    # ..Unit Exponent (0)                 75
+        0x65, 0x44,                    # ..Unit (Degrees^4,EngRotation)      77
+        0x75, 0x04,                    # ..Report Size (4)                   79
+        0x81, 0x42,                    # ..Input (Data,Var,Abs,Null)         81
+        0x65, 0x00,                    # ..Unit (None)                       83
+        0xc0,                          # .End Collection                     85
+        0x05, 0x0f,                    # .Usage Page (Vendor Usage Page 0x0f) 86
+        0x09, 0x92,                    # .Usage (Vendor Usage 0x92)          88
+        0xa1, 0x02,                    # .Collection (Logical)               90
+        0x85, 0x02,                    # ..Report ID (2)                     92
+        0x09, 0xa0,                    # ..Usage (Vendor Usage 0xa0)         94
+        0x09, 0x9f,                    # ..Usage (Vendor Usage 0x9f)         96
+        0x25, 0x01,                    # ..Logical Maximum (1)               98
+        0x45, 0x00,                    # ..Physical Maximum (0)              100
+        0x75, 0x01,                    # ..Report Size (1)                   102
+        0x95, 0x02,                    # ..Report Count (2)                  104
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              106
+        0x75, 0x06,                    # ..Report Size (6)                   108
+        0x95, 0x01,                    # ..Report Count (1)                  110
+        0x81, 0x03,                    # ..Input (Cnst,Var,Abs)              112
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         114
+        0x75, 0x07,                    # ..Report Size (7)                   116
+        0x25, 0x7f,                    # ..Logical Maximum (127)             118
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              120
+        0x09, 0x94,                    # ..Usage (Vendor Usage 0x94)         122
+        0x75, 0x01,                    # ..Report Size (1)                   124
+        0x25, 0x01,                    # ..Logical Maximum (1)               126
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              128
+        0xc0,                          # .End Collection                     130
+        0x09, 0x21,                    # .Usage (Vendor Usage 0x21)          131
+        0xa1, 0x02,                    # .Collection (Logical)               133
+        0x85, 0x0b,                    # ..Report ID (11)                    135
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         137
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             139
+        0x75, 0x08,                    # ..Report Size (8)                   142
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             144
+        0x09, 0x53,                    # ..Usage (Vendor Usage 0x53)         146
+        0x25, 0x0a,                    # ..Logical Maximum (10)              148
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             150
+        0x09, 0x50,                    # ..Usage (Vendor Usage 0x50)         152
+        0x27, 0xfe, 0xff, 0x00, 0x00,  # ..Logical Maximum (65534)           154
+        0x47, 0xfe, 0xff, 0x00, 0x00,  # ..Physical Maximum (65534)          159
+        0x75, 0x10,                    # ..Report Size (16)                  164
+        0x55, 0xfd,                    # ..Unit Exponent (237)               166
+        0x66, 0x01, 0x10,              # ..Unit (Seconds,SILinear)           168
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             171
+        0x55, 0x00,                    # ..Unit Exponent (0)                 173
+        0x65, 0x00,                    # ..Unit (None)                       175
+        0x09, 0x54,                    # ..Usage (Vendor Usage 0x54)         177
+        0x55, 0xfd,                    # ..Unit Exponent (237)               179
+        0x66, 0x01, 0x10,              # ..Unit (Seconds,SILinear)           181
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             184
+        0x55, 0x00,                    # ..Unit Exponent (0)                 186
+        0x65, 0x00,                    # ..Unit (None)                       188
+        0x09, 0xa7,                    # ..Usage (Vendor Usage 0xa7)         190
+        0x55, 0xfd,                    # ..Unit Exponent (237)               192
+        0x66, 0x01, 0x10,              # ..Unit (Seconds,SILinear)           194
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             197
+        0x55, 0x00,                    # ..Unit Exponent (0)                 199
+        0x65, 0x00,                    # ..Unit (None)                       201
+        0xc0,                          # .End Collection                     203
+        0x09, 0x5a,                    # .Usage (Vendor Usage 0x5a)          204
+        0xa1, 0x02,                    # .Collection (Logical)               206
+        0x85, 0x0c,                    # ..Report ID (12)                    208
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         210
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             212
+        0x45, 0x00,                    # ..Physical Maximum (0)              215
+        0x75, 0x08,                    # ..Report Size (8)                   217
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             219
+        0x09, 0x5c,                    # ..Usage (Vendor Usage 0x5c)         221
+        0x26, 0x10, 0x27,              # ..Logical Maximum (10000)           223
+        0x46, 0x10, 0x27,              # ..Physical Maximum (10000)          226
+        0x75, 0x10,                    # ..Report Size (16)                  229
+        0x55, 0xfd,                    # ..Unit Exponent (237)               231
+        0x66, 0x01, 0x10,              # ..Unit (Seconds,SILinear)           233
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             236
+        0x55, 0x00,                    # ..Unit Exponent (0)                 238
+        0x65, 0x00,                    # ..Unit (None)                       240
+        0x09, 0x5b,                    # ..Usage (Vendor Usage 0x5b)         242
+        0x25, 0x7f,                    # ..Logical Maximum (127)             244
+        0x75, 0x08,                    # ..Report Size (8)                   246
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             248
+        0x09, 0x5e,                    # ..Usage (Vendor Usage 0x5e)         250
+        0x26, 0x10, 0x27,              # ..Logical Maximum (10000)           252
+        0x75, 0x10,                    # ..Report Size (16)                  255
+        0x55, 0xfd,                    # ..Unit Exponent (237)               257
+        0x66, 0x01, 0x10,              # ..Unit (Seconds,SILinear)           259
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             262
+        0x55, 0x00,                    # ..Unit Exponent (0)                 264
+        0x65, 0x00,                    # ..Unit (None)                       266
+        0x09, 0x5d,                    # ..Usage (Vendor Usage 0x5d)         268
+        0x25, 0x7f,                    # ..Logical Maximum (127)             270
+        0x75, 0x08,                    # ..Report Size (8)                   272
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             274
+        0xc0,                          # .End Collection                     276
+        0x09, 0x73,                    # .Usage (Vendor Usage 0x73)          277
+        0xa1, 0x02,                    # .Collection (Logical)               279
+        0x85, 0x0d,                    # ..Report ID (13)                    281
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         283
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             285
+        0x45, 0x00,                    # ..Physical Maximum (0)              288
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             290
+        0x09, 0x70,                    # ..Usage (Vendor Usage 0x70)         292
+        0x15, 0x81,                    # ..Logical Minimum (-127)            294
+        0x25, 0x7f,                    # ..Logical Maximum (127)             296
+        0x36, 0xf0, 0xd8,              # ..Physical Minimum (-10000)         298
+        0x46, 0x10, 0x27,              # ..Physical Maximum (10000)          301
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             304
+        0xc0,                          # .End Collection                     306
+        0x09, 0x6e,                    # .Usage (Vendor Usage 0x6e)          307
+        0xa1, 0x02,                    # .Collection (Logical)               309
+        0x85, 0x0e,                    # ..Report ID (14)                    311
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         313
+        0x15, 0x00,                    # ..Logical Minimum (0)               315
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             317
+        0x35, 0x00,                    # ..Physical Minimum (0)              320
+        0x45, 0x00,                    # ..Physical Maximum (0)              322
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             324
+        0x09, 0x70,                    # ..Usage (Vendor Usage 0x70)         326
+        0x25, 0x7f,                    # ..Logical Maximum (127)             328
+        0x46, 0x10, 0x27,              # ..Physical Maximum (10000)          330
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             333
+        0x09, 0x6f,                    # ..Usage (Vendor Usage 0x6f)         335
+        0x15, 0x81,                    # ..Logical Minimum (-127)            337
+        0x36, 0xf0, 0xd8,              # ..Physical Minimum (-10000)         339
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             342
+        0x09, 0x71,                    # ..Usage (Vendor Usage 0x71)         344
+        0x15, 0x00,                    # ..Logical Minimum (0)               346
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             348
+        0x35, 0x00,                    # ..Physical Minimum (0)              351
+        0x46, 0x68, 0x01,              # ..Physical Maximum (360)            353
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             356
+        0x09, 0x72,                    # ..Usage (Vendor Usage 0x72)         358
+        0x75, 0x10,                    # ..Report Size (16)                  360
+        0x26, 0x10, 0x27,              # ..Logical Maximum (10000)           362
+        0x46, 0x10, 0x27,              # ..Physical Maximum (10000)          365
+        0x55, 0xfd,                    # ..Unit Exponent (237)               368
+        0x66, 0x01, 0x10,              # ..Unit (Seconds,SILinear)           370
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             373
+        0x55, 0x00,                    # ..Unit Exponent (0)                 375
+        0x65, 0x00,                    # ..Unit (None)                       377
+        0xc0,                          # .End Collection                     379
+        0x09, 0x77,                    # .Usage (Vendor Usage 0x77)          380
+        0xa1, 0x02,                    # .Collection (Logical)               382
+        0x85, 0x51,                    # ..Report ID (81)                    384
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         386
+        0x25, 0x7f,                    # ..Logical Maximum (127)             388
+        0x45, 0x00,                    # ..Physical Maximum (0)              390
+        0x75, 0x08,                    # ..Report Size (8)                   392
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             394
+        0x09, 0x78,                    # ..Usage (Vendor Usage 0x78)         396
+        0xa1, 0x02,                    # ..Collection (Logical)              398
+        0x09, 0x7b,                    # ...Usage (Vendor Usage 0x7b)        400
+        0x09, 0x79,                    # ...Usage (Vendor Usage 0x79)        402
+        0x09, 0x7a,                    # ...Usage (Vendor Usage 0x7a)        404
+        0x15, 0x01,                    # ...Logical Minimum (1)              406
+        0x25, 0x03,                    # ...Logical Maximum (3)              408
+        0x91, 0x00,                    # ...Output (Data,Arr,Abs)            410
+        0xc0,                          # ..End Collection                    412
+        0x09, 0x7c,                    # ..Usage (Vendor Usage 0x7c)         413
+        0x15, 0x00,                    # ..Logical Minimum (0)               415
+        0x26, 0xfe, 0x00,              # ..Logical Maximum (254)             417
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             420
+        0xc0,                          # .End Collection                     422
+        0x09, 0x92,                    # .Usage (Vendor Usage 0x92)          423
+        0xa1, 0x02,                    # .Collection (Logical)               425
+        0x85, 0x52,                    # ..Report ID (82)                    427
+        0x09, 0x96,                    # ..Usage (Vendor Usage 0x96)         429
+        0xa1, 0x02,                    # ..Collection (Logical)              431
+        0x09, 0x9a,                    # ...Usage (Vendor Usage 0x9a)        433
+        0x09, 0x99,                    # ...Usage (Vendor Usage 0x99)        435
+        0x09, 0x97,                    # ...Usage (Vendor Usage 0x97)        437
+        0x09, 0x98,                    # ...Usage (Vendor Usage 0x98)        439
+        0x09, 0x9b,                    # ...Usage (Vendor Usage 0x9b)        441
+        0x09, 0x9c,                    # ...Usage (Vendor Usage 0x9c)        443
+        0x15, 0x01,                    # ...Logical Minimum (1)              445
+        0x25, 0x06,                    # ...Logical Maximum (6)              447
+        0x91, 0x00,                    # ...Output (Data,Arr,Abs)            449
+        0xc0,                          # ..End Collection                    451
+        0xc0,                          # .End Collection                     452
+        0x05, 0xff,                    # .Usage Page (Vendor Usage Page 0xff) 453
+        0x0a, 0x01, 0x03,              # .Usage (Vendor Usage 0x301)         455
+        0xa1, 0x02,                    # .Collection (Logical)               458
+        0x85, 0x40,                    # ..Report ID (64)                    460
+        0x0a, 0x02, 0x03,              # ..Usage (Vendor Usage 0x302)        462
+        0xa1, 0x02,                    # ..Collection (Logical)              465
+        0x1a, 0x11, 0x03,              # ...Usage Minimum (785)              467
+        0x2a, 0x20, 0x03,              # ...Usage Maximum (800)              470
+        0x25, 0x10,                    # ...Logical Maximum (16)             473
+        0x91, 0x00,                    # ...Output (Data,Arr,Abs)            475
+        0xc0,                          # ..End Collection                    477
+        0x0a, 0x03, 0x03,              # ..Usage (Vendor Usage 0x303)        478
+        0x15, 0x00,                    # ..Logical Minimum (0)               481
+        0x27, 0xff, 0xff, 0x00, 0x00,  # ..Logical Maximum (65535)           483
+        0x75, 0x10,                    # ..Report Size (16)                  488
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             490
+        0xc0,                          # .End Collection                     492
+        0x05, 0x0f,                    # .Usage Page (Vendor Usage Page 0x0f) 493
+        0x09, 0x7d,                    # .Usage (Vendor Usage 0x7d)          495
+        0xa1, 0x02,                    # .Collection (Logical)               497
+        0x85, 0x43,                    # ..Report ID (67)                    499
+        0x09, 0x7e,                    # ..Usage (Vendor Usage 0x7e)         501
+        0x26, 0x80, 0x00,              # ..Logical Maximum (128)             503
+        0x46, 0x10, 0x27,              # ..Physical Maximum (10000)          506
+        0x75, 0x08,                    # ..Report Size (8)                   509
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             511
+        0xc0,                          # .End Collection                     513
+        0x09, 0x7f,                    # .Usage (Vendor Usage 0x7f)          514
+        0xa1, 0x02,                    # .Collection (Logical)               516
+        0x85, 0x0b,                    # ..Report ID (11)                    518
+        0x09, 0x80,                    # ..Usage (Vendor Usage 0x80)         520
+        0x26, 0xff, 0x7f,              # ..Logical Maximum (32767)           522
+        0x45, 0x00,                    # ..Physical Maximum (0)              525
+        0x75, 0x0f,                    # ..Report Size (15)                  527
+        0xb1, 0x03,                    # ..Feature (Cnst,Var,Abs)            529
+        0x09, 0xa9,                    # ..Usage (Vendor Usage 0xa9)         531
+        0x25, 0x01,                    # ..Logical Maximum (1)               533
+        0x75, 0x01,                    # ..Report Size (1)                   535
+        0xb1, 0x03,                    # ..Feature (Cnst,Var,Abs)            537
+        0x09, 0x83,                    # ..Usage (Vendor Usage 0x83)         539
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             541
+        0x75, 0x08,                    # ..Report Size (8)                   544
+        0xb1, 0x03,                    # ..Feature (Cnst,Var,Abs)            546
+        0xc0,                          # .End Collection                     548
+        0x09, 0xab,                    # .Usage (Vendor Usage 0xab)          549
+        0xa1, 0x03,                    # .Collection (Report)                551
+        0x85, 0x15,                    # ..Report ID (21)                    553
+        0x09, 0x25,                    # ..Usage (Vendor Usage 0x25)         555
+        0xa1, 0x02,                    # ..Collection (Logical)              557
+        0x09, 0x26,                    # ...Usage (Vendor Usage 0x26)        559
+        0x09, 0x30,                    # ...Usage (Vendor Usage 0x30)        561
+        0x09, 0x32,                    # ...Usage (Vendor Usage 0x32)        563
+        0x09, 0x31,                    # ...Usage (Vendor Usage 0x31)        565
+        0x09, 0x33,                    # ...Usage (Vendor Usage 0x33)        567
+        0x09, 0x34,                    # ...Usage (Vendor Usage 0x34)        569
+        0x15, 0x01,                    # ...Logical Minimum (1)              571
+        0x25, 0x06,                    # ...Logical Maximum (6)              573
+        0xb1, 0x00,                    # ...Feature (Data,Arr,Abs)           575
+        0xc0,                          # ..End Collection                    577
+        0xc0,                          # .End Collection                     578
+        0x09, 0x89,                    # .Usage (Vendor Usage 0x89)          579
+        0xa1, 0x03,                    # .Collection (Report)                581
+        0x85, 0x16,                    # ..Report ID (22)                    583
+        0x09, 0x8b,                    # ..Usage (Vendor Usage 0x8b)         585
+        0xa1, 0x02,                    # ..Collection (Logical)              587
+        0x09, 0x8c,                    # ...Usage (Vendor Usage 0x8c)        589
+        0x09, 0x8d,                    # ...Usage (Vendor Usage 0x8d)        591
+        0x09, 0x8e,                    # ...Usage (Vendor Usage 0x8e)        593
+        0x25, 0x03,                    # ...Logical Maximum (3)              595
+        0xb1, 0x00,                    # ...Feature (Data,Arr,Abs)           597
+        0xc0,                          # ..End Collection                    599
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         600
+        0x15, 0x00,                    # ..Logical Minimum (0)               602
+        0x26, 0xfe, 0x00,              # ..Logical Maximum (254)             604
+        0xb1, 0x02,                    # ..Feature (Data,Var,Abs)            607
+        0xc0,                          # .End Collection                     609
+        0x09, 0x90,                    # .Usage (Vendor Usage 0x90)          610
+        0xa1, 0x03,                    # .Collection (Report)                612
+        0x85, 0x50,                    # ..Report ID (80)                    614
+        0x09, 0x22,                    # ..Usage (Vendor Usage 0x22)         616
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             618
+        0x91, 0x02,                    # ..Output (Data,Var,Abs)             621
+        0xc0,                          # .End Collection                     623
+        0xc0,                          # End Collection                      624
+    ]
+    # fmt: on
+
+    def __init__(self, rdesc=report_descriptor, name=None):
+        super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x06A3, 0xFF0D))
+        self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
+
+
+class AsusGamepad(BaseGamepad):
+    # fmt: off
+    report_descriptor = [
+        0x05, 0x01,                    # Usage Page (Generic Desktop)        0
+        0x09, 0x05,                    # Usage (Game Pad)                    2
+        0xa1, 0x01,                    # Collection (Application)            4
+        0x85, 0x01,                    # .Report ID (1)                      6
+        0x05, 0x09,                    # .Usage Page (Button)                8
+        0x0a, 0x01, 0x00,              # .Usage (Vendor Usage 0x01)          10
+        0x0a, 0x02, 0x00,              # .Usage (Vendor Usage 0x02)          13
+        0x0a, 0x04, 0x00,              # .Usage (Vendor Usage 0x04)          16
+        0x0a, 0x05, 0x00,              # .Usage (Vendor Usage 0x05)          19
+        0x0a, 0x07, 0x00,              # .Usage (Vendor Usage 0x07)          22
+        0x0a, 0x08, 0x00,              # .Usage (Vendor Usage 0x08)          25
+        0x0a, 0x0e, 0x00,              # .Usage (Vendor Usage 0x0e)          28
+        0x0a, 0x0f, 0x00,              # .Usage (Vendor Usage 0x0f)          31
+        0x0a, 0x0d, 0x00,              # .Usage (Vendor Usage 0x0d)          34
+        0x05, 0x0c,                    # .Usage Page (Consumer Devices)      37
+        0x0a, 0x24, 0x02,              # .Usage (AC Back)                    39
+        0x0a, 0x23, 0x02,              # .Usage (AC Home)                    42
+        0x15, 0x00,                    # .Logical Minimum (0)                45
+        0x25, 0x01,                    # .Logical Maximum (1)                47
+        0x75, 0x01,                    # .Report Size (1)                    49
+        0x95, 0x0b,                    # .Report Count (11)                  51
+        0x81, 0x02,                    # .Input (Data,Var,Abs)               53
+        0x75, 0x01,                    # .Report Size (1)                    55
+        0x95, 0x01,                    # .Report Count (1)                   57
+        0x81, 0x03,                    # .Input (Cnst,Var,Abs)               59
+        0x05, 0x01,                    # .Usage Page (Generic Desktop)       61
+        0x75, 0x04,                    # .Report Size (4)                    63
+        0x95, 0x01,                    # .Report Count (1)                   65
+        0x25, 0x07,                    # .Logical Maximum (7)                67
+        0x46, 0x3b, 0x01,              # .Physical Maximum (315)             69
+        0x66, 0x14, 0x00,              # .Unit (Degrees,EngRotation)         72
+        0x09, 0x39,                    # .Usage (Hat switch)                 75
+        0x81, 0x42,                    # .Input (Data,Var,Abs,Null)          77
+        0x66, 0x00, 0x00,              # .Unit (None)                        79
+        0x09, 0x01,                    # .Usage (Pointer)                    82
+        0xa1, 0x00,                    # .Collection (Physical)              84
+        0x09, 0x30,                    # ..Usage (X)                         86
+        0x09, 0x31,                    # ..Usage (Y)                         88
+        0x09, 0x32,                    # ..Usage (Z)                         90
+        0x09, 0x35,                    # ..Usage (Rz)                        92
+        0x05, 0x02,                    # ..Usage Page (Simulation Controls)  94
+        0x09, 0xc5,                    # ..Usage (Brake)                     96
+        0x09, 0xc4,                    # ..Usage (Accelerator)               98
+        0x15, 0x00,                    # ..Logical Minimum (0)               100
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             102
+        0x35, 0x00,                    # ..Physical Minimum (0)              105
+        0x46, 0xff, 0x00,              # ..Physical Maximum (255)            107
+        0x75, 0x08,                    # ..Report Size (8)                   110
+        0x95, 0x06,                    # ..Report Count (6)                  112
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              114
+        0xc0,                          # .End Collection                     116
+        0x85, 0x02,                    # .Report ID (2)                      117
+        0x05, 0x08,                    # .Usage Page (LEDs)                  119
+        0x0a, 0x01, 0x00,              # .Usage (Num Lock)                   121
+        0x0a, 0x02, 0x00,              # .Usage (Caps Lock)                  124
+        0x0a, 0x03, 0x00,              # .Usage (Scroll Lock)                127
+        0x0a, 0x04, 0x00,              # .Usage (Compose)                    130
+        0x15, 0x00,                    # .Logical Minimum (0)                133
+        0x25, 0x01,                    # .Logical Maximum (1)                135
+        0x75, 0x01,                    # .Report Size (1)                    137
+        0x95, 0x04,                    # .Report Count (4)                   139
+        0x91, 0x02,                    # .Output (Data,Var,Abs)              141
+        0x75, 0x04,                    # .Report Size (4)                    143
+        0x95, 0x01,                    # .Report Count (1)                   145
+        0x91, 0x03,                    # .Output (Cnst,Var,Abs)              147
+        0xc0,                          # End Collection                      149
+        0x05, 0x0c,                    # Usage Page (Consumer Devices)       150
+        0x09, 0x01,                    # Usage (Consumer Control)            152
+        0xa1, 0x01,                    # Collection (Application)            154
+        0x85, 0x03,                    # .Report ID (3)                      156
+        0x05, 0x01,                    # .Usage Page (Generic Desktop)       158
+        0x09, 0x06,                    # .Usage (Keyboard)                   160
+        0xa1, 0x02,                    # .Collection (Logical)               162
+        0x05, 0x06,                    # ..Usage Page (Generic Device Controls) 164
+        0x09, 0x20,                    # ..Usage (Battery Strength)          166
+        0x15, 0x00,                    # ..Logical Minimum (0)               168
+        0x26, 0xff, 0x00,              # ..Logical Maximum (255)             170
+        0x75, 0x08,                    # ..Report Size (8)                   173
+        0x95, 0x01,                    # ..Report Count (1)                  175
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              177
+        0x06, 0xbc, 0xff,              # ..Usage Page (Vendor Usage Page 0xffbc) 179
+        0x0a, 0xad, 0xbd,              # ..Usage (Vendor Usage 0xbdad)       182
+        0x75, 0x08,                    # ..Report Size (8)                   185
+        0x95, 0x06,                    # ..Report Count (6)                  187
+        0x81, 0x02,                    # ..Input (Data,Var,Abs)              189
+        0xc0,                          # .End Collection                     191
+        0xc0,                          # End Collection                      192
+    ]
+    # fmt: on
+
+    def __init__(self, rdesc=report_descriptor, name=None):
+        super().__init__(rdesc, name=name, input_info=(BusType.USB, 0x18D1, 0x2C40))
+        self.buttons = (1, 2, 4, 5, 7, 8, 14, 15, 13)
+
+
+class RaptorMach2Joystick(JoystickGamepad):
+    axes_map = {
+        "left_stick": {
+            "x": AxisMapping("x"),
+            "y": AxisMapping("y"),
+        },
+        "right_stick": {
+            "x": AxisMapping("z"),
+            "y": AxisMapping("Rz"),
+        },
+    }
+
+    def __init__(
+        self,
+        name,
+        rdesc=None,
+        application="Joystick",
+        input_info=(BusType.USB, 0x11C0, 0x5606),
+    ):
+        super().__init__(rdesc, application, name, input_info)
+        self.buttons = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
+        self.hat_switch = 240  # null value is 240 as max is 239
+
+    def event(
+        self, *, left=(None, None), right=(None, None), hat_switch=None, buttons=None
+    ):
+        if hat_switch is not None:
+            hat_switch *= 30
+
+        return super().event(
+            left=left, right=right, hat_switch=hat_switch, buttons=buttons
+        )
+
+
 class TestSaitekGamepad(BaseTest.TestGamepad):
     def create_device(self):
         return SaitekGamepad()
@@ -207,3 +651,14 @@ class TestSaitekGamepad(BaseTest.TestGamepad):
 class TestAsusGamepad(BaseTest.TestGamepad):
     def create_device(self):
         return AsusGamepad()
+
+
+class TestRaptorMach2Joystick(BaseTest.TestGamepad):
+    hid_bpfs = [("FR-TEC__Raptor-Mach-2.bpf.o", True)]
+
+    def create_device(self):
+        return RaptorMach2Joystick(
+            "uhid test Sanmos Group FR-TEC Raptor MACH 2",
+            rdesc="05 01 09 04 a1 01 05 01 85 01 05 01 09 30 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 31 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 33 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 00 09 00 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 32 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 35 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 01 09 34 75 10 95 01 15 00 26 ff 07 46 ff 07 81 02 05 01 09 36 75 10 95 01 15 00 26 ff 03 46 ff 03 81 02 05 09 19 01 2a 1d 00 15 00 25 01 75 01 96 80 00 81 02 05 01 09 39 26 ef 00 46 68 01 65 14 75 10 95 01 81 42 05 01 09 00 75 08 95 1d 81 01 15 00 26 ef 00 85 58 26 ff 00 46 ff 00 75 08 95 3f 09 00 91 02 85 59 75 08 95 80 09 00 b1 02 c0",
+            input_info=(BusType.USB, 0x11C0, 0x5606),
+        )
diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py
index 903f19f7cbe9..a9e2de1e8861 100644
--- a/tools/testing/selftests/hid/tests/test_tablet.py
+++ b/tools/testing/selftests/hid/tests/test_tablet.py
@@ -35,6 +35,7 @@ class BtnPressed(Enum):
 
     PRIMARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS
     SECONDARY_PRESSED = libevdev.EV_KEY.BTN_STYLUS2
+    THIRD_PRESSED = libevdev.EV_KEY.BTN_STYLUS3
 
 
 class PenState(Enum):
@@ -44,58 +45,28 @@ class PenState(Enum):
     We extend it with the various buttons when we need to check them.
     """
 
-    PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, None
-    PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, None
-    PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, BtnPressed.PRIMARY_PRESSED
-    PEN_IS_IN_RANGE_WITH_SECOND_BUTTON = (
-        BtnTouch.UP,
-        ToolType.PEN,
-        BtnPressed.SECONDARY_PRESSED,
-    )
-    PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, None
-    PEN_IS_IN_CONTACT_WITH_BUTTON = (
-        BtnTouch.DOWN,
-        ToolType.PEN,
-        BtnPressed.PRIMARY_PRESSED,
-    )
-    PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON = (
-        BtnTouch.DOWN,
-        ToolType.PEN,
-        BtnPressed.SECONDARY_PRESSED,
-    )
-    PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, None
-    PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = (
-        BtnTouch.UP,
-        ToolType.RUBBER,
-        BtnPressed.PRIMARY_PRESSED,
-    )
-    PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_SECOND_BUTTON = (
-        BtnTouch.UP,
-        ToolType.RUBBER,
-        BtnPressed.SECONDARY_PRESSED,
-    )
-    PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, None
-    PEN_IS_ERASING_WITH_BUTTON = (
-        BtnTouch.DOWN,
-        ToolType.RUBBER,
-        BtnPressed.PRIMARY_PRESSED,
-    )
-    PEN_IS_ERASING_WITH_SECOND_BUTTON = (
-        BtnTouch.DOWN,
-        ToolType.RUBBER,
-        BtnPressed.SECONDARY_PRESSED,
-    )
-
-    def __init__(self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[BtnPressed]):
+    PEN_IS_OUT_OF_RANGE = BtnTouch.UP, None, False
+    PEN_IS_IN_RANGE = BtnTouch.UP, ToolType.PEN, False
+    PEN_IS_IN_RANGE_WITH_BUTTON = BtnTouch.UP, ToolType.PEN, True
+    PEN_IS_IN_CONTACT = BtnTouch.DOWN, ToolType.PEN, False
+    PEN_IS_IN_CONTACT_WITH_BUTTON = BtnTouch.DOWN, ToolType.PEN, True
+    PEN_IS_IN_RANGE_WITH_ERASING_INTENT = BtnTouch.UP, ToolType.RUBBER, False
+    PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON = BtnTouch.UP, ToolType.RUBBER, True
+    PEN_IS_ERASING = BtnTouch.DOWN, ToolType.RUBBER, False
+    PEN_IS_ERASING_WITH_BUTTON = BtnTouch.DOWN, ToolType.RUBBER, True
+
+    def __init__(
+        self, touch: BtnTouch, tool: Optional[ToolType], button: Optional[bool]
+    ):
         self.touch = touch  # type: ignore
         self.tool = tool  # type: ignore
         self.button = button  # type: ignore
 
     @classmethod
-    def from_evdev(cls, evdev) -> "PenState":
+    def from_evdev(cls, evdev, test_button) -> "PenState":
         touch = BtnTouch(evdev.value[libevdev.EV_KEY.BTN_TOUCH])
         tool = None
-        button = None
+        button = False
         if (
             evdev.value[libevdev.EV_KEY.BTN_TOOL_RUBBER]
             and not evdev.value[libevdev.EV_KEY.BTN_TOOL_PEN]
@@ -112,19 +83,20 @@ class PenState(Enum):
         ):
             raise ValueError("2 tools are not allowed")
 
-        # we take only the highest button in account
-        for b in [libevdev.EV_KEY.BTN_STYLUS, libevdev.EV_KEY.BTN_STYLUS2]:
-            if bool(evdev.value[b]):
-                button = BtnPressed(b)
+        # we take only the provided button into account
+        if test_button is not None:
+            button = bool(evdev.value[test_button.value])
 
         # the kernel tends to insert an EV_SYN once removing the tool, so
         # the button will be released after
         if tool is None:
-            button = None
+            button = False
 
         return cls((touch, tool, button))  # type: ignore
 
-    def apply(self, events: List[libevdev.InputEvent], strict: bool) -> "PenState":
+    def apply(
+        self, events: List[libevdev.InputEvent], strict: bool, test_button: BtnPressed
+    ) -> "PenState":
         if libevdev.EV_SYN.SYN_REPORT in events:
             raise ValueError("EV_SYN is in the event sequence")
         touch = self.touch
@@ -148,19 +120,16 @@ class PenState(Enum):
                     raise ValueError(f"duplicated BTN_TOOL_* in {events}")
                 tool_found = True
                 tool = ToolType(ev.code) if ev.value else None
-            elif ev in (
-                libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS),
-                libevdev.InputEvent(libevdev.EV_KEY.BTN_STYLUS2),
-            ):
+            elif test_button is not None and ev in (test_button.value,):
                 if button_found:
                     raise ValueError(f"duplicated BTN_STYLUS* in {events}")
                 button_found = True
-                button = BtnPressed(ev.code) if ev.value else None
+                button = bool(ev.value)
 
         # the kernel tends to insert an EV_SYN once removing the tool, so
         # the button will be released after
         if tool is None:
-            button = None
+            button = False
 
         new_state = PenState((touch, tool, button))  # type: ignore
         if strict:
@@ -183,11 +152,9 @@ class PenState(Enum):
                 PenState.PEN_IS_OUT_OF_RANGE,
                 PenState.PEN_IS_IN_RANGE,
                 PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT,
                 PenState.PEN_IS_IN_CONTACT,
                 PenState.PEN_IS_IN_CONTACT_WITH_BUTTON,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_ERASING,
             )
 
@@ -195,7 +162,6 @@ class PenState(Enum):
             return (
                 PenState.PEN_IS_IN_RANGE,
                 PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_OUT_OF_RANGE,
                 PenState.PEN_IS_IN_CONTACT,
             )
@@ -204,7 +170,6 @@ class PenState(Enum):
             return (
                 PenState.PEN_IS_IN_CONTACT,
                 PenState.PEN_IS_IN_CONTACT_WITH_BUTTON,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_IN_RANGE,
             )
 
@@ -236,21 +201,6 @@ class PenState(Enum):
                 PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
             )
 
-        if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON:
-            return (
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_OUT_OF_RANGE,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-            )
-
-        if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON:
-            return (
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_CONTACT,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-            )
-
         return tuple()
 
     def historically_tolerated_transitions(self) -> Tuple["PenState", ...]:
@@ -263,11 +213,9 @@ class PenState(Enum):
                 PenState.PEN_IS_OUT_OF_RANGE,
                 PenState.PEN_IS_IN_RANGE,
                 PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT,
                 PenState.PEN_IS_IN_CONTACT,
                 PenState.PEN_IS_IN_CONTACT_WITH_BUTTON,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_ERASING,
             )
 
@@ -275,7 +223,6 @@ class PenState(Enum):
             return (
                 PenState.PEN_IS_IN_RANGE,
                 PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_OUT_OF_RANGE,
                 PenState.PEN_IS_IN_CONTACT,
             )
@@ -284,7 +231,6 @@ class PenState(Enum):
             return (
                 PenState.PEN_IS_IN_CONTACT,
                 PenState.PEN_IS_IN_CONTACT_WITH_BUTTON,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
                 PenState.PEN_IS_IN_RANGE,
                 PenState.PEN_IS_OUT_OF_RANGE,
             )
@@ -319,22 +265,6 @@ class PenState(Enum):
                 PenState.PEN_IS_OUT_OF_RANGE,
             )
 
-        if self == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON:
-            return (
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_OUT_OF_RANGE,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-            )
-
-        if self == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON:
-            return (
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_CONTACT,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_OUT_OF_RANGE,
-            )
-
         return tuple()
 
     @staticmethod
@@ -402,9 +332,9 @@ class PenState(Enum):
         }
 
     @staticmethod
-    def legal_transitions_with_primary_button() -> Dict[str, Tuple["PenState", ...]]:
+    def legal_transitions_with_button() -> Dict[str, Tuple["PenState", ...]]:
         """We revisit the Windows Pen Implementation state machine:
-        we now have a primary button.
+        we now have a button.
         """
         return {
             "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_BUTTON,),
@@ -451,56 +381,6 @@ class PenState(Enum):
         }
 
     @staticmethod
-    def legal_transitions_with_secondary_button() -> Dict[str, Tuple["PenState", ...]]:
-        """We revisit the Windows Pen Implementation state machine:
-        we now have a secondary button.
-        Note: we don't looks for 2 buttons interactions.
-        """
-        return {
-            "hover-button": (PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,),
-            "hover-button -> out-of-range": (
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_OUT_OF_RANGE,
-            ),
-            "in-range -> button-press": (
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-            ),
-            "in-range -> button-press -> button-release": (
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE,
-            ),
-            "in-range -> touch -> button-press -> button-release": (
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_IN_CONTACT,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_CONTACT,
-            ),
-            "in-range -> touch -> button-press -> release -> button-release": (
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_IN_CONTACT,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE,
-            ),
-            "in-range -> button-press -> touch -> release -> button-release": (
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_RANGE,
-            ),
-            "in-range -> button-press -> touch -> button-release -> release": (
-                PenState.PEN_IS_IN_RANGE,
-                PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON,
-                PenState.PEN_IS_IN_CONTACT,
-                PenState.PEN_IS_IN_RANGE,
-            ),
-        }
-
-    @staticmethod
     def tolerated_transitions() -> Dict[str, Tuple["PenState", ...]]:
         """This is not adhering to the Windows Pen Implementation state machine
         but we should expect the kernel to behave properly, mostly for historical
@@ -616,10 +496,22 @@ class Pen(object):
             evdev.value[axis] == value
         ), f"assert evdev.value[{axis}] ({evdev.value[axis]}) != {value}"
 
-    def assert_expected_input_events(self, evdev):
+    def assert_expected_input_events(self, evdev, button):
         assert evdev.value[libevdev.EV_ABS.ABS_X] == self.x
         assert evdev.value[libevdev.EV_ABS.ABS_Y] == self.y
-        assert self.current_state == PenState.from_evdev(evdev)
+
+        # assert no other buttons than the tested ones are set
+        buttons = [
+            BtnPressed.PRIMARY_PRESSED,
+            BtnPressed.SECONDARY_PRESSED,
+            BtnPressed.THIRD_PRESSED,
+        ]
+        if button is not None:
+            buttons.remove(button)
+        for b in buttons:
+            assert evdev.value[b.value] is None or evdev.value[b.value] == False
+
+        assert self.current_state == PenState.from_evdev(evdev, button)
 
 
 class PenDigitizer(base.UHIDTestDevice):
@@ -647,7 +539,7 @@ class PenDigitizer(base.UHIDTestDevice):
                     continue
                 self.fields = [f.usage_name for f in r]
 
-    def move_to(self, pen, state):
+    def move_to(self, pen, state, button):
         # fill in the previous values
         if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE:
             pen.restore()
@@ -690,29 +582,17 @@ class PenDigitizer(base.UHIDTestDevice):
             pen.inrange = True
             pen.invert = False
             pen.eraser = False
-            pen.barrelswitch = True
-            pen.secondarybarrelswitch = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED
         elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON:
             pen.tipswitch = True
             pen.inrange = True
             pen.invert = False
             pen.eraser = False
-            pen.barrelswitch = True
-            pen.secondarybarrelswitch = False
-        elif state == PenState.PEN_IS_IN_RANGE_WITH_SECOND_BUTTON:
-            pen.tipswitch = False
-            pen.inrange = True
-            pen.invert = False
-            pen.eraser = False
-            pen.barrelswitch = False
-            pen.secondarybarrelswitch = True
-        elif state == PenState.PEN_IS_IN_CONTACT_WITH_SECOND_BUTTON:
-            pen.tipswitch = True
-            pen.inrange = True
-            pen.invert = False
-            pen.eraser = False
-            pen.barrelswitch = False
-            pen.secondarybarrelswitch = True
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.secondarybarrelswitch = button == BtnPressed.SECONDARY_PRESSED
         elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT:
             pen.tipswitch = False
             pen.inrange = True
@@ -730,7 +610,7 @@ class PenDigitizer(base.UHIDTestDevice):
 
         pen.current_state = state
 
-    def event(self, pen):
+    def event(self, pen, button):
         rs = []
         r = self.create_report(application=self.cur_application, data=pen)
         self.call_input_event(r)
@@ -771,17 +651,17 @@ class BaseTest:
         def create_device(self):
             raise Exception("please reimplement me in subclasses")
 
-        def post(self, uhdev, pen):
-            r = uhdev.event(pen)
+        def post(self, uhdev, pen, test_button):
+            r = uhdev.event(pen, test_button)
             events = uhdev.next_sync_events()
             self.debug_reports(r, uhdev, events)
             return events
 
         def validate_transitions(
-            self, from_state, pen, evdev, events, allow_intermediate_states
+            self, from_state, pen, evdev, events, allow_intermediate_states, button
         ):
             # check that the final state is correct
-            pen.assert_expected_input_events(evdev)
+            pen.assert_expected_input_events(evdev, button)
 
             state = from_state
 
@@ -794,12 +674,14 @@ class BaseTest:
                 events = events[idx + 1 :]
 
                 # now check for a valid transition
-                state = state.apply(sync_events, not allow_intermediate_states)
+                state = state.apply(sync_events, not allow_intermediate_states, button)
 
             if events:
-                state = state.apply(sync_events, not allow_intermediate_states)
+                state = state.apply(sync_events, not allow_intermediate_states, button)
 
-        def _test_states(self, state_list, scribble, allow_intermediate_states):
+        def _test_states(
+            self, state_list, scribble, allow_intermediate_states, button=None
+        ):
             """Internal method to test against a list of
             transition between states.
             state_list is a list of PenState objects
@@ -812,10 +694,10 @@ class BaseTest:
             cur_state = PenState.PEN_IS_OUT_OF_RANGE
 
             p = Pen(50, 60)
-            uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE)
-            events = self.post(uhdev, p)
+            uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE, button)
+            events = self.post(uhdev, p, button)
             self.validate_transitions(
-                cur_state, p, evdev, events, allow_intermediate_states
+                cur_state, p, evdev, events, allow_intermediate_states, button
             )
 
             cur_state = p.current_state
@@ -824,18 +706,18 @@ class BaseTest:
                 if scribble and cur_state != PenState.PEN_IS_OUT_OF_RANGE:
                     p.x += 1
                     p.y -= 1
-                    events = self.post(uhdev, p)
+                    events = self.post(uhdev, p, button)
                     self.validate_transitions(
-                        cur_state, p, evdev, events, allow_intermediate_states
+                        cur_state, p, evdev, events, allow_intermediate_states, button
                     )
                     assert len(events) >= 3  # X, Y, SYN
-                uhdev.move_to(p, state)
+                uhdev.move_to(p, state, button)
                 if scribble and state != PenState.PEN_IS_OUT_OF_RANGE:
                     p.x += 1
                     p.y -= 1
-                events = self.post(uhdev, p)
+                events = self.post(uhdev, p, button)
                 self.validate_transitions(
-                    cur_state, p, evdev, events, allow_intermediate_states
+                    cur_state, p, evdev, events, allow_intermediate_states, button
                 )
                 cur_state = p.current_state
 
@@ -874,12 +756,17 @@ class BaseTest:
             "state_list",
             [
                 pytest.param(v, id=k)
-                for k, v in PenState.legal_transitions_with_primary_button().items()
+                for k, v in PenState.legal_transitions_with_button().items()
             ],
         )
         def test_valid_primary_button_pen_states(self, state_list, scribble):
             """Rework the transition state machine by adding the primary button."""
-            self._test_states(state_list, scribble, allow_intermediate_states=False)
+            self._test_states(
+                state_list,
+                scribble,
+                allow_intermediate_states=False,
+                button=BtnPressed.PRIMARY_PRESSED,
+            )
 
         @pytest.mark.skip_if_uhdev(
             lambda uhdev: "Secondary Barrel Switch" not in uhdev.fields,
@@ -890,12 +777,38 @@ class BaseTest:
             "state_list",
             [
                 pytest.param(v, id=k)
-                for k, v in PenState.legal_transitions_with_secondary_button().items()
+                for k, v in PenState.legal_transitions_with_button().items()
             ],
         )
         def test_valid_secondary_button_pen_states(self, state_list, scribble):
             """Rework the transition state machine by adding the secondary button."""
-            self._test_states(state_list, scribble, allow_intermediate_states=False)
+            self._test_states(
+                state_list,
+                scribble,
+                allow_intermediate_states=False,
+                button=BtnPressed.SECONDARY_PRESSED,
+            )
+
+        @pytest.mark.skip_if_uhdev(
+            lambda uhdev: "Third Barrel Switch" not in uhdev.fields,
+            "Device not compatible, missing Third Barrel Switch usage",
+        )
+        @pytest.mark.parametrize("scribble", [True, False], ids=["scribble", "static"])
+        @pytest.mark.parametrize(
+            "state_list",
+            [
+                pytest.param(v, id=k)
+                for k, v in PenState.legal_transitions_with_button().items()
+            ],
+        )
+        def test_valid_third_button_pen_states(self, state_list, scribble):
+            """Rework the transition state machine by adding the secondary button."""
+            self._test_states(
+                state_list,
+                scribble,
+                allow_intermediate_states=False,
+                button=BtnPressed.THIRD_PRESSED,
+            )
 
         @pytest.mark.skip_if_uhdev(
             lambda uhdev: "Invert" not in uhdev.fields,
@@ -956,7 +869,7 @@ class BaseTest:
 
 
 class GXTP_pen(PenDigitizer):
-    def event(self, pen):
+    def event(self, pen, test_button):
         if not hasattr(self, "prev_tip_state"):
             self.prev_tip_state = False
 
@@ -977,13 +890,407 @@ class GXTP_pen(PenDigitizer):
         if pen.eraser:
             internal_pen.invert = False
 
-        return super().event(internal_pen)
+        return super().event(internal_pen, test_button)
 
 
 class USIPen(PenDigitizer):
     pass
 
 
+class XPPen_ArtistPro16Gen2_28bd_095b(PenDigitizer):
+    """
+    Pen with two buttons and a rubber end, but which reports
+    the second button as an eraser
+    """
+
+    def __init__(
+        self,
+        name,
+        rdesc_str=None,
+        rdesc=None,
+        application="Pen",
+        physical="Stylus",
+        input_info=(BusType.USB, 0x28BD, 0x095B),
+        evdev_name_suffix=None,
+    ):
+        super().__init__(
+            name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix
+        )
+        self.fields.append("Secondary Barrel Switch")
+
+    def move_to(self, pen, state, button):
+        # fill in the previous values
+        if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE:
+            pen.restore()
+
+        print(f"\n  *** pen is moving to {state} ***")
+
+        if state == PenState.PEN_IS_OUT_OF_RANGE:
+            pen.backup()
+            pen.x = 0
+            pen.y = 0
+            pen.tipswitch = False
+            pen.tippressure = 0
+            pen.azimuth = 0
+            pen.inrange = False
+            pen.width = 0
+            pen.height = 0
+            pen.invert = False
+            pen.eraser = False
+            pen.xtilt = 0
+            pen.ytilt = 0
+            pen.twist = 0
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_IN_RANGE:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = False
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_IN_CONTACT:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = False
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.eraser = button == BtnPressed.SECONDARY_PRESSED
+        elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.invert = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.eraser = button == BtnPressed.SECONDARY_PRESSED
+        elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = True
+            pen.eraser = False
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_ERASING:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.invert = True
+            pen.eraser = False
+            pen.barrelswitch = False
+
+        pen.current_state = state
+
+    def event(self, pen, test_button):
+        import math
+
+        pen_copy = copy.copy(pen)
+        width = 13.567
+        height = 8.480
+        tip_height = 0.055677699
+        hx = tip_height * (32767 / width)
+        hy = tip_height * (32767 / height)
+        if pen_copy.xtilt != 0:
+            pen_copy.x += round(hx * math.sin(math.radians(pen_copy.xtilt)))
+        if pen_copy.ytilt != 0:
+            pen_copy.y += round(hy * math.sin(math.radians(pen_copy.ytilt)))
+
+        return super().event(pen_copy, test_button)
+
+
+class XPPen_Artist24_28bd_093a(PenDigitizer):
+    """
+    Pen that reports secondary barrel switch through eraser
+    """
+
+    def __init__(
+        self,
+        name,
+        rdesc_str=None,
+        rdesc=None,
+        application="Pen",
+        physical="Stylus",
+        input_info=(BusType.USB, 0x28BD, 0x093A),
+        evdev_name_suffix=None,
+    ):
+        super().__init__(
+            name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix
+        )
+        self.fields.append("Secondary Barrel Switch")
+        self.previous_state = PenState.PEN_IS_OUT_OF_RANGE
+
+    def move_to(self, pen, state, button, debug=True):
+        # fill in the previous values
+        if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE:
+            pen.restore()
+
+        if debug:
+            print(f"\n  *** pen is moving to {state} ***")
+
+        if state == PenState.PEN_IS_OUT_OF_RANGE:
+            pen.backup()
+            pen.tipswitch = False
+            pen.tippressure = 0
+            pen.azimuth = 0
+            pen.inrange = False
+            pen.width = 0
+            pen.height = 0
+            pen.invert = False
+            pen.eraser = False
+            pen.xtilt = 0
+            pen.ytilt = 0
+            pen.twist = 0
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_IN_RANGE:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = False
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_IN_CONTACT:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = False
+            pen.barrelswitch = False
+        elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.eraser = button == BtnPressed.SECONDARY_PRESSED
+        elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.invert = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.eraser = button == BtnPressed.SECONDARY_PRESSED
+
+        pen.current_state = state
+
+    def send_intermediate_state(self, pen, state, button):
+        intermediate_pen = copy.copy(pen)
+        self.move_to(intermediate_pen, state, button, debug=False)
+        return super().event(intermediate_pen, button)
+
+    def event(self, pen, button):
+        rs = []
+
+        # the pen reliably sends in-range events in a normal case (non emulation of eraser mode)
+        if self.previous_state == PenState.PEN_IS_IN_CONTACT:
+            if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE:
+                rs.extend(
+                    self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button)
+                )
+
+        if button == BtnPressed.SECONDARY_PRESSED:
+            if self.previous_state == PenState.PEN_IS_IN_RANGE:
+                if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON:
+                    rs.extend(
+                        self.send_intermediate_state(
+                            pen, PenState.PEN_IS_OUT_OF_RANGE, button
+                        )
+                    )
+
+            if self.previous_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON:
+                if pen.current_state == PenState.PEN_IS_IN_RANGE:
+                    rs.extend(
+                        self.send_intermediate_state(
+                            pen, PenState.PEN_IS_OUT_OF_RANGE, button
+                        )
+                    )
+
+            if self.previous_state == PenState.PEN_IS_IN_CONTACT:
+                if pen.current_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON:
+                    rs.extend(
+                        self.send_intermediate_state(
+                            pen, PenState.PEN_IS_OUT_OF_RANGE, button
+                        )
+                    )
+                    rs.extend(
+                        self.send_intermediate_state(
+                            pen, PenState.PEN_IS_IN_RANGE_WITH_BUTTON, button
+                        )
+                    )
+
+            if self.previous_state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON:
+                if pen.current_state == PenState.PEN_IS_IN_CONTACT:
+                    rs.extend(
+                        self.send_intermediate_state(
+                            pen, PenState.PEN_IS_OUT_OF_RANGE, button
+                        )
+                    )
+                    rs.extend(
+                        self.send_intermediate_state(
+                            pen, PenState.PEN_IS_IN_RANGE, button
+                        )
+                    )
+
+        rs.extend(super().event(pen, button))
+        self.previous_state = pen.current_state
+        return rs
+
+
+class Huion_Kamvas_Pro_19_256c_006b(PenDigitizer):
+    """
+    Pen that reports secondary barrel switch through secondary TipSwtich
+    and 3rd button through Invert
+    """
+
+    def __init__(
+        self,
+        name,
+        rdesc_str=None,
+        rdesc=None,
+        application="Stylus",
+        physical=None,
+        input_info=(BusType.USB, 0x256C, 0x006B),
+        evdev_name_suffix=None,
+    ):
+        super().__init__(
+            name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix
+        )
+        self.fields.append("Secondary Barrel Switch")
+        self.fields.append("Third Barrel Switch")
+        self.previous_state = PenState.PEN_IS_OUT_OF_RANGE
+
+    def move_to(self, pen, state, button, debug=True):
+        # fill in the previous values
+        if pen.current_state == PenState.PEN_IS_OUT_OF_RANGE:
+            pen.restore()
+
+        if debug:
+            print(f"\n  *** pen is moving to {state} ***")
+
+        if state == PenState.PEN_IS_OUT_OF_RANGE:
+            pen.backup()
+            pen.tipswitch = False
+            pen.tippressure = 0
+            pen.azimuth = 0
+            pen.inrange = False
+            pen.width = 0
+            pen.height = 0
+            pen.invert = False
+            pen.eraser = False
+            pen.xtilt = 0
+            pen.ytilt = 0
+            pen.twist = 0
+            pen.barrelswitch = False
+            pen.secondarytipswitch = False
+        elif state == PenState.PEN_IS_IN_RANGE:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = False
+            pen.barrelswitch = False
+            pen.secondarytipswitch = False
+        elif state == PenState.PEN_IS_IN_CONTACT:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = False
+            pen.barrelswitch = False
+            pen.secondarytipswitch = False
+        elif state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.eraser = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED
+            pen.invert = button == BtnPressed.THIRD_PRESSED
+        elif state == PenState.PEN_IS_IN_CONTACT_WITH_BUTTON:
+            pen.tipswitch = True
+            pen.inrange = True
+            pen.eraser = False
+            assert button is not None
+            pen.barrelswitch = button == BtnPressed.PRIMARY_PRESSED
+            pen.secondarytipswitch = button == BtnPressed.SECONDARY_PRESSED
+            pen.invert = button == BtnPressed.THIRD_PRESSED
+        elif state == PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = True
+            pen.eraser = False
+            pen.barrelswitch = False
+            pen.secondarytipswitch = False
+        elif state == PenState.PEN_IS_ERASING:
+            pen.tipswitch = False
+            pen.inrange = True
+            pen.invert = False
+            pen.eraser = True
+            pen.barrelswitch = False
+            pen.secondarytipswitch = False
+
+        pen.current_state = state
+
+    def call_input_event(self, report):
+        if report[0] == 0x0a:
+            # ensures the original second Eraser usage is null
+            report[1] &= 0xdf
+
+            # ensures the original last bit is equal to bit 6 (In Range)
+            if report[1] & 0x40:
+                report[1] |= 0x80
+
+        super().call_input_event(report)
+
+    def send_intermediate_state(self, pen, state, test_button):
+        intermediate_pen = copy.copy(pen)
+        self.move_to(intermediate_pen, state, test_button, debug=False)
+        return super().event(intermediate_pen, test_button)
+
+    def event(self, pen, button):
+        rs = []
+
+        # it's not possible to go between eraser mode or not without
+        # going out-of-prox: the eraser mode is activated by presenting
+        # the tail of the pen
+        if self.previous_state in (
+            PenState.PEN_IS_IN_RANGE,
+            PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
+            PenState.PEN_IS_IN_CONTACT,
+            PenState.PEN_IS_IN_CONTACT_WITH_BUTTON,
+        ) and pen.current_state in (
+            PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT,
+            PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON,
+            PenState.PEN_IS_ERASING,
+            PenState.PEN_IS_ERASING_WITH_BUTTON,
+        ):
+            rs.extend(
+                self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button)
+            )
+
+        # same than above except from eraser to normal
+        if self.previous_state in (
+            PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT,
+            PenState.PEN_IS_IN_RANGE_WITH_ERASING_INTENT_WITH_BUTTON,
+            PenState.PEN_IS_ERASING,
+            PenState.PEN_IS_ERASING_WITH_BUTTON,
+        ) and pen.current_state in (
+            PenState.PEN_IS_IN_RANGE,
+            PenState.PEN_IS_IN_RANGE_WITH_BUTTON,
+            PenState.PEN_IS_IN_CONTACT,
+            PenState.PEN_IS_IN_CONTACT_WITH_BUTTON,
+        ):
+            rs.extend(
+                self.send_intermediate_state(pen, PenState.PEN_IS_OUT_OF_RANGE, button)
+            )
+
+        if self.previous_state == PenState.PEN_IS_OUT_OF_RANGE:
+            if pen.current_state == PenState.PEN_IS_IN_RANGE_WITH_BUTTON:
+                rs.extend(
+                    self.send_intermediate_state(pen, PenState.PEN_IS_IN_RANGE, button)
+                )
+
+        rs.extend(super().event(pen, button))
+        self.previous_state = pen.current_state
+        return rs
+
+
 ################################################################################
 #
 # Windows 7 compatible devices
@@ -1162,3 +1469,37 @@ class TestGoodix_27c6_0e00(BaseTest.TestTablet):
             rdesc="05 0d 09 04 a1 01 85 01 09 22 a1 02 55 0e 65 11 35 00 15 00 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 95 01 75 08 09 51 81 02 75 10 05 01 26 04 20 46 e6 09 09 30 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 22 a1 02 09 42 15 00 25 01 75 01 95 01 81 02 25 7f 09 30 75 07 81 42 75 08 09 51 95 01 81 02 05 01 26 04 20 75 10 55 0e 65 11 09 30 35 00 46 e6 09 81 02 26 60 15 46 9a 06 09 31 81 02 05 0d 55 0f 75 08 25 ff 45 ff 09 48 81 42 09 49 81 42 55 0e c0 09 54 15 00 25 7f 75 08 95 01 81 02 85 02 09 55 95 01 25 0a b1 02 85 03 06 00 ff 09 c5 15 00 26 ff 00 75 08 96 00 01 b1 02 c0 05 0d 09 02 a1 01 09 20 a1 00 85 08 05 01 a4 09 30 35 00 46 e6 09 15 00 26 04 20 55 0d 65 13 75 10 95 01 81 02 09 31 46 9a 06 26 60 15 81 02 b4 05 0d 09 38 95 01 75 08 15 00 25 01 81 02 09 30 75 10 26 ff 0f 81 02 09 31 81 02 09 42 09 44 09 5a 09 3c 09 45 09 32 75 01 95 06 25 01 81 02 95 02 81 03 09 3d 55 0e 65 14 36 d8 dc 46 28 23 16 d8 dc 26 28 23 95 01 75 10 81 02 09 3e 81 02 09 41 15 00 27 a0 8c 00 00 35 00 47 a0 8c 00 00 81 02 05 20 0a 53 04 65 00 16 01 f8 26 ff 07 75 10 95 01 81 02 0a 54 04 81 02 0a 55 04 81 02 0a 57 04 81 02 0a 58 04 81 02 0a 59 04 81 02 0a 72 04 81 02 0a 73 04 81 02 0a 74 04 81 02 05 0d 09 3b 15 00 25 64 75 08 81 02 09 5b 25 ff 75 40 81 02 06 00 ff 09 5b 75 20 81 02 05 0d 09 5c 26 ff 00 75 08 81 02 09 5e 81 02 09 70 a1 02 15 01 25 06 09 72 09 73 09 74 09 75 09 76 09 77 81 20 c0 06 00 ff 09 01 15 00 27 ff ff 00 00 75 10 95 01 81 02 85 09 09 81 a1 02 09 81 15 01 25 04 09 82 09 83 09 84 09 85 81 20 c0 85 10 09 5c a1 02 15 00 25 01 75 08 95 01 09 38 b1 02 09 5c 26 ff 00 b1 02 09 5d 75 01 95 01 25 01 b1 02 95 07 b1 03 c0 85 11 09 5e a1 02 09 38 15 00 25 01 75 08 95 01 b1 02 09 5e 26 ff 00 b1 02 09 5f 75 01 25 01 b1 02 75 07 b1 03 c0 85 12 09 70 a1 02 75 08 95 01 15 00 25 01 09 38 b1 02 09 70 a1 02 25 06 09 72 09 73 09 74 09 75 09 76 09 77 b1 20 c0 09 71 75 01 25 01 b1 02 75 07 b1 03 c0 85 13 09 80 15 00 25 ff 75 40 95 01 b1 02 85 14 09 44 a1 02 09 38 75 08 95 01 25 01 b1 02 15 01 25 03 09 44 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 5a a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 09 45 a1 02 09 a4 09 44 09 5a 09 45 09 a3 b1 20 c0 c0 85 15 75 08 95 01 05 0d 09 90 a1 02 09 38 25 01 b1 02 09 91 75 10 26 ff 0f b1 02 09 92 75 40 25 ff b1 02 05 06 09 2a 75 08 26 ff 00 a1 02 09 2d b1 02 09 2e b1 02 c0 c0 85 16 05 06 09 2b a1 02 05 0d 25 01 09 38 b1 02 05 06 09 2b a1 02 09 2d 26 ff 00 b1 02 09 2e b1 02 c0 c0 85 17 06 00 ff 09 01 a1 02 05 0d 09 38 75 08 95 01 25 01 b1 02 06 00 ff 09 01 75 10 27 ff ff 00 00 b1 02 c0 85 18 05 0d 09 38 75 08 95 01 15 00 25 01 b1 02 c0 c0 06 f0 ff 09 01 a1 01 85 0e 09 01 15 00 25 ff 75 08 95 40 91 02 09 01 15 00 25 ff 75 08 95 40 81 02 c0",
             input_info=(BusType.I2C, 0x27C6, 0x0E00),
         )
+
+
+class TestXPPen_ArtistPro16Gen2_28bd_095b(BaseTest.TestTablet):
+    hid_bpfs = [("XPPen__ArtistPro16Gen2.bpf.o", True)]
+
+    def create_device(self):
+        dev = XPPen_ArtistPro16Gen2_28bd_095b(
+            "uhid test XPPen Artist Pro 16 Gen2 28bd 095b",
+            rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 09 3c 15 00 25 01 75 01 95 04 81 02 95 01 81 03 09 32 15 00 25 01 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 ff 34 26 ff 7f 81 02 09 31 46 20 21 26 ff 7f 81 02 b4 09 30 45 00 26 ff 3f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0",
+            input_info=(BusType.USB, 0x28BD, 0x095B),
+        )
+        return dev
+
+
+class TestXPPen_Artist24_28bd_093a(BaseTest.TestTablet):
+    hid_bpfs = [("XPPen__Artist24.bpf.o", True)]
+
+    def create_device(self):
+        return XPPen_Artist24_28bd_093a(
+            "uhid test XPPen Artist 24 28bd 093a",
+            rdesc="05 0d 09 02 a1 01 85 07 09 20 a1 00 09 42 09 44 09 45 15 00 25 01 75 01 95 03 81 02 95 02 81 03 09 32 95 01 81 02 95 02 81 03 75 10 95 01 35 00 a4 05 01 09 30 65 13 55 0d 46 f0 50 26 ff 7f 81 02 09 31 46 91 2d 26 ff 7f 81 02 b4 09 30 45 00 26 ff 1f 81 42 09 3d 15 81 25 7f 75 08 95 01 81 02 09 3e 15 81 25 7f 81 02 c0 c0",
+            input_info=(BusType.USB, 0x28BD, 0x093A),
+        )
+
+
+class TestHuion_Kamvas_Pro_19_256c_006b(BaseTest.TestTablet):
+    hid_bpfs = [("Huion__Kamvas-Pro-19.bpf.o", True)]
+
+    def create_device(self):
+        return Huion_Kamvas_Pro_19_256c_006b(
+            "uhid test HUION Huion Tablet_GT1902",
+            rdesc="05 0d 09 02 a1 01 85 0a 09 20 a1 01 09 42 09 44 09 43 09 3c 09 45 15 00 25 01 75 01 95 06 81 02 09 32 75 01 95 01 81 02 81 03 05 01 09 30 09 31 55 0d 65 33 26 ff 7f 35 00 46 00 08 75 10 95 02 81 02 05 0d 09 30 26 ff 3f 75 10 95 01 81 02 09 3d 09 3e 15 a6 25 5a 75 08 95 02 81 02 c0 c0 05 0d 09 04 a1 01 85 04 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 56 55 00 65 00 27 ff ff ff 7f 95 01 75 20 81 02 09 54 25 7f 95 01 75 08 81 02 75 08 95 08 81 03 85 05 09 55 25 0a 75 08 95 01 b1 02 06 00 ff 09 c5 85 06 15 00 26 ff 00 75 08 96 00 01 b1 02 c0",
+            input_info=(BusType.USB, 0x256C, 0x006B),
+        )
diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config
index 110d73917615..02a2a1b267c1 100644
--- a/tools/testing/selftests/iommu/config
+++ b/tools/testing/selftests/iommu/config
@@ -1,3 +1,5 @@
 CONFIG_IOMMUFD=y
+CONFIG_FAULT_INJECTION_DEBUG_FS=y
 CONFIG_FAULT_INJECTION=y
 CONFIG_IOMMUFD_TEST=y
+CONFIG_FAILSLAB=y
diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c
index 656c43c24044..c75ea4094870 100644
--- a/tools/testing/selftests/ipc/msgque.c
+++ b/tools/testing/selftests/ipc/msgque.c
@@ -198,13 +198,12 @@ int main(int argc, char **argv)
 	struct msgque_data msgque;
 
 	if (getuid() != 0)
-		return ksft_exit_skip(
-				"Please run the test as root - Exiting.\n");
+		ksft_exit_skip("Please run the test as root - Exiting.\n");
 
 	msgque.key = ftok(argv[0], 822155650);
 	if (msgque.key == -1) {
 		printf("Can't make key: %d\n", -errno);
-		return ksft_exit_fail();
+		ksft_exit_fail();
 	}
 
 	msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666);
@@ -243,13 +242,13 @@ int main(int argc, char **argv)
 		printf("Failed to test queue: %d\n", err);
 		goto err_out;
 	}
-	return ksft_exit_pass();
+	ksft_exit_pass();
 
 err_destroy:
 	if (msgctl(msgque.msq_id, IPC_RMID, NULL)) {
 		printf("Failed to destroy queue: %d\n", -errno);
-		return ksft_exit_fail();
+		ksft_exit_fail();
 	}
 err_out:
-	return ksft_exit_fail();
+	ksft_exit_fail();
 }
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index 541bf192e30e..76c2a6945d3e 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -16,10 +16,12 @@
  * For each test, report any progress, debugging, etc with:
  *
  *     ksft_print_msg(fmt, ...);
+ *     ksft_perror(msg);
  *
  * and finally report the pass/fail/skip/xfail state of the test with one of:
  *
  *     ksft_test_result(condition, fmt, ...);
+ *     ksft_test_result_report(result, fmt, ...);
  *     ksft_test_result_pass(fmt, ...);
  *     ksft_test_result_fail(fmt, ...);
  *     ksft_test_result_skip(fmt, ...);
@@ -39,6 +41,7 @@
  * the program is aborting before finishing all tests):
  *
  *    ksft_exit_fail_msg(fmt, ...);
+ *    ksft_exit_fail_perror(msg);
  *
  */
 #ifndef __KSELFTEST_H
@@ -51,6 +54,7 @@
 #include <stdarg.h>
 #include <string.h>
 #include <stdio.h>
+#include <sys/utsname.h>
 #endif
 
 #ifndef ARRAY_SIZE
@@ -79,6 +83,9 @@
 #define KSFT_XPASS 3
 #define KSFT_SKIP  4
 
+#ifndef __noreturn
+#define __noreturn       __attribute__((__noreturn__))
+#endif
 #define __printf(a, b)   __attribute__((format(printf, a, b)))
 
 /* counters */
@@ -288,24 +295,47 @@ void ksft_test_result_code(int exit_code, const char *test_name,
 	}
 
 	/* Docs seem to call for double space if directive is absent */
-	if (!directive[0] && msg[0])
+	if (!directive[0] && msg)
 		directive = " #  ";
 
-	va_start(args, msg);
 	printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive);
 	errno = saved_errno;
-	vprintf(msg, args);
+	if (msg) {
+		va_start(args, msg);
+		vprintf(msg, args);
+		va_end(args);
+	}
 	printf("\n");
-	va_end(args);
 }
 
-static inline int ksft_exit_pass(void)
+/**
+ * ksft_test_result() - Report test success based on truth of condition
+ *
+ * @condition: if true, report test success, otherwise failure.
+ */
+#define ksft_test_result_report(result, fmt, ...) do {		\
+	switch (result) {					\
+	case KSFT_PASS:						\
+		ksft_test_result_pass(fmt, ##__VA_ARGS__);	\
+		break;						\
+	case KSFT_FAIL:						\
+		ksft_test_result_fail(fmt, ##__VA_ARGS__);	\
+		break;						\
+	case KSFT_XFAIL:					\
+		ksft_test_result_xfail(fmt, ##__VA_ARGS__);	\
+		break;						\
+	case KSFT_SKIP:						\
+		ksft_test_result_skip(fmt, ##__VA_ARGS__);	\
+		break;						\
+	} } while (0)
+
+static inline __noreturn void ksft_exit_pass(void)
 {
 	ksft_print_cnts();
 	exit(KSFT_PASS);
 }
 
-static inline int ksft_exit_fail(void)
+static inline __noreturn void ksft_exit_fail(void)
 {
 	ksft_print_cnts();
 	exit(KSFT_FAIL);
@@ -332,7 +362,7 @@ static inline int ksft_exit_fail(void)
 		  ksft_cnt.ksft_xfail +	\
 		  ksft_cnt.ksft_xskip)
 
-static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...)
+static inline __noreturn __printf(1, 2) void ksft_exit_fail_msg(const char *msg, ...)
 {
 	int saved_errno = errno;
 	va_list args;
@@ -347,19 +377,32 @@ static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...)
 	exit(KSFT_FAIL);
 }
 
-static inline int ksft_exit_xfail(void)
+static inline __noreturn void ksft_exit_fail_perror(const char *msg)
+{
+#ifndef NOLIBC
+	ksft_exit_fail_msg("%s: %s (%d)\n", msg, strerror(errno), errno);
+#else
+	/*
+	 * nolibc doesn't provide strerror() and it seems
+	 * inappropriate to add one, just print the errno.
+	 */
+	ksft_exit_fail_msg("%s: %d)\n", msg, errno);
+#endif
+}
+
+static inline __noreturn void ksft_exit_xfail(void)
 {
 	ksft_print_cnts();
 	exit(KSFT_XFAIL);
 }
 
-static inline int ksft_exit_xpass(void)
+static inline __noreturn void ksft_exit_xpass(void)
 {
 	ksft_print_cnts();
 	exit(KSFT_XPASS);
 }
 
-static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...)
+static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ...)
 {
 	int saved_errno = errno;
 	va_list args;
@@ -388,4 +431,21 @@ static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...)
 	exit(KSFT_SKIP);
 }
 
+static inline int ksft_min_kernel_version(unsigned int min_major,
+					  unsigned int min_minor)
+{
+#ifdef NOLIBC
+	ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n");
+	return 0;
+#else
+	unsigned int major, minor;
+	struct utsname info;
+
+	if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2)
+		ksft_exit_fail_msg("Can't parse kernel version\n");
+
+	return major > min_major || (major == min_major && minor >= min_minor);
+#endif
+}
+
 #endif /* __KSELFTEST_H */
diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh
index f2fbb914e058..79a125eb24c2 100644
--- a/tools/testing/selftests/kselftest/ktap_helpers.sh
+++ b/tools/testing/selftests/kselftest/ktap_helpers.sh
@@ -43,7 +43,7 @@ __ktap_test() {
 	directive="$3" # optional
 
 	local directive_str=
-	[[ ! -z "$directive" ]] && directive_str="# $directive"
+	[ ! -z "$directive" ] && directive_str="# $directive"
 
 	echo $result $KTAP_TESTNO $description $directive_str
 
@@ -99,7 +99,7 @@ ktap_exit_fail_msg() {
 ktap_finished() {
 	ktap_print_totals
 
-	if [ $(("$KTAP_CNT_PASS" + "$KTAP_CNT_SKIP")) -eq "$KSFT_NUM_TESTS" ]; then
+	if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then
 		exit "$KSFT_PASS"
 	else
 		exit "$KSFT_FAIL"
diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh
index de59cc8f03c3..487e49fdf2a6 100755
--- a/tools/testing/selftests/kselftest_deps.sh
+++ b/tools/testing/selftests/kselftest_deps.sh
@@ -244,6 +244,7 @@ l4_test()
 l5_test()
 {
 	tests=$(find $(dirname "$test") -type f -name "*.mk")
+	[[ -z "${tests// }" ]] && return
 	test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \
 			cut -d "=" -f 2)
 
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 4fd735e48ee7..37b03f1b8741 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -51,12 +51,11 @@
 #define __KSELFTEST_HARNESS_H
 
 #ifndef _GNU_SOURCE
-#define _GNU_SOURCE
+static_assert(0, "kselftest harness requires _GNU_SOURCE to be defined");
 #endif
 #include <asm/types.h>
 #include <ctype.h>
 #include <errno.h>
-#include <limits.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -67,6 +66,8 @@
 #include <sys/wait.h>
 #include <unistd.h>
 #include <setjmp.h>
+#include <syscall.h>
+#include <linux/sched.h>
 
 #include "kselftest.h"
 
@@ -81,6 +82,17 @@
 #  define TH_LOG_ENABLED 1
 #endif
 
+/* Wait for the child process to end but without sharing memory mapping. */
+static inline pid_t clone3_vfork(void)
+{
+	struct clone_args args = {
+		.flags = CLONE_VFORK,
+		.exit_signal = SIGCHLD,
+	};
+
+	return syscall(__NR_clone3, &args, sizeof(args));
+}
+
 /**
  * TH_LOG()
  *
@@ -282,6 +294,32 @@
  * A bare "return;" statement may be used to return early.
  */
 #define FIXTURE_TEARDOWN(fixture_name) \
+	static const bool fixture_name##_teardown_parent; \
+	__FIXTURE_TEARDOWN(fixture_name)
+
+/**
+ * FIXTURE_TEARDOWN_PARENT()
+ * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly.
+ *
+ * @fixture_name: fixture name
+ *
+ * .. code-block:: c
+ *
+ *     FIXTURE_TEARDOWN_PARENT(fixture_name) { implementation }
+ *
+ * Same as FIXTURE_TEARDOWN() but run this code in a parent process.  This
+ * enables the test process to drop its privileges without impacting the
+ * related FIXTURE_TEARDOWN_PARENT() (e.g. to remove files from a directory
+ * where write access was dropped).
+ *
+ * To make it possible for the parent process to use *self*, share (MAP_SHARED)
+ * the fixture data between all forked processes.
+ */
+#define FIXTURE_TEARDOWN_PARENT(fixture_name) \
+	static const bool fixture_name##_teardown_parent = true; \
+	__FIXTURE_TEARDOWN(fixture_name)
+
+#define __FIXTURE_TEARDOWN(fixture_name) \
 	void fixture_name##_teardown( \
 		struct __test_metadata __attribute__((unused)) *_metadata, \
 		FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \
@@ -326,7 +364,7 @@
  * variant.
  */
 #define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \
-	extern FIXTURE_VARIANT(fixture_name) \
+	extern const FIXTURE_VARIANT(fixture_name) \
 		_##fixture_name##_##variant_name##_variant; \
 	static struct __fixture_variant_metadata \
 		_##fixture_name##_##variant_name##_object = \
@@ -338,7 +376,7 @@
 		__register_fixture_variant(&_##fixture_name##_fixture_object, \
 			&_##fixture_name##_##variant_name##_object);	\
 	} \
-	FIXTURE_VARIANT(fixture_name) \
+	const FIXTURE_VARIANT(fixture_name) \
 		_##fixture_name##_##variant_name##_variant =
 
 /**
@@ -356,10 +394,11 @@
  * Very similar to TEST() except that *self* is the setup instance of fixture's
  * datatype exposed for use by the implementation.
  *
- * The @test_name code is run in a separate process sharing the same memory
- * (i.e. vfork), which means that the test process can update its privileges
- * without impacting the related FIXTURE_TEARDOWN() (e.g. to remove files from
- * a directory where write access was dropped).
+ * The _metadata object is shared (MAP_SHARED) with all the potential forked
+ * processes, which enables them to use EXCEPT_*() and ASSERT_*().
+ *
+ * The *self* object is only shared with the potential forked processes if
+ * FIXTURE_TEARDOWN_PARENT() is used instead of FIXTURE_TEARDOWN().
  */
 #define TEST_F(fixture_name, test_name) \
 	__TEST_F_IMPL(fixture_name, test_name, -1, TEST_TIMEOUT_DEFAULT)
@@ -380,50 +419,71 @@
 		struct __fixture_variant_metadata *variant) \
 	{ \
 		/* fixture data is alloced, setup, and torn down per call. */ \
-		FIXTURE_DATA(fixture_name) self; \
+		FIXTURE_DATA(fixture_name) self_private, *self = NULL; \
 		pid_t child = 1; \
 		int status = 0; \
-		memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \
+		/* Makes sure there is only one teardown, even when child forks again. */ \
+		bool *teardown = mmap(NULL, sizeof(*teardown), \
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \
+		*teardown = false; \
+		if (sizeof(*self) > 0) { \
+			if (fixture_name##_teardown_parent) { \
+				self = mmap(NULL, sizeof(*self), PROT_READ | PROT_WRITE, \
+					MAP_SHARED | MAP_ANONYMOUS, -1, 0); \
+			} else { \
+				memset(&self_private, 0, sizeof(self_private)); \
+				self = &self_private; \
+			} \
+		} \
 		if (setjmp(_metadata->env) == 0) { \
-			/* Use the same _metadata. */ \
-			child = vfork(); \
+			/* _metadata and potentially self are shared with all forks. */ \
+			child = clone3_vfork(); \
 			if (child == 0) { \
-				fixture_name##_setup(_metadata, &self, variant->data); \
+				fixture_name##_setup(_metadata, self, variant->data); \
 				/* Let setup failure terminate early. */ \
 				if (_metadata->exit_code) \
 					_exit(0); \
 				_metadata->setup_completed = true; \
-				fixture_name##_##test_name(_metadata, &self, variant->data); \
+				fixture_name##_##test_name(_metadata, self, variant->data); \
 			} else if (child < 0 || child != waitpid(child, &status, 0)) { \
 				ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \
 				_metadata->exit_code = KSFT_FAIL; \
 			} \
 		} \
 		if (child == 0) { \
-			if (_metadata->setup_completed && !_metadata->teardown_parent) \
-				fixture_name##_teardown(_metadata, &self, variant->data); \
+			if (_metadata->setup_completed && !fixture_name##_teardown_parent && \
+					__sync_bool_compare_and_swap(teardown, false, true)) \
+				fixture_name##_teardown(_metadata, self, variant->data); \
 			_exit(0); \
 		} \
-		if (_metadata->setup_completed && _metadata->teardown_parent) \
-			fixture_name##_teardown(_metadata, &self, variant->data); \
-		if (!WIFEXITED(status) && WIFSIGNALED(status)) \
+		if (_metadata->setup_completed && fixture_name##_teardown_parent && \
+				__sync_bool_compare_and_swap(teardown, false, true)) \
+			fixture_name##_teardown(_metadata, self, variant->data); \
+		munmap(teardown, sizeof(*teardown)); \
+		if (self && fixture_name##_teardown_parent) \
+			munmap(self, sizeof(*self)); \
+		if (WIFEXITED(status)) { \
+			if (WEXITSTATUS(status)) \
+				_metadata->exit_code = WEXITSTATUS(status); \
+		} else if (WIFSIGNALED(status)) { \
 			/* Forward signal to __wait_for_test(). */ \
 			kill(getpid(), WTERMSIG(status)); \
+		} \
 		__test_check_assert(_metadata); \
 	} \
-	static struct __test_metadata \
-		      _##fixture_name##_##test_name##_object = { \
-		.name = #test_name, \
-		.fn = &wrapper_##fixture_name##_##test_name, \
-		.fixture = &_##fixture_name##_fixture_object, \
-		.termsig = signal, \
-		.timeout = tmout, \
-		.teardown_parent = false, \
-	 }; \
+	static struct __test_metadata *_##fixture_name##_##test_name##_object; \
 	static void __attribute__((constructor)) \
 			_register_##fixture_name##_##test_name(void) \
 	{ \
-		__register_test(&_##fixture_name##_##test_name##_object); \
+		struct __test_metadata *object = mmap(NULL, sizeof(*object), \
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); \
+		object->name = #test_name; \
+		object->fn = &wrapper_##fixture_name##_##test_name; \
+		object->fixture = &_##fixture_name##_fixture_object; \
+		object->termsig = signal; \
+		object->timeout = tmout; \
+		_##fixture_name##_##test_name##_object = object; \
+		__register_test(object); \
 	} \
 	static void fixture_name##_##test_name( \
 		struct __test_metadata __attribute__((unused)) *_metadata, \
@@ -831,11 +891,12 @@ struct __test_xfail {
 	{ \
 		.fixture = &_##fixture_name##_fixture_object, \
 		.variant = &_##fixture_name##_##variant_name##_object, \
-		.test = &_##fixture_name##_##test_name##_object, \
 	}; \
 	static void __attribute__((constructor)) \
 		_register_##fixture_name##_##variant_name##_##test_name##_xfail(void) \
 	{ \
+		_##fixture_name##_##variant_name##_##test_name##_xfail.test = \
+			_##fixture_name##_##test_name##_object; \
 		__register_xfail(&_##fixture_name##_##variant_name##_##test_name##_xfail); \
 	}
 
@@ -878,7 +939,6 @@ struct __test_metadata {
 	bool timed_out;	/* did this test timeout instead of exiting? */
 	bool aborted;	/* stopped test due to failed ASSERT */
 	bool setup_completed; /* did setup finish? */
-	bool teardown_parent; /* run teardown in a parent process */
 	jmp_buf env;	/* for exiting out of test early */
 	struct __test_results *results;
 	struct __test_metadata *prev, *next;
@@ -1156,16 +1216,23 @@ void __run_test(struct __fixture_metadata *f,
 		struct __test_metadata *t)
 {
 	struct __test_xfail *xfail;
-	char test_name[LINE_MAX];
+	char *test_name;
 	const char *diagnostic;
 
 	/* reset test struct */
 	t->exit_code = KSFT_PASS;
 	t->trigger = 0;
+	t->aborted = false;
+	t->setup_completed = false;
+	memset(t->env, 0, sizeof(t->env));
 	memset(t->results->reason, 0, sizeof(t->results->reason));
 
-	snprintf(test_name, sizeof(test_name), "%s%s%s.%s",
-		 f->name, variant->name[0] ? "." : "", variant->name, t->name);
+	if (asprintf(&test_name, "%s%s%s.%s", f->name,
+		variant->name[0] ? "." : "", variant->name, t->name) == -1) {
+		ksft_print_msg("ERROR ALLOCATING MEMORY\n");
+		t->exit_code = KSFT_FAIL;
+		_exit(t->exit_code);
+	}
 
 	ksft_print_msg(" RUN           %s ...\n", test_name);
 
@@ -1173,7 +1240,7 @@ void __run_test(struct __fixture_metadata *f,
 	fflush(stdout);
 	fflush(stderr);
 
-	t->pid = fork();
+	t->pid = clone3_vfork();
 	if (t->pid < 0) {
 		ksft_print_msg("ERROR SPAWNING TEST CHILD\n");
 		t->exit_code = KSFT_FAIL;
@@ -1202,7 +1269,8 @@ void __run_test(struct __fixture_metadata *f,
 		diagnostic = "unknown";
 
 	ksft_test_result_code(t->exit_code, test_name,
-			      diagnostic ? "%s" : "", diagnostic);
+			      diagnostic ? "%s" : NULL, diagnostic);
+	free(test_name);
 }
 
 static int test_harness_run(int argc, char **argv)
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 492e937fab00..741c7dc16afc 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -36,7 +36,9 @@ LIBKVM_x86_64 += lib/x86_64/apic.c
 LIBKVM_x86_64 += lib/x86_64/handlers.S
 LIBKVM_x86_64 += lib/x86_64/hyperv.c
 LIBKVM_x86_64 += lib/x86_64/memstress.c
+LIBKVM_x86_64 += lib/x86_64/pmu.c
 LIBKVM_x86_64 += lib/x86_64/processor.c
+LIBKVM_x86_64 += lib/x86_64/sev.c
 LIBKVM_x86_64 += lib/x86_64/svm.c
 LIBKVM_x86_64 += lib/x86_64/ucall.c
 LIBKVM_x86_64 += lib/x86_64/vmx.c
@@ -53,6 +55,7 @@ LIBKVM_s390x += lib/s390x/diag318_test_handler.c
 LIBKVM_s390x += lib/s390x/processor.c
 LIBKVM_s390x += lib/s390x/ucall.c
 
+LIBKVM_riscv += lib/riscv/handlers.S
 LIBKVM_riscv += lib/riscv/processor.c
 LIBKVM_riscv += lib/riscv/ucall.c
 
@@ -80,6 +83,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
 TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test
 TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
@@ -117,6 +121,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test
 TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
 TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
+TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test
 TEST_GEN_PROGS_x86_64 += x86_64/amx_test
 TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test
 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test
@@ -143,7 +148,6 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs
-TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/hypercalls
 TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test
@@ -155,6 +159,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
 TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access
 TEST_GEN_PROGS_aarch64 += access_tracking_perf_test
+TEST_GEN_PROGS_aarch64 += arch_timer
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
@@ -184,6 +189,7 @@ TEST_GEN_PROGS_s390x += rseq_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 TEST_GEN_PROGS_s390x += kvm_binary_stats_test
 
+TEST_GEN_PROGS_riscv += arch_timer
 TEST_GEN_PROGS_riscv += demand_paging_test
 TEST_GEN_PROGS_riscv += dirty_log_test
 TEST_GEN_PROGS_riscv += get-reg-list
@@ -194,6 +200,7 @@ TEST_GEN_PROGS_riscv += kvm_page_table_test
 TEST_GEN_PROGS_riscv += set_memory_region_test
 TEST_GEN_PROGS_riscv += steal_time
 
+SPLIT_TESTS += arch_timer
 SPLIT_TESTS += get-reg-list
 
 TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR))
@@ -217,7 +224,7 @@ else
 LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
 endif
 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-	-Wno-gnu-variable-sized-type-not-at-end -MD -MP \
+	-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
 	-fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \
 	-fno-builtin-strnlen \
 	-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
@@ -260,32 +267,36 @@ LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
 LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
-SPLIT_TESTS_TARGETS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
-SPLIT_TESTS_OBJS := $(patsubst %, $(ARCH_DIR)/%.o, $(SPLIT_TESTS))
+SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
+SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH_DIR)/%.o, $(SPLIT_TESTS))
 
 TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS))
 TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED))
 TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ))
 TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS))
-TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TESTS_OBJS))
+TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TEST_GEN_OBJ))
 -include $(TEST_DEP_FILES)
 
-$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): %: %.o
+$(shell mkdir -p $(sort $(OUTPUT)/$(ARCH_DIR) $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
+
+$(filter-out $(SPLIT_TEST_GEN_PROGS), $(TEST_GEN_PROGS)) \
+$(TEST_GEN_PROGS_EXTENDED): %: %.o
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@
 $(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
-$(SPLIT_TESTS_TARGETS): %: %.o $(SPLIT_TESTS_OBJS)
+$(SPLIT_TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/$(ARCH_DIR)/%.o
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $^ $(LDLIBS) -o $@
+$(SPLIT_TEST_GEN_OBJ): $(OUTPUT)/$(ARCH_DIR)/%.o: $(ARCH_DIR)/%.c
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
 EXTRA_CLEAN += $(GEN_HDRS) \
 	       $(LIBKVM_OBJS) \
-	       $(SPLIT_TESTS_OBJS) \
+	       $(SPLIT_TEST_GEN_OBJ) \
 	       $(TEST_DEP_FILES) \
 	       $(TEST_GEN_OBJ) \
 	       cscope.*
 
-x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
 $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(GEN_HDRS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
@@ -298,8 +309,8 @@ $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(GEN_HDRS)
 $(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
 
-x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
-$(SPLIT_TESTS_OBJS): $(GEN_HDRS)
+$(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
+$(SPLIT_TEST_GEN_OBJ): $(GEN_HDRS)
 $(TEST_GEN_PROGS): $(LIBKVM_OBJS)
 $(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS)
 $(TEST_GEN_OBJ): $(GEN_HDRS)
diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c
index 2cb8dd1f8275..4eaba83cdcf3 100644
--- a/tools/testing/selftests/kvm/aarch64/arch_timer.c
+++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c
@@ -1,64 +1,19 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * arch_timer.c - Tests the aarch64 timer IRQ functionality
- *
  * The test validates both the virtual and physical timer IRQs using
- * CVAL and TVAL registers. This consitutes the four stages in the test.
- * The guest's main thread configures the timer interrupt for a stage
- * and waits for it to fire, with a timeout equal to the timer period.
- * It asserts that the timeout doesn't exceed the timer period.
- *
- * On the other hand, upon receipt of an interrupt, the guest's interrupt
- * handler validates the interrupt by checking if the architectural state
- * is in compliance with the specifications.
- *
- * The test provides command-line options to configure the timer's
- * period (-p), number of vCPUs (-n), and iterations per stage (-i).
- * To stress-test the timer stack even more, an option to migrate the
- * vCPUs across pCPUs (-m), at a particular rate, is also provided.
+ * CVAL and TVAL registers.
  *
  * Copyright (c) 2021, Google LLC.
  */
 #define _GNU_SOURCE
 
-#include <stdlib.h>
-#include <pthread.h>
-#include <linux/kvm.h>
-#include <linux/sizes.h>
-#include <linux/bitmap.h>
-#include <sys/sysinfo.h>
-
-#include "kvm_util.h"
-#include "processor.h"
-#include "delay.h"
 #include "arch_timer.h"
+#include "delay.h"
 #include "gic.h"
+#include "processor.h"
+#include "timer_test.h"
 #include "vgic.h"
 
-#define NR_VCPUS_DEF			4
-#define NR_TEST_ITERS_DEF		5
-#define TIMER_TEST_PERIOD_MS_DEF	10
-#define TIMER_TEST_ERR_MARGIN_US	100
-#define TIMER_TEST_MIGRATION_FREQ_MS	2
-
-struct test_args {
-	int nr_vcpus;
-	int nr_iter;
-	int timer_period_ms;
-	int migration_freq_ms;
-	struct kvm_arm_counter_offset offset;
-};
-
-static struct test_args test_args = {
-	.nr_vcpus = NR_VCPUS_DEF,
-	.nr_iter = NR_TEST_ITERS_DEF,
-	.timer_period_ms = TIMER_TEST_PERIOD_MS_DEF,
-	.migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS,
-	.offset = { .reserved = 1 },
-};
-
-#define msecs_to_usecs(msec)		((msec) * 1000LL)
-
 #define GICD_BASE_GPA			0x8000000ULL
 #define GICR_BASE_GPA			0x80A0000ULL
 
@@ -70,22 +25,8 @@ enum guest_stage {
 	GUEST_STAGE_MAX,
 };
 
-/* Shared variables between host and guest */
-struct test_vcpu_shared_data {
-	int nr_iter;
-	enum guest_stage guest_stage;
-	uint64_t xcnt;
-};
-
-static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
-static pthread_t pt_vcpu_run[KVM_MAX_VCPUS];
-static struct test_vcpu_shared_data vcpu_shared_data[KVM_MAX_VCPUS];
-
 static int vtimer_irq, ptimer_irq;
 
-static unsigned long *vcpu_done_map;
-static pthread_mutex_t vcpu_done_map_lock;
-
 static void
 guest_configure_timer_action(struct test_vcpu_shared_data *shared_data)
 {
@@ -158,9 +99,9 @@ static void guest_validate_irq(unsigned int intid,
 
 	/* Basic 'timer condition met' check */
 	__GUEST_ASSERT(xcnt >= cval,
-		       "xcnt = 0x%llx, cval = 0x%llx, xcnt_diff_us = 0x%llx",
+		       "xcnt = 0x%lx, cval = 0x%lx, xcnt_diff_us = 0x%lx",
 		       xcnt, cval, xcnt_diff_us);
-	__GUEST_ASSERT(xctl & CTL_ISTATUS, "xcnt = 0x%llx", xcnt);
+	__GUEST_ASSERT(xctl & CTL_ISTATUS, "xctl = 0x%lx", xctl);
 
 	WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1);
 }
@@ -190,10 +131,14 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data,
 
 		/* Setup a timeout for the interrupt to arrive */
 		udelay(msecs_to_usecs(test_args.timer_period_ms) +
-			TIMER_TEST_ERR_MARGIN_US);
+			test_args.timer_err_margin_us);
 
 		irq_iter = READ_ONCE(shared_data->nr_iter);
-		GUEST_ASSERT_EQ(config_iter + 1, irq_iter);
+		__GUEST_ASSERT(config_iter + 1 == irq_iter,
+				"config_iter + 1 = 0x%x, irq_iter = 0x%x.\n"
+				"  Guest timer interrupt was not triggered within the specified\n"
+				"  interval, try to increase the error margin by [-e] option.\n",
+				config_iter + 1, irq_iter);
 	}
 }
 
@@ -222,137 +167,6 @@ static void guest_code(void)
 	GUEST_DONE();
 }
 
-static void *test_vcpu_run(void *arg)
-{
-	unsigned int vcpu_idx = (unsigned long)arg;
-	struct ucall uc;
-	struct kvm_vcpu *vcpu = vcpus[vcpu_idx];
-	struct kvm_vm *vm = vcpu->vm;
-	struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx];
-
-	vcpu_run(vcpu);
-
-	/* Currently, any exit from guest is an indication of completion */
-	pthread_mutex_lock(&vcpu_done_map_lock);
-	__set_bit(vcpu_idx, vcpu_done_map);
-	pthread_mutex_unlock(&vcpu_done_map_lock);
-
-	switch (get_ucall(vcpu, &uc)) {
-	case UCALL_SYNC:
-	case UCALL_DONE:
-		break;
-	case UCALL_ABORT:
-		sync_global_from_guest(vm, *shared_data);
-		fprintf(stderr, "Guest assert failed,  vcpu %u; stage; %u; iter: %u\n",
-			vcpu_idx, shared_data->guest_stage, shared_data->nr_iter);
-		REPORT_GUEST_ASSERT(uc);
-		break;
-	default:
-		TEST_FAIL("Unexpected guest exit");
-	}
-
-	return NULL;
-}
-
-static uint32_t test_get_pcpu(void)
-{
-	uint32_t pcpu;
-	unsigned int nproc_conf;
-	cpu_set_t online_cpuset;
-
-	nproc_conf = get_nprocs_conf();
-	sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset);
-
-	/* Randomly find an available pCPU to place a vCPU on */
-	do {
-		pcpu = rand() % nproc_conf;
-	} while (!CPU_ISSET(pcpu, &online_cpuset));
-
-	return pcpu;
-}
-
-static int test_migrate_vcpu(unsigned int vcpu_idx)
-{
-	int ret;
-	cpu_set_t cpuset;
-	uint32_t new_pcpu = test_get_pcpu();
-
-	CPU_ZERO(&cpuset);
-	CPU_SET(new_pcpu, &cpuset);
-
-	pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu);
-
-	ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx],
-				     sizeof(cpuset), &cpuset);
-
-	/* Allow the error where the vCPU thread is already finished */
-	TEST_ASSERT(ret == 0 || ret == ESRCH,
-		    "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d",
-		    vcpu_idx, new_pcpu, ret);
-
-	return ret;
-}
-
-static void *test_vcpu_migration(void *arg)
-{
-	unsigned int i, n_done;
-	bool vcpu_done;
-
-	do {
-		usleep(msecs_to_usecs(test_args.migration_freq_ms));
-
-		for (n_done = 0, i = 0; i < test_args.nr_vcpus; i++) {
-			pthread_mutex_lock(&vcpu_done_map_lock);
-			vcpu_done = test_bit(i, vcpu_done_map);
-			pthread_mutex_unlock(&vcpu_done_map_lock);
-
-			if (vcpu_done) {
-				n_done++;
-				continue;
-			}
-
-			test_migrate_vcpu(i);
-		}
-	} while (test_args.nr_vcpus != n_done);
-
-	return NULL;
-}
-
-static void test_run(struct kvm_vm *vm)
-{
-	pthread_t pt_vcpu_migration;
-	unsigned int i;
-	int ret;
-
-	pthread_mutex_init(&vcpu_done_map_lock, NULL);
-	vcpu_done_map = bitmap_zalloc(test_args.nr_vcpus);
-	TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap");
-
-	for (i = 0; i < (unsigned long)test_args.nr_vcpus; i++) {
-		ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run,
-				     (void *)(unsigned long)i);
-		TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread", i);
-	}
-
-	/* Spawn a thread to control the vCPU migrations */
-	if (test_args.migration_freq_ms) {
-		srand(time(NULL));
-
-		ret = pthread_create(&pt_vcpu_migration, NULL,
-					test_vcpu_migration, NULL);
-		TEST_ASSERT(!ret, "Failed to create the migration pthread");
-	}
-
-
-	for (i = 0; i < test_args.nr_vcpus; i++)
-		pthread_join(pt_vcpu_run[i], NULL);
-
-	if (test_args.migration_freq_ms)
-		pthread_join(pt_vcpu_migration, NULL);
-
-	bitmap_free(vcpu_done_map);
-}
-
 static void test_init_timer_irq(struct kvm_vm *vm)
 {
 	/* Timer initid should be same for all the vCPUs, so query only vCPU-0 */
@@ -369,7 +183,7 @@ static void test_init_timer_irq(struct kvm_vm *vm)
 
 static int gic_fd;
 
-static struct kvm_vm *test_vm_create(void)
+struct kvm_vm *test_vm_create(void)
 {
 	struct kvm_vm *vm;
 	unsigned int i;
@@ -380,10 +194,14 @@ static struct kvm_vm *test_vm_create(void)
 	vm_init_descriptor_tables(vm);
 	vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler);
 
-	if (!test_args.offset.reserved) {
-		if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET))
-			vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &test_args.offset);
-		else
+	if (!test_args.reserved) {
+		if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) {
+			struct kvm_arm_counter_offset offset = {
+				.counter_offset = test_args.counter_offset,
+				.reserved = 0,
+			};
+			vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &offset);
+		} else
 			TEST_FAIL("no support for global offset");
 	}
 
@@ -400,81 +218,8 @@ static struct kvm_vm *test_vm_create(void)
 	return vm;
 }
 
-static void test_vm_cleanup(struct kvm_vm *vm)
+void test_vm_cleanup(struct kvm_vm *vm)
 {
 	close(gic_fd);
 	kvm_vm_free(vm);
 }
-
-static void test_print_help(char *name)
-{
-	pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n",
-		name);
-	pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n",
-		NR_VCPUS_DEF, KVM_MAX_VCPUS);
-	pr_info("\t-i: Number of iterations per stage (default: %u)\n",
-		NR_TEST_ITERS_DEF);
-	pr_info("\t-p: Periodicity (in ms) of the guest timer (default: %u)\n",
-		TIMER_TEST_PERIOD_MS_DEF);
-	pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n",
-		TIMER_TEST_MIGRATION_FREQ_MS);
-	pr_info("\t-o: Counter offset (in counter cycles, default: 0)\n");
-	pr_info("\t-h: print this help screen\n");
-}
-
-static bool parse_args(int argc, char *argv[])
-{
-	int opt;
-
-	while ((opt = getopt(argc, argv, "hn:i:p:m:o:")) != -1) {
-		switch (opt) {
-		case 'n':
-			test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
-			if (test_args.nr_vcpus > KVM_MAX_VCPUS) {
-				pr_info("Max allowed vCPUs: %u\n",
-					KVM_MAX_VCPUS);
-				goto err;
-			}
-			break;
-		case 'i':
-			test_args.nr_iter = atoi_positive("Number of iterations", optarg);
-			break;
-		case 'p':
-			test_args.timer_period_ms = atoi_positive("Periodicity", optarg);
-			break;
-		case 'm':
-			test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
-			break;
-		case 'o':
-			test_args.offset.counter_offset = strtol(optarg, NULL, 0);
-			test_args.offset.reserved = 0;
-			break;
-		case 'h':
-		default:
-			goto err;
-		}
-	}
-
-	return true;
-
-err:
-	test_print_help(argv[0]);
-	return false;
-}
-
-int main(int argc, char *argv[])
-{
-	struct kvm_vm *vm;
-
-	if (!parse_args(argc, argv))
-		exit(KSFT_SKIP);
-
-	__TEST_REQUIRE(!test_args.migration_freq_ms || get_nprocs() >= 2,
-		       "At least two physical CPUs needed for vCPU migration");
-
-	vm = test_vm_create();
-	test_run(vm);
-	test_vm_cleanup(vm);
-
-	return 0;
-}
diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
index 866002917441..2582c49e525a 100644
--- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
+++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
@@ -365,7 +365,7 @@ static void guest_wp_handler(struct ex_regs *regs)
 
 static void guest_ss_handler(struct ex_regs *regs)
 {
-	__GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%u'", ss_idx);
+	__GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%lu'", ss_idx);
 	ss_addr[ss_idx++] = regs->pc;
 	regs->pstate |= SPSR_SS;
 }
diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c
index 27c10e7a7e01..9d192ce0078d 100644
--- a/tools/testing/selftests/kvm/aarch64/hypercalls.c
+++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c
@@ -105,12 +105,12 @@ static void guest_test_hvc(const struct test_hvc_info *hc_info)
 		case TEST_STAGE_HVC_IFACE_FEAT_DISABLED:
 		case TEST_STAGE_HVC_IFACE_FALSE_INFO:
 			__GUEST_ASSERT(res.a0 == SMCCC_RET_NOT_SUPPORTED,
-				       "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%llx, stage = %u",
+				       "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u",
 					res.a0, hc_info->func_id, hc_info->arg1, stage);
 			break;
 		case TEST_STAGE_HVC_IFACE_FEAT_ENABLED:
 			__GUEST_ASSERT(res.a0 != SMCCC_RET_NOT_SUPPORTED,
-				       "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%llx, stage = %u",
+				       "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u",
 					res.a0, hc_info->func_id, hc_info->arg1, stage);
 			break;
 		default:
diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
index 53fddad57cbb..5972905275cf 100644
--- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@ -292,7 +292,7 @@ static void guest_code(struct test_desc *test)
 
 static void no_dabt_handler(struct ex_regs *regs)
 {
-	GUEST_FAIL("Unexpected dabt, far_el1 = 0x%llx", read_sysreg(far_el1));
+	GUEST_FAIL("Unexpected dabt, far_el1 = 0x%lx", read_sysreg(far_el1));
 }
 
 static void no_iabt_handler(struct ex_regs *regs)
diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
index bac05210b539..16e2338686c1 100644
--- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
@@ -32,6 +32,10 @@ struct reg_ftr_bits {
 	enum ftr_type type;
 	uint8_t shift;
 	uint64_t mask;
+	/*
+	 * For FTR_EXACT, safe_val is used as the exact safe value.
+	 * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value.
+	 */
 	int64_t safe_val;
 };
 
@@ -65,13 +69,13 @@ struct test_feature_reg {
 
 static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = {
 	S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0),
-	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, 0),
+	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP),
 	REG_FTR_END,
 };
 
 static const struct reg_ftr_bits ftr_id_dfr0_el1[] = {
-	S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, 0),
-	REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, 0),
+	S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, ID_DFR0_EL1_PerfMon_PMUv3),
+	REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, ID_DFR0_EL1_CopDbg_Armv8),
 	REG_FTR_END,
 };
 
@@ -224,13 +228,13 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
 {
 	uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
 
-	if (ftr_bits->type == FTR_UNSIGNED) {
+	if (ftr_bits->sign == FTR_UNSIGNED) {
 		switch (ftr_bits->type) {
 		case FTR_EXACT:
 			ftr = ftr_bits->safe_val;
 			break;
 		case FTR_LOWER_SAFE:
-			if (ftr > 0)
+			if (ftr > ftr_bits->safe_val)
 				ftr--;
 			break;
 		case FTR_HIGHER_SAFE:
@@ -252,7 +256,7 @@ uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
 			ftr = ftr_bits->safe_val;
 			break;
 		case FTR_LOWER_SAFE:
-			if (ftr > 0)
+			if (ftr > ftr_bits->safe_val)
 				ftr--;
 			break;
 		case FTR_HIGHER_SAFE:
@@ -276,7 +280,7 @@ uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr)
 {
 	uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0);
 
-	if (ftr_bits->type == FTR_UNSIGNED) {
+	if (ftr_bits->sign == FTR_UNSIGNED) {
 		switch (ftr_bits->type) {
 		case FTR_EXACT:
 			ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1);
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c
index eef816b80993..ca917c71ff60 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_init.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c
@@ -84,6 +84,18 @@ static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type,
 	return v;
 }
 
+static struct vm_gic vm_gic_create_barebones(uint32_t gic_dev_type)
+{
+	struct vm_gic v;
+
+	v.gic_dev_type = gic_dev_type;
+	v.vm = vm_create_barebones();
+	v.gic_fd = kvm_create_device(v.vm, gic_dev_type);
+
+	return v;
+}
+
+
 static void vm_gic_destroy(struct vm_gic *v)
 {
 	close(v->gic_fd);
@@ -357,6 +369,40 @@ static void test_vcpus_then_vgic(uint32_t gic_dev_type)
 	vm_gic_destroy(&v);
 }
 
+#define KVM_VGIC_V2_ATTR(offset, cpu) \
+	(FIELD_PREP(KVM_DEV_ARM_VGIC_OFFSET_MASK, offset) | \
+	 FIELD_PREP(KVM_DEV_ARM_VGIC_CPUID_MASK, cpu))
+
+#define GIC_CPU_CTRL	0x00
+
+static void test_v2_uaccess_cpuif_no_vcpus(void)
+{
+	struct vm_gic v;
+	u64 val = 0;
+	int ret;
+
+	v = vm_gic_create_barebones(KVM_DEV_TYPE_ARM_VGIC_V2);
+	subtest_dist_rdist(&v);
+
+	ret = __kvm_has_device_attr(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS,
+				    KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0));
+	TEST_ASSERT(ret && errno == EINVAL,
+		    "accessed non-existent CPU interface, want errno: %i",
+		    EINVAL);
+	ret = __kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS,
+				    KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val);
+	TEST_ASSERT(ret && errno == EINVAL,
+		    "accessed non-existent CPU interface, want errno: %i",
+		    EINVAL);
+	ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS,
+				    KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val);
+	TEST_ASSERT(ret && errno == EINVAL,
+		    "accessed non-existent CPU interface, want errno: %i",
+		    EINVAL);
+
+	vm_gic_destroy(&v);
+}
+
 static void test_v3_new_redist_regions(void)
 {
 	struct kvm_vcpu *vcpus[NR_VCPUS];
@@ -675,6 +721,9 @@ void run_tests(uint32_t gic_dev_type)
 	test_vcpus_then_vgic(gic_dev_type);
 	test_vgic_then_vcpus(gic_dev_type);
 
+	if (VGIC_DEV_IS_V2(gic_dev_type))
+		test_v2_uaccess_cpuif_no_vcpus();
+
 	if (VGIC_DEV_IS_V3(gic_dev_type)) {
 		test_v3_new_redist_regions();
 		test_v3_typer_accesses();
diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
index 5f9713364693..f2fb0e3f14bc 100644
--- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
+++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
@@ -93,22 +93,6 @@ static inline void write_sel_evtyper(int sel, unsigned long val)
 	isb();
 }
 
-static inline void enable_counter(int idx)
-{
-	uint64_t v = read_sysreg(pmcntenset_el0);
-
-	write_sysreg(BIT(idx) | v, pmcntenset_el0);
-	isb();
-}
-
-static inline void disable_counter(int idx)
-{
-	uint64_t v = read_sysreg(pmcntenset_el0);
-
-	write_sysreg(BIT(idx) | v, pmcntenclr_el0);
-	isb();
-}
-
 static void pmu_disable_reset(void)
 {
 	uint64_t pmcr = read_sysreg(pmcr_el0);
@@ -195,11 +179,11 @@ struct pmc_accessor pmc_accessors[] = {
 										 \
 	if (set_expected)							 \
 		__GUEST_ASSERT((_tval & mask),					 \
-				"tval: 0x%lx; mask: 0x%lx; set_expected: 0x%lx", \
+				"tval: 0x%lx; mask: 0x%lx; set_expected: %u",	 \
 				_tval, mask, set_expected);			 \
 	else									 \
 		__GUEST_ASSERT(!(_tval & mask),					 \
-				"tval: 0x%lx; mask: 0x%lx; set_expected: 0x%lx", \
+				"tval: 0x%lx; mask: 0x%lx; set_expected: %u",	 \
 				_tval, mask, set_expected);			 \
 }
 
@@ -286,7 +270,7 @@ static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx)
 	acc->write_typer(pmc_idx, write_data);
 	read_data = acc->read_typer(pmc_idx);
 	__GUEST_ASSERT(read_data == write_data,
-		       "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx",
+		       "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx",
 		       pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data);
 
 	/*
@@ -297,14 +281,14 @@ static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx)
 
 	/* The count value must be 0, as it is disabled and reset */
 	__GUEST_ASSERT(read_data == 0,
-		       "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx",
+		       "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx",
 		       pmc_idx, PMC_ACC_TO_IDX(acc), read_data);
 
 	write_data = read_data + pmc_idx + 0x12345;
 	acc->write_cntr(pmc_idx, write_data);
 	read_data = acc->read_cntr(pmc_idx);
 	__GUEST_ASSERT(read_data == write_data,
-		       "pmc_idx: 0x%lx; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx",
+		       "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx",
 		       pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data);
 }
 
@@ -379,7 +363,7 @@ static void guest_code(uint64_t expected_pmcr_n)
 	int i, pmc;
 
 	__GUEST_ASSERT(expected_pmcr_n <= ARMV8_PMU_MAX_GENERAL_COUNTERS,
-			"Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%lx",
+			"Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%x",
 			expected_pmcr_n, ARMV8_PMU_MAX_GENERAL_COUNTERS);
 
 	pmcr = read_sysreg(pmcr_el0);
diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c
new file mode 100644
index 000000000000..ae1f1a6d8312
--- /dev/null
+++ b/tools/testing/selftests/kvm/arch_timer.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * arch_timer.c - Tests the arch timer IRQ functionality
+ *
+ * The guest's main thread configures the timer interrupt and waits
+ * for it to fire, with a timeout equal to the timer period.
+ * It asserts that the timeout doesn't exceed the timer period plus
+ * a user configurable error margin(default to 100us)
+ *
+ * On the other hand, upon receipt of an interrupt, the guest's interrupt
+ * handler validates the interrupt by checking if the architectural state
+ * is in compliance with the specifications.
+ *
+ * The test provides command-line options to configure the timer's
+ * period (-p), number of vCPUs (-n), iterations per stage (-i) and timer
+ * interrupt arrival error margin (-e). To stress-test the timer stack
+ * even more, an option to migrate the vCPUs across pCPUs (-m), at a
+ * particular rate, is also provided.
+ *
+ * Copyright (c) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <sys/sysinfo.h>
+
+#include "timer_test.h"
+
+struct test_args test_args = {
+	.nr_vcpus = NR_VCPUS_DEF,
+	.nr_iter = NR_TEST_ITERS_DEF,
+	.timer_period_ms = TIMER_TEST_PERIOD_MS_DEF,
+	.migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS,
+	.timer_err_margin_us = TIMER_TEST_ERR_MARGIN_US,
+	.reserved = 1,
+};
+
+struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+struct test_vcpu_shared_data vcpu_shared_data[KVM_MAX_VCPUS];
+
+static pthread_t pt_vcpu_run[KVM_MAX_VCPUS];
+static unsigned long *vcpu_done_map;
+static pthread_mutex_t vcpu_done_map_lock;
+
+static void *test_vcpu_run(void *arg)
+{
+	unsigned int vcpu_idx = (unsigned long)arg;
+	struct ucall uc;
+	struct kvm_vcpu *vcpu = vcpus[vcpu_idx];
+	struct kvm_vm *vm = vcpu->vm;
+	struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx];
+
+	vcpu_run(vcpu);
+
+	/* Currently, any exit from guest is an indication of completion */
+	pthread_mutex_lock(&vcpu_done_map_lock);
+	__set_bit(vcpu_idx, vcpu_done_map);
+	pthread_mutex_unlock(&vcpu_done_map_lock);
+
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_SYNC:
+	case UCALL_DONE:
+		break;
+	case UCALL_ABORT:
+		sync_global_from_guest(vm, *shared_data);
+		fprintf(stderr, "Guest assert failed,  vcpu %u; stage; %u; iter: %u\n",
+			vcpu_idx, shared_data->guest_stage, shared_data->nr_iter);
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	default:
+		TEST_FAIL("Unexpected guest exit");
+	}
+
+	pr_info("PASS(vCPU-%d).\n", vcpu_idx);
+
+	return NULL;
+}
+
+static uint32_t test_get_pcpu(void)
+{
+	uint32_t pcpu;
+	unsigned int nproc_conf;
+	cpu_set_t online_cpuset;
+
+	nproc_conf = get_nprocs_conf();
+	sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset);
+
+	/* Randomly find an available pCPU to place a vCPU on */
+	do {
+		pcpu = rand() % nproc_conf;
+	} while (!CPU_ISSET(pcpu, &online_cpuset));
+
+	return pcpu;
+}
+
+static int test_migrate_vcpu(unsigned int vcpu_idx)
+{
+	int ret;
+	cpu_set_t cpuset;
+	uint32_t new_pcpu = test_get_pcpu();
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(new_pcpu, &cpuset);
+
+	pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu);
+
+	ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx],
+				     sizeof(cpuset), &cpuset);
+
+	/* Allow the error where the vCPU thread is already finished */
+	TEST_ASSERT(ret == 0 || ret == ESRCH,
+		    "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d",
+		    vcpu_idx, new_pcpu, ret);
+
+	return ret;
+}
+
+static void *test_vcpu_migration(void *arg)
+{
+	unsigned int i, n_done;
+	bool vcpu_done;
+
+	do {
+		usleep(msecs_to_usecs(test_args.migration_freq_ms));
+
+		for (n_done = 0, i = 0; i < test_args.nr_vcpus; i++) {
+			pthread_mutex_lock(&vcpu_done_map_lock);
+			vcpu_done = test_bit(i, vcpu_done_map);
+			pthread_mutex_unlock(&vcpu_done_map_lock);
+
+			if (vcpu_done) {
+				n_done++;
+				continue;
+			}
+
+			test_migrate_vcpu(i);
+		}
+	} while (test_args.nr_vcpus != n_done);
+
+	return NULL;
+}
+
+static void test_run(struct kvm_vm *vm)
+{
+	pthread_t pt_vcpu_migration;
+	unsigned int i;
+	int ret;
+
+	pthread_mutex_init(&vcpu_done_map_lock, NULL);
+	vcpu_done_map = bitmap_zalloc(test_args.nr_vcpus);
+	TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap");
+
+	for (i = 0; i < (unsigned long)test_args.nr_vcpus; i++) {
+		ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run,
+				     (void *)(unsigned long)i);
+		TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread", i);
+	}
+
+	/* Spawn a thread to control the vCPU migrations */
+	if (test_args.migration_freq_ms) {
+		srand(time(NULL));
+
+		ret = pthread_create(&pt_vcpu_migration, NULL,
+					test_vcpu_migration, NULL);
+		TEST_ASSERT(!ret, "Failed to create the migration pthread");
+	}
+
+
+	for (i = 0; i < test_args.nr_vcpus; i++)
+		pthread_join(pt_vcpu_run[i], NULL);
+
+	if (test_args.migration_freq_ms)
+		pthread_join(pt_vcpu_migration, NULL);
+
+	bitmap_free(vcpu_done_map);
+}
+
+static void test_print_help(char *name)
+{
+	pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n"
+		"\t\t    [-m migration_freq_ms] [-o counter_offset]\n"
+		"\t\t    [-e timer_err_margin_us]\n", name);
+	pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n",
+		NR_VCPUS_DEF, KVM_MAX_VCPUS);
+	pr_info("\t-i: Number of iterations per stage (default: %u)\n",
+		NR_TEST_ITERS_DEF);
+	pr_info("\t-p: Periodicity (in ms) of the guest timer (default: %u)\n",
+		TIMER_TEST_PERIOD_MS_DEF);
+	pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n",
+		TIMER_TEST_MIGRATION_FREQ_MS);
+	pr_info("\t-o: Counter offset (in counter cycles, default: 0) [aarch64-only]\n");
+	pr_info("\t-e: Interrupt arrival error margin (in us) of the guest timer (default: %u)\n",
+		TIMER_TEST_ERR_MARGIN_US);
+	pr_info("\t-h: print this help screen\n");
+}
+
+static bool parse_args(int argc, char *argv[])
+{
+	int opt;
+
+	while ((opt = getopt(argc, argv, "hn:i:p:m:o:e:")) != -1) {
+		switch (opt) {
+		case 'n':
+			test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
+			if (test_args.nr_vcpus > KVM_MAX_VCPUS) {
+				pr_info("Max allowed vCPUs: %u\n",
+					KVM_MAX_VCPUS);
+				goto err;
+			}
+			break;
+		case 'i':
+			test_args.nr_iter = atoi_positive("Number of iterations", optarg);
+			break;
+		case 'p':
+			test_args.timer_period_ms = atoi_positive("Periodicity", optarg);
+			break;
+		case 'm':
+			test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
+			break;
+		case 'e':
+			test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg);
+			break;
+		case 'o':
+			test_args.counter_offset = strtol(optarg, NULL, 0);
+			test_args.reserved = 0;
+			break;
+		case 'h':
+		default:
+			goto err;
+		}
+	}
+
+	return true;
+
+err:
+	test_print_help(argv[0]);
+	return false;
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+
+	if (!parse_args(argc, argv))
+		exit(KSFT_SKIP);
+
+	__TEST_REQUIRE(!test_args.migration_freq_ms || get_nprocs() >= 2,
+		       "At least two physical CPUs needed for vCPU migration");
+
+	vm = test_vm_create();
+	test_run(vm);
+	test_vm_cleanup(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index c78a98c1a915..92eae206baa6 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -167,6 +167,9 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
 	TEST_ASSERT(ret != -1, "memfd fstat should succeed");
 	TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size");
 	TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers");
+
+	close(fd2);
+	close(fd1);
 }
 
 int main(int argc, char *argv[])
diff --git a/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h
new file mode 100644
index 000000000000..e43a57d99b56
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_UTIL_ARCH_H
+#define SELFTEST_KVM_UTIL_ARCH_H
+
+struct kvm_vm_arch {};
+
+#endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
index cf20e44e86f2..9e518b562827 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -226,8 +226,4 @@ void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1,
 	       uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5,
 	       uint64_t arg6, struct arm_smccc_res *res);
 
-
-
-uint32_t guest_get_vcpuid(void);
-
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_test_harness.h b/tools/testing/selftests/kvm/include/kvm_test_harness.h
new file mode 100644
index 000000000000..8f7c6858e8e2
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/kvm_test_harness.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Macros for defining a KVM test
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_TEST_HARNESS_H
+#define SELFTEST_KVM_TEST_HARNESS_H
+
+#include "kselftest_harness.h"
+
+#define KVM_ONE_VCPU_TEST_SUITE(name)					\
+	FIXTURE(name) {							\
+		struct kvm_vcpu *vcpu;					\
+	};								\
+									\
+	FIXTURE_SETUP(name) {						\
+		(void)vm_create_with_one_vcpu(&self->vcpu, NULL);	\
+	}								\
+									\
+	FIXTURE_TEARDOWN(name) {					\
+		kvm_vm_free(self->vcpu->vm);				\
+	}
+
+#define KVM_ONE_VCPU_TEST(suite, test, guestcode)			\
+static void __suite##_##test(struct kvm_vcpu *vcpu);			\
+									\
+TEST_F(suite, test)							\
+{									\
+	vcpu_arch_set_entry_point(self->vcpu, guestcode);		\
+	__suite##_##test(self->vcpu);					\
+}									\
+static void __suite##_##test(struct kvm_vcpu *vcpu)
+
+#endif /* SELFTEST_KVM_TEST_HARNESS_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 9e5afc472c14..3e0db283a46a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -18,9 +18,11 @@
 #include <linux/types.h>
 
 #include <asm/atomic.h>
+#include <asm/kvm.h>
 
 #include <sys/ioctl.h>
 
+#include "kvm_util_arch.h"
 #include "sparsebit.h"
 
 /*
@@ -46,6 +48,7 @@ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
 struct userspace_mem_region {
 	struct kvm_userspace_memory_region2 region;
 	struct sparsebit *unused_phy_pages;
+	struct sparsebit *protected_phy_pages;
 	int fd;
 	off_t offset;
 	enum vm_mem_backing_src_type backing_src_type;
@@ -90,6 +93,7 @@ enum kvm_mem_region_type {
 struct kvm_vm {
 	int mode;
 	unsigned long type;
+	uint8_t subtype;
 	int kvm_fd;
 	int fd;
 	unsigned int pgtable_levels;
@@ -111,6 +115,9 @@ struct kvm_vm {
 	vm_vaddr_t idt;
 	vm_vaddr_t handlers;
 	uint32_t dirty_ring_size;
+	uint64_t gpa_tag_mask;
+
+	struct kvm_vm_arch arch;
 
 	/* Cache of information for binary stats interface */
 	int stats_fd;
@@ -191,10 +198,14 @@ enum vm_guest_mode {
 };
 
 struct vm_shape {
-	enum vm_guest_mode mode;
-	unsigned int type;
+	uint32_t type;
+	uint8_t  mode;
+	uint8_t  subtype;
+	uint16_t padding;
 };
 
+kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t));
+
 #define VM_TYPE_DEFAULT			0
 
 #define VM_SHAPE(__mode)			\
@@ -259,6 +270,10 @@ bool get_kvm_param_bool(const char *param);
 bool get_kvm_intel_param_bool(const char *param);
 bool get_kvm_amd_param_bool(const char *param);
 
+int get_kvm_param_integer(const char *param);
+int get_kvm_intel_param_integer(const char *param);
+int get_kvm_amd_param_integer(const char *param);
+
 unsigned int kvm_check_cap(long cap);
 
 static inline bool kvm_has_cap(long cap)
@@ -564,6 +579,13 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 		uint64_t guest_paddr, uint32_t slot, uint64_t npages,
 		uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset);
 
+#ifndef vm_arch_has_protected_memory
+static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)
+{
+	return false;
+}
+#endif
+
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
@@ -573,6 +595,9 @@ vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_mi
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
 vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
 			    enum kvm_mem_region_type type);
+vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz,
+				 vm_vaddr_t vaddr_min,
+				 enum kvm_mem_region_type type);
 vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
 vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm,
 				 enum kvm_mem_region_type type);
@@ -585,6 +610,12 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
 void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa);
 
+
+static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+	return gpa & ~vm->gpa_tag_mask;
+}
+
 void vcpu_run(struct kvm_vcpu *vcpu);
 int _vcpu_run(struct kvm_vcpu *vcpu);
 
@@ -827,10 +858,23 @@ const char *exit_reason_str(unsigned int exit_reason);
 
 vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
 			     uint32_t memslot);
-vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
-			      vm_paddr_t paddr_min, uint32_t memslot);
+vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+				vm_paddr_t paddr_min, uint32_t memslot,
+				bool protected);
 vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm);
 
+static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+					    vm_paddr_t paddr_min, uint32_t memslot)
+{
+	/*
+	 * By default, allocate memory as protected for VMs that support
+	 * protected memory, as the majority of memory for such VMs is
+	 * protected, i.e. using shared memory is effectively opt-in.
+	 */
+	return __vm_phy_pages_alloc(vm, num, paddr_min, memslot,
+				    vm_arch_has_protected_memory(vm));
+}
+
 /*
  * ____vm_create() does KVM_CREATE_VM and little else.  __vm_create() also
  * loads the test binary into guest memory and creates an IRQ chip (x86 only).
@@ -969,15 +1013,18 @@ static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu,
  * Input Args:
  *   vm - Virtual Machine
  *   vcpu_id - The id of the VCPU to add to the VM.
- *   guest_code - The vCPU's entry point
  */
-struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
-				  void *guest_code);
+struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
+void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code);
 
 static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
 					   void *guest_code)
 {
-	return vm_arch_vcpu_add(vm, vcpu_id, guest_code);
+	struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id);
+
+	vcpu_arch_set_entry_point(vcpu, guest_code);
+
+	return vcpu;
 }
 
 /* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */
@@ -1081,4 +1128,8 @@ void kvm_selftest_arch_init(void);
 
 void kvm_arch_vm_post_create(struct kvm_vm *vm);
 
+bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr);
+
+uint32_t guest_get_vcpuid(void);
+
 #endif /* SELFTEST_KVM_UTIL_BASE_H */
diff --git a/tools/testing/selftests/kvm/include/riscv/arch_timer.h b/tools/testing/selftests/kvm/include/riscv/arch_timer.h
new file mode 100644
index 000000000000..225d81dad064
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/riscv/arch_timer.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RISC-V Arch Timer(sstc) specific interface
+ *
+ * Copyright (c) 2024 Intel Corporation
+ */
+
+#ifndef SELFTEST_KVM_ARCH_TIMER_H
+#define SELFTEST_KVM_ARCH_TIMER_H
+
+#include <asm/csr.h>
+#include <asm/vdso/processor.h>
+
+static unsigned long timer_freq;
+
+#define msec_to_cycles(msec)	\
+	((timer_freq) * (uint64_t)(msec) / 1000)
+
+#define usec_to_cycles(usec)	\
+	((timer_freq) * (uint64_t)(usec) / 1000000)
+
+#define cycles_to_usec(cycles) \
+	((uint64_t)(cycles) * 1000000 / (timer_freq))
+
+static inline uint64_t timer_get_cycles(void)
+{
+	return csr_read(CSR_TIME);
+}
+
+static inline void timer_set_cmp(uint64_t cval)
+{
+	csr_write(CSR_STIMECMP, cval);
+}
+
+static inline uint64_t timer_get_cmp(void)
+{
+	return csr_read(CSR_STIMECMP);
+}
+
+static inline void timer_irq_enable(void)
+{
+	csr_set(CSR_SIE, IE_TIE);
+}
+
+static inline void timer_irq_disable(void)
+{
+	csr_clear(CSR_SIE, IE_TIE);
+}
+
+static inline void timer_set_next_cmp_ms(uint32_t msec)
+{
+	uint64_t now_ct = timer_get_cycles();
+	uint64_t next_ct = now_ct + msec_to_cycles(msec);
+
+	timer_set_cmp(next_ct);
+}
+
+static inline void __delay(uint64_t cycles)
+{
+	uint64_t start = timer_get_cycles();
+
+	while ((timer_get_cycles() - start) < cycles)
+		cpu_relax();
+}
+
+static inline void udelay(unsigned long usec)
+{
+	__delay(usec_to_cycles(usec));
+}
+
+#endif /* SELFTEST_KVM_ARCH_TIMER_H */
diff --git a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h
new file mode 100644
index 000000000000..e43a57d99b56
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_UTIL_ARCH_H
+#define SELFTEST_KVM_UTIL_ARCH_H
+
+struct kvm_vm_arch {};
+
+#endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h
index a0f9efe5a2a8..ce473fe251dd 100644
--- a/tools/testing/selftests/kvm/include/riscv/processor.h
+++ b/tools/testing/selftests/kvm/include/riscv/processor.h
@@ -7,8 +7,9 @@
 #ifndef SELFTEST_KVM_PROCESSOR_H
 #define SELFTEST_KVM_PROCESSOR_H
 
-#include "kvm_util.h"
 #include <linux/stringify.h>
+#include <asm/csr.h>
+#include "kvm_util.h"
 
 static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype,
 				    uint64_t idx, uint64_t size)
@@ -47,6 +48,58 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype,
 						     KVM_REG_RISCV_SBI_SINGLE,		\
 						     idx, KVM_REG_SIZE_ULONG)
 
+bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext);
+
+struct ex_regs {
+	unsigned long ra;
+	unsigned long sp;
+	unsigned long gp;
+	unsigned long tp;
+	unsigned long t0;
+	unsigned long t1;
+	unsigned long t2;
+	unsigned long s0;
+	unsigned long s1;
+	unsigned long a0;
+	unsigned long a1;
+	unsigned long a2;
+	unsigned long a3;
+	unsigned long a4;
+	unsigned long a5;
+	unsigned long a6;
+	unsigned long a7;
+	unsigned long s2;
+	unsigned long s3;
+	unsigned long s4;
+	unsigned long s5;
+	unsigned long s6;
+	unsigned long s7;
+	unsigned long s8;
+	unsigned long s9;
+	unsigned long s10;
+	unsigned long s11;
+	unsigned long t3;
+	unsigned long t4;
+	unsigned long t5;
+	unsigned long t6;
+	unsigned long epc;
+	unsigned long status;
+	unsigned long cause;
+};
+
+#define NR_VECTORS  2
+#define NR_EXCEPTIONS  32
+#define EC_MASK  (NR_EXCEPTIONS - 1)
+
+typedef void(*exception_handler_fn)(struct ex_regs *);
+
+void vm_init_vector_tables(struct kvm_vm *vm);
+void vcpu_init_vector_tables(struct kvm_vcpu *vcpu);
+
+void vm_install_exception_handler(struct kvm_vm *vm, int vector, exception_handler_fn handler);
+
+void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handler);
+
 /* L3 index Bit[47:39] */
 #define PGTBL_L3_INDEX_MASK			0x0000FF8000000000ULL
 #define PGTBL_L3_INDEX_SHIFT			39
@@ -101,13 +154,6 @@ static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype,
 #define PGTBL_PAGE_SIZE				PGTBL_L0_BLOCK_SIZE
 #define PGTBL_PAGE_SIZE_SHIFT			PGTBL_L0_BLOCK_SHIFT
 
-#define SATP_PPN				_AC(0x00000FFFFFFFFFFF, UL)
-#define SATP_MODE_39				_AC(0x8000000000000000, UL)
-#define SATP_MODE_48				_AC(0x9000000000000000, UL)
-#define SATP_ASID_BITS				16
-#define SATP_ASID_SHIFT				44
-#define SATP_ASID_MASK				_AC(0xFFFF, UL)
-
 /* SBI return error codes */
 #define SBI_SUCCESS				0
 #define SBI_ERR_FAILURE				-1
@@ -147,4 +193,14 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
 
 bool guest_sbi_probe_extension(int extid, long *out_val);
 
+static inline void local_irq_enable(void)
+{
+	csr_set(CSR_SSTATUS, SR_SIE);
+}
+
+static inline void local_irq_disable(void)
+{
+	csr_clear(CSR_SSTATUS, SR_SIE);
+}
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h
new file mode 100644
index 000000000000..e43a57d99b56
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_UTIL_ARCH_H
+#define SELFTEST_KVM_UTIL_ARCH_H
+
+struct kvm_vm_arch {};
+
+#endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h
index 12a9a4b9cead..bc760761e1a3 100644
--- a/tools/testing/selftests/kvm/include/sparsebit.h
+++ b/tools/testing/selftests/kvm/include/sparsebit.h
@@ -30,26 +30,26 @@ typedef uint64_t sparsebit_num_t;
 
 struct sparsebit *sparsebit_alloc(void);
 void sparsebit_free(struct sparsebit **sbitp);
-void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src);
+void sparsebit_copy(struct sparsebit *dstp, const struct sparsebit *src);
 
-bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx);
-bool sparsebit_is_set_num(struct sparsebit *sbit,
+bool sparsebit_is_set(const struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_set_num(const struct sparsebit *sbit,
 			  sparsebit_idx_t idx, sparsebit_num_t num);
-bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx);
-bool sparsebit_is_clear_num(struct sparsebit *sbit,
+bool sparsebit_is_clear(const struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_clear_num(const struct sparsebit *sbit,
 			    sparsebit_idx_t idx, sparsebit_num_t num);
-sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit);
-bool sparsebit_any_set(struct sparsebit *sbit);
-bool sparsebit_any_clear(struct sparsebit *sbit);
-bool sparsebit_all_set(struct sparsebit *sbit);
-bool sparsebit_all_clear(struct sparsebit *sbit);
-sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit);
-sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit);
-sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev);
-sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev);
-sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit,
+sparsebit_num_t sparsebit_num_set(const struct sparsebit *sbit);
+bool sparsebit_any_set(const struct sparsebit *sbit);
+bool sparsebit_any_clear(const struct sparsebit *sbit);
+bool sparsebit_all_set(const struct sparsebit *sbit);
+bool sparsebit_all_clear(const struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_set(const struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_clear(const struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_next_set(const struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_clear(const struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_set_num(const struct sparsebit *sbit,
 				       sparsebit_idx_t start, sparsebit_num_t num);
-sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit,
+sparsebit_idx_t sparsebit_next_clear_num(const struct sparsebit *sbit,
 					 sparsebit_idx_t start, sparsebit_num_t num);
 
 void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx);
@@ -62,9 +62,29 @@ void sparsebit_clear_num(struct sparsebit *sbitp,
 			 sparsebit_idx_t start, sparsebit_num_t num);
 void sparsebit_clear_all(struct sparsebit *sbitp);
 
-void sparsebit_dump(FILE *stream, struct sparsebit *sbit,
+void sparsebit_dump(FILE *stream, const struct sparsebit *sbit,
 		    unsigned int indent);
-void sparsebit_validate_internal(struct sparsebit *sbit);
+void sparsebit_validate_internal(const struct sparsebit *sbit);
+
+/*
+ * Iterate over an inclusive ranges within sparsebit @s. In each iteration,
+ * @range_begin and @range_end will take the beginning and end of the set
+ * range, which are of type sparsebit_idx_t.
+ *
+ * For example, if the range [3, 7] (inclusive) is set, within the
+ * iteration,@range_begin will take the value 3 and @range_end will take
+ * the value 7.
+ *
+ * Ensure that there is at least one bit set before using this macro with
+ * sparsebit_any_set(), because sparsebit_first_set() will abort if none
+ * are set.
+ */
+#define sparsebit_for_each_set_range(s, range_begin, range_end)         \
+	for (range_begin = sparsebit_first_set(s),                      \
+	     range_end = sparsebit_next_clear(s, range_begin) - 1;	\
+	     range_begin && range_end;                                  \
+	     range_begin = sparsebit_next_set(s, range_end),            \
+	     range_end = sparsebit_next_clear(s, range_begin) - 1)
 
 #ifdef __cplusplus
 }
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 50a5e31ba8da..8a6e30612c86 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -20,6 +20,8 @@
 #include <sys/mman.h>
 #include "kselftest.h"
 
+#define msecs_to_usecs(msec)    ((msec) * 1000ULL)
+
 static inline int _no_printf(const char *format, ...) { return 0; }
 
 #ifdef DEBUG
diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h
new file mode 100644
index 000000000000..9b6edaafe6d4
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/timer_test.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * timer test specific header
+ *
+ * Copyright (C) 2018, Google LLC
+ */
+
+#ifndef SELFTEST_KVM_TIMER_TEST_H
+#define SELFTEST_KVM_TIMER_TEST_H
+
+#include "kvm_util.h"
+
+#define NR_VCPUS_DEF            4
+#define NR_TEST_ITERS_DEF       5
+#define TIMER_TEST_PERIOD_MS_DEF    10
+#define TIMER_TEST_ERR_MARGIN_US    100
+#define TIMER_TEST_MIGRATION_FREQ_MS    2
+
+/* Timer test cmdline parameters */
+struct test_args {
+	uint32_t nr_vcpus;
+	uint32_t nr_iter;
+	uint32_t timer_period_ms;
+	uint32_t migration_freq_ms;
+	uint32_t timer_err_margin_us;
+	/* Members of struct kvm_arm_counter_offset */
+	uint64_t counter_offset;
+	uint64_t reserved;
+};
+
+/* Shared variables between host and guest */
+struct test_vcpu_shared_data {
+	uint32_t nr_iter;
+	int guest_stage;
+	uint64_t xcnt;
+};
+
+extern struct test_args test_args;
+extern struct kvm_vcpu *vcpus[];
+extern struct test_vcpu_shared_data vcpu_shared_data[];
+
+struct kvm_vm *test_vm_create(void);
+void test_vm_cleanup(struct kvm_vm *vm);
+
+#endif /* SELFTEST_KVM_TIMER_TEST_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h
new file mode 100644
index 000000000000..9f1725192aa2
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_UTIL_ARCH_H
+#define SELFTEST_KVM_UTIL_ARCH_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+struct kvm_vm_arch {
+	uint64_t c_bit;
+	uint64_t s_bit;
+	int sev_fd;
+	bool is_pt_protected;
+};
+
+static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch)
+{
+	return arch->c_bit || arch->s_bit;
+}
+
+#define vm_arch_has_protected_memory(vm) \
+	__vm_arch_has_protected_memory(&(vm)->arch)
+
+#endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/x86_64/pmu.h b/tools/testing/selftests/kvm/include/x86_64/pmu.h
new file mode 100644
index 000000000000..3c10c4dc0ae8
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/pmu.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023, Tencent, Inc.
+ */
+#ifndef SELFTEST_KVM_PMU_H
+#define SELFTEST_KVM_PMU_H
+
+#include <stdint.h>
+
+#define KVM_PMU_EVENT_FILTER_MAX_EVENTS			300
+
+/*
+ * Encode an eventsel+umask pair into event-select MSR format.  Note, this is
+ * technically AMD's format, as Intel's format only supports 8 bits for the
+ * event selector, i.e. doesn't use bits 24:16 for the selector.  But, OR-ing
+ * in '0' is a nop and won't clobber the CMASK.
+ */
+#define RAW_EVENT(eventsel, umask) (((eventsel & 0xf00UL) << 24) |	\
+				    ((eventsel) & 0xff) |		\
+				    ((umask) & 0xff) << 8)
+
+/*
+ * These are technically Intel's definitions, but except for CMASK (see above),
+ * AMD's layout is compatible with Intel's.
+ */
+#define ARCH_PERFMON_EVENTSEL_EVENT		GENMASK_ULL(7, 0)
+#define ARCH_PERFMON_EVENTSEL_UMASK		GENMASK_ULL(15, 8)
+#define ARCH_PERFMON_EVENTSEL_USR		BIT_ULL(16)
+#define ARCH_PERFMON_EVENTSEL_OS		BIT_ULL(17)
+#define ARCH_PERFMON_EVENTSEL_EDGE		BIT_ULL(18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL	BIT_ULL(19)
+#define ARCH_PERFMON_EVENTSEL_INT		BIT_ULL(20)
+#define ARCH_PERFMON_EVENTSEL_ANY		BIT_ULL(21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE		BIT_ULL(22)
+#define ARCH_PERFMON_EVENTSEL_INV		BIT_ULL(23)
+#define ARCH_PERFMON_EVENTSEL_CMASK		GENMASK_ULL(31, 24)
+
+/* RDPMC control flags, Intel only. */
+#define INTEL_RDPMC_METRICS			BIT_ULL(29)
+#define INTEL_RDPMC_FIXED			BIT_ULL(30)
+#define INTEL_RDPMC_FAST			BIT_ULL(31)
+
+/* Fixed PMC controls, Intel only. */
+#define FIXED_PMC_GLOBAL_CTRL_ENABLE(_idx)	BIT_ULL((32 + (_idx)))
+
+#define FIXED_PMC_KERNEL			BIT_ULL(0)
+#define FIXED_PMC_USER				BIT_ULL(1)
+#define FIXED_PMC_ANYTHREAD			BIT_ULL(2)
+#define FIXED_PMC_ENABLE_PMI			BIT_ULL(3)
+#define FIXED_PMC_NR_BITS			4
+#define FIXED_PMC_CTRL(_idx, _val)		((_val) << ((_idx) * FIXED_PMC_NR_BITS))
+
+#define PMU_CAP_FW_WRITES			BIT_ULL(13)
+#define PMU_CAP_LBR_FMT				0x3f
+
+#define	INTEL_ARCH_CPU_CYCLES			RAW_EVENT(0x3c, 0x00)
+#define	INTEL_ARCH_INSTRUCTIONS_RETIRED		RAW_EVENT(0xc0, 0x00)
+#define	INTEL_ARCH_REFERENCE_CYCLES		RAW_EVENT(0x3c, 0x01)
+#define	INTEL_ARCH_LLC_REFERENCES		RAW_EVENT(0x2e, 0x4f)
+#define	INTEL_ARCH_LLC_MISSES			RAW_EVENT(0x2e, 0x41)
+#define	INTEL_ARCH_BRANCHES_RETIRED		RAW_EVENT(0xc4, 0x00)
+#define	INTEL_ARCH_BRANCHES_MISPREDICTED	RAW_EVENT(0xc5, 0x00)
+#define	INTEL_ARCH_TOPDOWN_SLOTS		RAW_EVENT(0xa4, 0x01)
+
+#define	AMD_ZEN_CORE_CYCLES			RAW_EVENT(0x76, 0x00)
+#define	AMD_ZEN_INSTRUCTIONS_RETIRED		RAW_EVENT(0xc0, 0x00)
+#define	AMD_ZEN_BRANCHES_RETIRED		RAW_EVENT(0xc2, 0x00)
+#define	AMD_ZEN_BRANCHES_MISPREDICTED		RAW_EVENT(0xc3, 0x00)
+
+/*
+ * Note!  The order and thus the index of the architectural events matters as
+ * support for each event is enumerated via CPUID using the index of the event.
+ */
+enum intel_pmu_architectural_events {
+	INTEL_ARCH_CPU_CYCLES_INDEX,
+	INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX,
+	INTEL_ARCH_REFERENCE_CYCLES_INDEX,
+	INTEL_ARCH_LLC_REFERENCES_INDEX,
+	INTEL_ARCH_LLC_MISSES_INDEX,
+	INTEL_ARCH_BRANCHES_RETIRED_INDEX,
+	INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX,
+	INTEL_ARCH_TOPDOWN_SLOTS_INDEX,
+	NR_INTEL_ARCH_EVENTS,
+};
+
+enum amd_pmu_zen_events {
+	AMD_ZEN_CORE_CYCLES_INDEX,
+	AMD_ZEN_INSTRUCTIONS_INDEX,
+	AMD_ZEN_BRANCHES_INDEX,
+	AMD_ZEN_BRANCH_MISSES_INDEX,
+	NR_AMD_ZEN_EVENTS,
+};
+
+extern const uint64_t intel_pmu_arch_events[];
+extern const uint64_t amd_pmu_zen_events[];
+
+#endif /* SELFTEST_KVM_PMU_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 5bca8c947c82..81ce37ec407d 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -23,6 +23,15 @@
 extern bool host_cpu_is_intel;
 extern bool host_cpu_is_amd;
 
+enum vm_guest_x86_subtype {
+	VM_SUBTYPE_NONE = 0,
+	VM_SUBTYPE_SEV,
+	VM_SUBTYPE_SEV_ES,
+};
+
+/* Forced emulation prefix, used to invoke the emulator unconditionally. */
+#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
+
 #define NMI_VECTOR		0x02
 
 #define X86_EFLAGS_FIXED	 (1u << 1)
@@ -273,6 +282,7 @@ struct kvm_x86_cpu_property {
 #define X86_PROPERTY_MAX_EXT_LEAF		KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31)
 #define X86_PROPERTY_MAX_PHY_ADDR		KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7)
 #define X86_PROPERTY_MAX_VIRT_ADDR		KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15)
+#define X86_PROPERTY_SEV_C_BIT			KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5)
 #define X86_PROPERTY_PHYS_ADDR_REDUCTION	KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11)
 
 #define X86_PROPERTY_MAX_CENTAUR_LEAF		KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31)
@@ -282,24 +292,41 @@ struct kvm_x86_cpu_property {
  * that indicates the feature is _not_ supported, and a property that states
  * the length of the bit mask of unsupported features.  A feature is supported
  * if the size of the bit mask is larger than the "unavailable" bit, and said
- * bit is not set.
+ * bit is not set.  Fixed counters also bizarre enumeration, but inverted from
+ * arch events for general purpose counters.  Fixed counters are supported if a
+ * feature flag is set **OR** the total number of fixed counters is greater
+ * than index of the counter.
  *
- * Wrap the "unavailable" feature to simplify checking whether or not a given
- * architectural event is supported.
+ * Wrap the events for general purpose and fixed counters to simplify checking
+ * whether or not a given architectural event is supported.
  */
 struct kvm_x86_pmu_feature {
-	struct kvm_x86_cpu_feature anti_feature;
+	struct kvm_x86_cpu_feature f;
 };
-#define	KVM_X86_PMU_FEATURE(name, __bit)					\
-({										\
-	struct kvm_x86_pmu_feature feature = {					\
-		.anti_feature = KVM_X86_CPU_FEATURE(0xa, 0, EBX, __bit),	\
-	};									\
-										\
-	feature;								\
+#define	KVM_X86_PMU_FEATURE(__reg, __bit)				\
+({									\
+	struct kvm_x86_pmu_feature feature = {				\
+		.f = KVM_X86_CPU_FEATURE(0xa, 0, __reg, __bit),		\
+	};								\
+									\
+	kvm_static_assert(KVM_CPUID_##__reg == KVM_CPUID_EBX ||		\
+			  KVM_CPUID_##__reg == KVM_CPUID_ECX);		\
+	feature;							\
 })
 
-#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED	KVM_X86_PMU_FEATURE(BRANCH_INSNS_RETIRED, 5)
+#define X86_PMU_FEATURE_CPU_CYCLES			KVM_X86_PMU_FEATURE(EBX, 0)
+#define X86_PMU_FEATURE_INSNS_RETIRED			KVM_X86_PMU_FEATURE(EBX, 1)
+#define X86_PMU_FEATURE_REFERENCE_CYCLES		KVM_X86_PMU_FEATURE(EBX, 2)
+#define X86_PMU_FEATURE_LLC_REFERENCES			KVM_X86_PMU_FEATURE(EBX, 3)
+#define X86_PMU_FEATURE_LLC_MISSES			KVM_X86_PMU_FEATURE(EBX, 4)
+#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED		KVM_X86_PMU_FEATURE(EBX, 5)
+#define X86_PMU_FEATURE_BRANCHES_MISPREDICTED		KVM_X86_PMU_FEATURE(EBX, 6)
+#define X86_PMU_FEATURE_TOPDOWN_SLOTS			KVM_X86_PMU_FEATURE(EBX, 7)
+
+#define X86_PMU_FEATURE_INSNS_RETIRED_FIXED		KVM_X86_PMU_FEATURE(ECX, 0)
+#define X86_PMU_FEATURE_CPU_CYCLES_FIXED		KVM_X86_PMU_FEATURE(ECX, 1)
+#define X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED	KVM_X86_PMU_FEATURE(ECX, 2)
+#define X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED		KVM_X86_PMU_FEATURE(ECX, 3)
 
 static inline unsigned int x86_family(unsigned int eax)
 {
@@ -698,10 +725,16 @@ static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property)
 
 static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature)
 {
-	uint32_t nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+	uint32_t nr_bits;
 
-	return nr_bits > feature.anti_feature.bit &&
-	       !this_cpu_has(feature.anti_feature);
+	if (feature.f.reg == KVM_CPUID_EBX) {
+		nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+		return nr_bits > feature.f.bit && !this_cpu_has(feature.f);
+	}
+
+	GUEST_ASSERT(feature.f.reg == KVM_CPUID_ECX);
+	nr_bits = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+	return nr_bits > feature.f.bit || this_cpu_has(feature.f);
 }
 
 static __always_inline uint64_t this_cpu_supported_xcr0(void)
@@ -917,10 +950,16 @@ static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property)
 
 static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature)
 {
-	uint32_t nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+	uint32_t nr_bits;
+
+	if (feature.f.reg == KVM_CPUID_EBX) {
+		nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+		return nr_bits > feature.f.bit && !kvm_cpu_has(feature.f);
+	}
 
-	return nr_bits > feature.anti_feature.bit &&
-	       !kvm_cpu_has(feature.anti_feature);
+	TEST_ASSERT_EQ(feature.f.reg, KVM_CPUID_ECX);
+	nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+	return nr_bits > feature.f.bit || kvm_cpu_has(feature.f);
 }
 
 static __always_inline uint64_t kvm_cpu_supported_xcr0(void)
@@ -995,9 +1034,22 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu)
 	vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid);
 }
 
+void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
+			     struct kvm_x86_cpu_property property,
+			     uint32_t value);
 void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr);
 
 void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function);
+
+static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu,
+				  struct kvm_x86_cpu_feature feature)
+{
+	struct kvm_cpuid_entry2 *entry;
+
+	entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index);
+	return *((&entry->eax) + feature.reg) & BIT(feature.bit);
+}
+
 void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu,
 				     struct kvm_x86_cpu_feature feature,
 				     bool set);
@@ -1059,6 +1111,7 @@ do {											\
 } while (0)
 
 void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
+void kvm_init_vm_address_properties(struct kvm_vm *vm);
 bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
 struct ex_regs {
@@ -1120,16 +1173,19 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector,
  * r9  = exception vector (non-zero)
  * r10 = error code
  */
-#define KVM_ASM_SAFE(insn)					\
+#define __KVM_ASM_SAFE(insn, fep)				\
 	"mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t"	\
 	"lea 1f(%%rip), %%r10\n\t"				\
 	"lea 2f(%%rip), %%r11\n\t"				\
-	"1: " insn "\n\t"					\
+	fep "1: " insn "\n\t"					\
 	"xor %%r9, %%r9\n\t"					\
 	"2:\n\t"						\
 	"mov  %%r9b, %[vector]\n\t"				\
 	"mov  %%r10, %[error_code]\n\t"
 
+#define KVM_ASM_SAFE(insn) __KVM_ASM_SAFE(insn, "")
+#define KVM_ASM_SAFE_FEP(insn) __KVM_ASM_SAFE(insn, KVM_FEP)
+
 #define KVM_ASM_SAFE_OUTPUTS(v, ec)	[vector] "=qm"(v), [error_code] "=rm"(ec)
 #define KVM_ASM_SAFE_CLOBBERS	"r9", "r10", "r11"
 
@@ -1156,21 +1212,58 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector,
 	vector;								\
 })
 
-static inline uint8_t rdmsr_safe(uint32_t msr, uint64_t *val)
-{
-	uint64_t error_code;
-	uint8_t vector;
-	uint32_t a, d;
+#define kvm_asm_safe_fep(insn, inputs...)				\
+({									\
+	uint64_t ign_error_code;					\
+	uint8_t vector;							\
+									\
+	asm volatile(KVM_ASM_SAFE(insn)					\
+		     : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code)	\
+		     : inputs						\
+		     : KVM_ASM_SAFE_CLOBBERS);				\
+	vector;								\
+})
 
-	asm volatile(KVM_ASM_SAFE("rdmsr")
-		     : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector, error_code)
-		     : "c"(msr)
-		     : KVM_ASM_SAFE_CLOBBERS);
+#define kvm_asm_safe_ec_fep(insn, error_code, inputs...)		\
+({									\
+	uint8_t vector;							\
+									\
+	asm volatile(KVM_ASM_SAFE_FEP(insn)				\
+		     : KVM_ASM_SAFE_OUTPUTS(vector, error_code)		\
+		     : inputs						\
+		     : KVM_ASM_SAFE_CLOBBERS);				\
+	vector;								\
+})
 
-	*val = (uint64_t)a | ((uint64_t)d << 32);
-	return vector;
+#define BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP)			\
+static inline uint8_t insn##_safe ##_fep(uint32_t idx, uint64_t *val)	\
+{									\
+	uint64_t error_code;						\
+	uint8_t vector;							\
+	uint32_t a, d;							\
+									\
+	asm volatile(KVM_ASM_SAFE##_FEP(#insn)				\
+		     : "=a"(a), "=d"(d),				\
+		       KVM_ASM_SAFE_OUTPUTS(vector, error_code)		\
+		     : "c"(idx)						\
+		     : KVM_ASM_SAFE_CLOBBERS);				\
+									\
+	*val = (uint64_t)a | ((uint64_t)d << 32);			\
+	return vector;							\
 }
 
+/*
+ * Generate {insn}_safe() and {insn}_safe_fep() helpers for instructions that
+ * use ECX as in input index, and EDX:EAX as a 64-bit output.
+ */
+#define BUILD_READ_U64_SAFE_HELPERS(insn)				\
+	BUILD_READ_U64_SAFE_HELPER(insn, , )				\
+	BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP)			\
+
+BUILD_READ_U64_SAFE_HELPERS(rdmsr)
+BUILD_READ_U64_SAFE_HELPERS(rdpmc)
+BUILD_READ_U64_SAFE_HELPERS(xgetbv)
+
 static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val)
 {
 	return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr));
@@ -1186,6 +1279,16 @@ static inline uint8_t xsetbv_safe(uint32_t index, uint64_t value)
 
 bool kvm_is_tdp_enabled(void);
 
+static inline bool kvm_is_pmu_enabled(void)
+{
+	return get_kvm_param_bool("enable_pmu");
+}
+
+static inline bool kvm_is_forced_emulation_enabled(void)
+{
+	return !!get_kvm_param_integer("force_emulation_prefix");
+}
+
 uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
 				    int *level);
 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr);
diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h
new file mode 100644
index 000000000000..8a1bf88474c9
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/sev.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Helpers used for SEV guests
+ *
+ */
+#ifndef SELFTEST_KVM_SEV_H
+#define SELFTEST_KVM_SEV_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "linux/psp-sev.h"
+
+#include "kvm_util.h"
+#include "svm_util.h"
+#include "processor.h"
+
+enum sev_guest_state {
+	SEV_GUEST_STATE_UNINITIALIZED = 0,
+	SEV_GUEST_STATE_LAUNCH_UPDATE,
+	SEV_GUEST_STATE_LAUNCH_SECRET,
+	SEV_GUEST_STATE_RUNNING,
+};
+
+#define SEV_POLICY_NO_DBG	(1UL << 0)
+#define SEV_POLICY_ES		(1UL << 2)
+
+#define GHCB_MSR_TERM_REQ	0x100
+
+void sev_vm_launch(struct kvm_vm *vm, uint32_t policy);
+void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement);
+void sev_vm_launch_finish(struct kvm_vm *vm);
+
+struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code,
+					   struct kvm_vcpu **cpu);
+
+kvm_static_assert(SEV_RET_SUCCESS == 0);
+
+/*
+ * The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long"
+ * instead of a proper struct.  The size of the parameter is embedded in the
+ * ioctl number, i.e. is ABI and thus immutable.  Hack around the mess by
+ * creating an overlay to pass in an "unsigned long" without a cast (casting
+ * will make the compiler unhappy due to dereferencing an aliased pointer).
+ */
+#define __vm_sev_ioctl(vm, cmd, arg)					\
+({									\
+	int r;								\
+									\
+	union {								\
+		struct kvm_sev_cmd c;					\
+		unsigned long raw;					\
+	} sev_cmd = { .c = {						\
+		.id = (cmd),						\
+		.data = (uint64_t)(arg),				\
+		.sev_fd = (vm)->arch.sev_fd,				\
+	} };								\
+									\
+	r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &sev_cmd.raw);	\
+	r ?: sev_cmd.c.error;						\
+})
+
+#define vm_sev_ioctl(vm, cmd, arg)					\
+({									\
+	int ret = __vm_sev_ioctl(vm, cmd, arg);				\
+									\
+	__TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd,	ret, vm);		\
+})
+
+static inline void sev_vm_init(struct kvm_vm *vm)
+{
+	vm->arch.sev_fd = open_sev_dev_path_or_exit();
+
+	vm_sev_ioctl(vm, KVM_SEV_INIT, NULL);
+}
+
+
+static inline void sev_es_vm_init(struct kvm_vm *vm)
+{
+	vm->arch.sev_fd = open_sev_dev_path_or_exit();
+
+	vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL);
+}
+
+static inline void sev_register_encrypted_memory(struct kvm_vm *vm,
+						 struct userspace_mem_region *region)
+{
+	struct kvm_enc_region range = {
+		.addr = region->region.userspace_addr,
+		.size = region->region.memory_size,
+	};
+
+	vm_ioctl(vm, KVM_MEMORY_ENCRYPT_REG_REGION, &range);
+}
+
+static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa,
+					  uint64_t size)
+{
+	struct kvm_sev_launch_update_data update_data = {
+		.uaddr = (unsigned long)addr_gpa2hva(vm, gpa),
+		.len = size,
+	};
+
+	vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data);
+}
+
+#endif /* SELFTEST_KVM_SEV_H */
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index 43b9a7283360..a9eb17295be4 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -365,8 +365,13 @@ void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
 		indent, "", pstate, pc);
 }
 
-struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
-				  struct kvm_vcpu_init *init, void *guest_code)
+void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
+{
+	vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
+}
+
+static struct kvm_vcpu *__aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
+					   struct kvm_vcpu_init *init)
 {
 	size_t stack_size;
 	uint64_t stack_vaddr;
@@ -381,15 +386,22 @@ struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
 	aarch64_vcpu_setup(vcpu, init);
 
 	vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size);
-	vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
+	return vcpu;
+}
+
+struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
+				  struct kvm_vcpu_init *init, void *guest_code)
+{
+	struct kvm_vcpu *vcpu = __aarch64_vcpu_add(vm, vcpu_id, init);
+
+	vcpu_arch_set_entry_point(vcpu, guest_code);
 
 	return vcpu;
 }
 
-struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
-				  void *guest_code)
+struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 {
-	return aarch64_vcpu_add(vm, vcpu_id, NULL, guest_code);
+	return __aarch64_vcpu_add(vm, vcpu_id, NULL);
 }
 
 void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1b197426f29f..b2262b5fad9e 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -52,13 +52,13 @@ int open_kvm_dev_path_or_exit(void)
 	return _open_kvm_dev_path_or_exit(O_RDONLY);
 }
 
-static bool get_module_param_bool(const char *module_name, const char *param)
+static ssize_t get_module_param(const char *module_name, const char *param,
+				void *buffer, size_t buffer_size)
 {
 	const int path_size = 128;
 	char path[path_size];
-	char value;
-	ssize_t r;
-	int fd;
+	ssize_t bytes_read;
+	int fd, r;
 
 	r = snprintf(path, path_size, "/sys/module/%s/parameters/%s",
 		     module_name, param);
@@ -67,11 +67,46 @@ static bool get_module_param_bool(const char *module_name, const char *param)
 
 	fd = open_path_or_exit(path, O_RDONLY);
 
-	r = read(fd, &value, 1);
-	TEST_ASSERT(r == 1, "read(%s) failed", path);
+	bytes_read = read(fd, buffer, buffer_size);
+	TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes",
+		    path, bytes_read, buffer_size);
 
 	r = close(fd);
 	TEST_ASSERT(!r, "close(%s) failed", path);
+	return bytes_read;
+}
+
+static int get_module_param_integer(const char *module_name, const char *param)
+{
+	/*
+	 * 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the
+	 * NUL char, and 1 byte because the kernel sucks and inserts a newline
+	 * at the end.
+	 */
+	char value[16 + 1 + 1];
+	ssize_t r;
+
+	memset(value, '\0', sizeof(value));
+
+	r = get_module_param(module_name, param, value, sizeof(value));
+	TEST_ASSERT(value[r - 1] == '\n',
+		    "Expected trailing newline, got char '%c'", value[r - 1]);
+
+	/*
+	 * Squash the newline, otherwise atoi_paranoid() will complain about
+	 * trailing non-NUL characters in the string.
+	 */
+	value[r - 1] = '\0';
+	return atoi_paranoid(value);
+}
+
+static bool get_module_param_bool(const char *module_name, const char *param)
+{
+	char value;
+	ssize_t r;
+
+	r = get_module_param(module_name, param, &value, sizeof(value));
+	TEST_ASSERT_EQ(r, 1);
 
 	if (value == 'Y')
 		return true;
@@ -96,6 +131,21 @@ bool get_kvm_amd_param_bool(const char *param)
 	return get_module_param_bool("kvm_amd", param);
 }
 
+int get_kvm_param_integer(const char *param)
+{
+	return get_module_param_integer("kvm", param);
+}
+
+int get_kvm_intel_param_integer(const char *param)
+{
+	return get_module_param_integer("kvm_intel", param);
+}
+
+int get_kvm_amd_param_integer(const char *param)
+{
+	return get_module_param_integer("kvm_amd", param);
+}
+
 /*
  * Capability
  *
@@ -226,6 +276,7 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 
 	vm->mode = shape.mode;
 	vm->type = shape.type;
+	vm->subtype = shape.subtype;
 
 	vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits;
 	vm->va_bits = vm_guest_mode_params[vm->mode].va_bits;
@@ -266,6 +317,7 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 	case VM_MODE_PXXV48_4K:
 #ifdef __x86_64__
 		kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
+		kvm_init_vm_address_properties(vm);
 		/*
 		 * Ignore KVM support for 5-level paging (vm->va_bits == 57),
 		 * it doesn't take effect unless a CR4.LA57 is set, which it
@@ -666,6 +718,7 @@ static void __vm_mem_region_delete(struct kvm_vm *vm,
 	vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
 
 	sparsebit_free(&region->unused_phy_pages);
+	sparsebit_free(&region->protected_phy_pages);
 	ret = munmap(region->mmap_start, region->mmap_size);
 	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
 	if (region->fd >= 0) {
@@ -1047,6 +1100,8 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 	}
 
 	region->unused_phy_pages = sparsebit_alloc();
+	if (vm_arch_has_protected_memory(vm))
+		region->protected_phy_pages = sparsebit_alloc();
 	sparsebit_set_num(region->unused_phy_pages,
 		guest_paddr >> vm->page_shift, npages);
 	region->region.slot = slot;
@@ -1377,15 +1432,17 @@ va_found:
 	return pgidx_start * vm->page_size;
 }
 
-vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
-			    enum kvm_mem_region_type type)
+static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz,
+				     vm_vaddr_t vaddr_min,
+				     enum kvm_mem_region_type type,
+				     bool protected)
 {
 	uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
 
 	virt_pgd_alloc(vm);
-	vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
-					      KVM_UTIL_MIN_PFN * vm->page_size,
-					      vm->memslots[type]);
+	vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages,
+						KVM_UTIL_MIN_PFN * vm->page_size,
+						vm->memslots[type], protected);
 
 	/*
 	 * Find an unused range of virtual page addresses of at least
@@ -1405,6 +1462,20 @@ vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
 	return vaddr_start;
 }
 
+vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+			    enum kvm_mem_region_type type)
+{
+	return ____vm_vaddr_alloc(vm, sz, vaddr_min, type,
+				  vm_arch_has_protected_memory(vm));
+}
+
+vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz,
+				 vm_vaddr_t vaddr_min,
+				 enum kvm_mem_region_type type)
+{
+	return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false);
+}
+
 /*
  * VM Virtual Address Allocate
  *
@@ -1527,6 +1598,8 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
 {
 	struct userspace_mem_region *region;
 
+	gpa = vm_untag_gpa(vm, gpa);
+
 	region = userspace_mem_region_find(vm, gpa, gpa);
 	if (!region) {
 		TEST_FAIL("No vm physical memory at 0x%lx", gpa);
@@ -1873,6 +1946,10 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 			region->host_mem);
 		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
 		sparsebit_dump(stream, region->unused_phy_pages, 0);
+		if (region->protected_phy_pages) {
+			fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, "");
+			sparsebit_dump(stream, region->protected_phy_pages, 0);
+		}
 	}
 	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
 	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
@@ -1974,6 +2051,7 @@ const char *exit_reason_str(unsigned int exit_reason)
  *   num - number of pages
  *   paddr_min - Physical address minimum
  *   memslot - Memory region to allocate page from
+ *   protected - True if the pages will be used as protected/private memory
  *
  * Output Args: None
  *
@@ -1985,8 +2063,9 @@ const char *exit_reason_str(unsigned int exit_reason)
  * and their base address is returned. A TEST_ASSERT failure occurs if
  * not enough pages are available at or above paddr_min.
  */
-vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
-			      vm_paddr_t paddr_min, uint32_t memslot)
+vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+				vm_paddr_t paddr_min, uint32_t memslot,
+				bool protected)
 {
 	struct userspace_mem_region *region;
 	sparsebit_idx_t pg, base;
@@ -1999,8 +2078,10 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
 		paddr_min, vm->page_size);
 
 	region = memslot2region(vm, memslot);
-	base = pg = paddr_min >> vm->page_shift;
+	TEST_ASSERT(!protected || region->protected_phy_pages,
+		    "Region doesn't support protected memory");
 
+	base = pg = paddr_min >> vm->page_shift;
 	do {
 		for (; pg < base + num; ++pg) {
 			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
@@ -2019,8 +2100,11 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
 		abort();
 	}
 
-	for (pg = base; pg < base + num; ++pg)
+	for (pg = base; pg < base + num; ++pg) {
 		sparsebit_clear(region->unused_phy_pages, pg);
+		if (protected)
+			sparsebit_set(region->protected_phy_pages, pg);
+	}
 
 	return base * vm->page_size;
 }
@@ -2224,3 +2308,18 @@ void __attribute((constructor)) kvm_selftest_init(void)
 
 	kvm_selftest_arch_init();
 }
+
+bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr)
+{
+	sparsebit_idx_t pg = 0;
+	struct userspace_mem_region *region;
+
+	if (!vm_arch_has_protected_memory(vm))
+		return false;
+
+	region = userspace_mem_region_find(vm, paddr, paddr);
+	TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr);
+
+	pg = paddr >> vm->page_shift;
+	return sparsebit_is_set(region->protected_phy_pages, pg);
+}
diff --git a/tools/testing/selftests/kvm/lib/riscv/handlers.S b/tools/testing/selftests/kvm/lib/riscv/handlers.S
new file mode 100644
index 000000000000..aa0abd3f35bb
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/riscv/handlers.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Intel Corporation
+ */
+
+#ifndef __ASSEMBLY__
+#define __ASSEMBLY__
+#endif
+
+#include <asm/csr.h>
+
+.macro save_context
+	addi  sp, sp, (-8*34)
+	sd    x1, 0(sp)
+	sd    x2, 8(sp)
+	sd    x3, 16(sp)
+	sd    x4, 24(sp)
+	sd    x5, 32(sp)
+	sd    x6, 40(sp)
+	sd    x7, 48(sp)
+	sd    x8, 56(sp)
+	sd    x9, 64(sp)
+	sd    x10, 72(sp)
+	sd    x11, 80(sp)
+	sd    x12, 88(sp)
+	sd    x13, 96(sp)
+	sd    x14, 104(sp)
+	sd    x15, 112(sp)
+	sd    x16, 120(sp)
+	sd    x17, 128(sp)
+	sd    x18, 136(sp)
+	sd    x19, 144(sp)
+	sd    x20, 152(sp)
+	sd    x21, 160(sp)
+	sd    x22, 168(sp)
+	sd    x23, 176(sp)
+	sd    x24, 184(sp)
+	sd    x25, 192(sp)
+	sd    x26, 200(sp)
+	sd    x27, 208(sp)
+	sd    x28, 216(sp)
+	sd    x29, 224(sp)
+	sd    x30, 232(sp)
+	sd    x31, 240(sp)
+	csrr  s0, CSR_SEPC
+	csrr  s1, CSR_SSTATUS
+	csrr  s2, CSR_SCAUSE
+	sd    s0, 248(sp)
+	sd    s1, 256(sp)
+	sd    s2, 264(sp)
+.endm
+
+.macro restore_context
+	ld    s2, 264(sp)
+	ld    s1, 256(sp)
+	ld    s0, 248(sp)
+	csrw  CSR_SCAUSE, s2
+	csrw  CSR_SSTATUS, s1
+	csrw  CSR_SEPC, s0
+	ld    x31, 240(sp)
+	ld    x30, 232(sp)
+	ld    x29, 224(sp)
+	ld    x28, 216(sp)
+	ld    x27, 208(sp)
+	ld    x26, 200(sp)
+	ld    x25, 192(sp)
+	ld    x24, 184(sp)
+	ld    x23, 176(sp)
+	ld    x22, 168(sp)
+	ld    x21, 160(sp)
+	ld    x20, 152(sp)
+	ld    x19, 144(sp)
+	ld    x18, 136(sp)
+	ld    x17, 128(sp)
+	ld    x16, 120(sp)
+	ld    x15, 112(sp)
+	ld    x14, 104(sp)
+	ld    x13, 96(sp)
+	ld    x12, 88(sp)
+	ld    x11, 80(sp)
+	ld    x10, 72(sp)
+	ld    x9, 64(sp)
+	ld    x8, 56(sp)
+	ld    x7, 48(sp)
+	ld    x6, 40(sp)
+	ld    x5, 32(sp)
+	ld    x4, 24(sp)
+	ld    x3, 16(sp)
+	ld    x2, 8(sp)
+	ld    x1, 0(sp)
+	addi  sp, sp, (8*34)
+.endm
+
+.balign 4
+.global exception_vectors
+exception_vectors:
+	save_context
+	move  a0, sp
+	call  route_exception
+	restore_context
+	sret
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index 2bb33a8ac03c..e8211f5d6863 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -13,6 +13,18 @@
 
 #define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN	0xac0000
 
+static vm_vaddr_t exception_handlers;
+
+bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext)
+{
+	unsigned long value = 0;
+	int ret;
+
+	ret = __vcpu_get_reg(vcpu, ext, &value);
+
+	return !ret && !!value;
+}
+
 static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
 {
 	return (v + vm->page_size) & ~(vm->page_size - 1);
@@ -277,8 +289,12 @@ static void __aligned(16) guest_unexp_trap(void)
 		  0, 0, 0, 0, 0, 0);
 }
 
-struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
-				  void *guest_code)
+void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
+{
+	vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code);
+}
+
+struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 {
 	int r;
 	size_t stack_size;
@@ -312,7 +328,9 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
 
 	/* Setup stack pointer and program counter of guest */
 	vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.sp), stack_vaddr + stack_size);
-	vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code);
+
+	/* Setup sscratch for guest_get_vcpuid() */
+	vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(sscratch), vcpu_id);
 
 	/* Setup default exception vector of guest */
 	vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(stvec), (unsigned long)guest_unexp_trap);
@@ -364,8 +382,80 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
 	va_end(ap);
 }
 
+void kvm_exit_unexpected_exception(int vector, int ec)
+{
+	ucall(UCALL_UNHANDLED, 2, vector, ec);
+}
+
 void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
 {
+	struct ucall uc;
+
+	if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) {
+		TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)",
+			uc.args[0], uc.args[1]);
+	}
+}
+
+struct handlers {
+	exception_handler_fn exception_handlers[NR_VECTORS][NR_EXCEPTIONS];
+};
+
+void route_exception(struct ex_regs *regs)
+{
+	struct handlers *handlers = (struct handlers *)exception_handlers;
+	int vector = 0, ec;
+
+	ec = regs->cause & ~CAUSE_IRQ_FLAG;
+	if (ec >= NR_EXCEPTIONS)
+		goto unexpected_exception;
+
+	/* Use the same handler for all the interrupts */
+	if (regs->cause & CAUSE_IRQ_FLAG) {
+		vector = 1;
+		ec = 0;
+	}
+
+	if (handlers && handlers->exception_handlers[vector][ec])
+		return handlers->exception_handlers[vector][ec](regs);
+
+unexpected_exception:
+	return kvm_exit_unexpected_exception(vector, ec);
+}
+
+void vcpu_init_vector_tables(struct kvm_vcpu *vcpu)
+{
+	extern char exception_vectors;
+
+	vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(stvec), (unsigned long)&exception_vectors);
+}
+
+void vm_init_vector_tables(struct kvm_vm *vm)
+{
+	vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers),
+				   vm->page_size, MEM_REGION_DATA);
+
+	*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
+}
+
+void vm_install_exception_handler(struct kvm_vm *vm, int vector, exception_handler_fn handler)
+{
+	struct handlers *handlers = addr_gva2hva(vm, vm->handlers);
+
+	assert(vector < NR_EXCEPTIONS);
+	handlers->exception_handlers[0][vector] = handler;
+}
+
+void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handler)
+{
+	struct handlers *handlers = addr_gva2hva(vm, vm->handlers);
+
+	handlers->exception_handlers[1][0] = handler;
+}
+
+uint32_t guest_get_vcpuid(void)
+{
+	return csr_read(CSR_SSCRATCH);
 }
 
 struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c
index f6d227892cbc..4ad4492eea1d 100644
--- a/tools/testing/selftests/kvm/lib/s390x/processor.c
+++ b/tools/testing/selftests/kvm/lib/s390x/processor.c
@@ -155,15 +155,18 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	virt_dump_region(stream, vm, indent, vm->pgd);
 }
 
-struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
-				  void *guest_code)
+void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
+{
+	vcpu->run->psw_addr = (uintptr_t)guest_code;
+}
+
+struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 {
 	size_t stack_size =  DEFAULT_STACK_PGS * getpagesize();
 	uint64_t stack_vaddr;
 	struct kvm_regs regs;
 	struct kvm_sregs sregs;
 	struct kvm_vcpu *vcpu;
-	struct kvm_run *run;
 
 	TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
 		    vm->page_size);
@@ -184,9 +187,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
 	sregs.crs[1] = vm->pgd | 0xf;		/* Primary region table */
 	vcpu_sregs_set(vcpu, &sregs);
 
-	run = vcpu->run;
-	run->psw_mask = 0x0400000180000000ULL;  /* DAT enabled + 64 bit mode */
-	run->psw_addr = (uintptr_t)guest_code;
+	vcpu->run->psw_mask = 0x0400000180000000ULL;  /* DAT enabled + 64 bit mode */
 
 	return vcpu;
 }
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
index 88cb6b84e6f3..cfed9d26cc71 100644
--- a/tools/testing/selftests/kvm/lib/sparsebit.c
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -202,7 +202,7 @@ static sparsebit_num_t node_num_set(struct node *nodep)
 /* Returns a pointer to the node that describes the
  * lowest bit index.
  */
-static struct node *node_first(struct sparsebit *s)
+static struct node *node_first(const struct sparsebit *s)
 {
 	struct node *nodep;
 
@@ -216,7 +216,7 @@ static struct node *node_first(struct sparsebit *s)
  * lowest bit index > the index of the node pointed to by np.
  * Returns NULL if no node with a higher index exists.
  */
-static struct node *node_next(struct sparsebit *s, struct node *np)
+static struct node *node_next(const struct sparsebit *s, struct node *np)
 {
 	struct node *nodep = np;
 
@@ -244,7 +244,7 @@ static struct node *node_next(struct sparsebit *s, struct node *np)
  * highest index < the index of the node pointed to by np.
  * Returns NULL if no node with a lower index exists.
  */
-static struct node *node_prev(struct sparsebit *s, struct node *np)
+static struct node *node_prev(const struct sparsebit *s, struct node *np)
 {
 	struct node *nodep = np;
 
@@ -273,7 +273,7 @@ static struct node *node_prev(struct sparsebit *s, struct node *np)
  * subtree and duplicates the bit settings to the newly allocated nodes.
  * Returns the newly allocated copy of subtree.
  */
-static struct node *node_copy_subtree(struct node *subtree)
+static struct node *node_copy_subtree(const struct node *subtree)
 {
 	struct node *root;
 
@@ -307,7 +307,7 @@ static struct node *node_copy_subtree(struct node *subtree)
  * index is within the bits described by the mask bits or the number of
  * contiguous bits set after the mask.  Returns NULL if there is no such node.
  */
-static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx)
+static struct node *node_find(const struct sparsebit *s, sparsebit_idx_t idx)
 {
 	struct node *nodep;
 
@@ -393,7 +393,7 @@ static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx)
 }
 
 /* Returns whether all the bits in the sparsebit array are set.  */
-bool sparsebit_all_set(struct sparsebit *s)
+bool sparsebit_all_set(const struct sparsebit *s)
 {
 	/*
 	 * If any nodes there must be at least one bit set.  Only case
@@ -775,7 +775,7 @@ static void node_reduce(struct sparsebit *s, struct node *nodep)
 /* Returns whether the bit at the index given by idx, within the
  * sparsebit array is set or not.
  */
-bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx)
+bool sparsebit_is_set(const struct sparsebit *s, sparsebit_idx_t idx)
 {
 	struct node *nodep;
 
@@ -921,7 +921,7 @@ static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start)
  * used by test cases after they detect an unexpected condition, as a means
  * to capture diagnostic information.
  */
-static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s,
+static void sparsebit_dump_internal(FILE *stream, const struct sparsebit *s,
 	unsigned int indent)
 {
 	/* Dump the contents of s */
@@ -969,7 +969,7 @@ void sparsebit_free(struct sparsebit **sbitp)
  * sparsebit_alloc().  It can though already have bits set, which
  * if different from src will be cleared.
  */
-void sparsebit_copy(struct sparsebit *d, struct sparsebit *s)
+void sparsebit_copy(struct sparsebit *d, const struct sparsebit *s)
 {
 	/* First clear any bits already set in the destination */
 	sparsebit_clear_all(d);
@@ -981,7 +981,7 @@ void sparsebit_copy(struct sparsebit *d, struct sparsebit *s)
 }
 
 /* Returns whether num consecutive bits starting at idx are all set.  */
-bool sparsebit_is_set_num(struct sparsebit *s,
+bool sparsebit_is_set_num(const struct sparsebit *s,
 	sparsebit_idx_t idx, sparsebit_num_t num)
 {
 	sparsebit_idx_t next_cleared;
@@ -1005,14 +1005,14 @@ bool sparsebit_is_set_num(struct sparsebit *s,
 }
 
 /* Returns whether the bit at the index given by idx.  */
-bool sparsebit_is_clear(struct sparsebit *s,
+bool sparsebit_is_clear(const struct sparsebit *s,
 	sparsebit_idx_t idx)
 {
 	return !sparsebit_is_set(s, idx);
 }
 
 /* Returns whether num consecutive bits starting at idx are all cleared.  */
-bool sparsebit_is_clear_num(struct sparsebit *s,
+bool sparsebit_is_clear_num(const struct sparsebit *s,
 	sparsebit_idx_t idx, sparsebit_num_t num)
 {
 	sparsebit_idx_t next_set;
@@ -1041,13 +1041,13 @@ bool sparsebit_is_clear_num(struct sparsebit *s,
  * value.  Use sparsebit_any_set(), instead of sparsebit_num_set() > 0,
  * to determine if the sparsebit array has any bits set.
  */
-sparsebit_num_t sparsebit_num_set(struct sparsebit *s)
+sparsebit_num_t sparsebit_num_set(const struct sparsebit *s)
 {
 	return s->num_set;
 }
 
 /* Returns whether any bit is set in the sparsebit array.  */
-bool sparsebit_any_set(struct sparsebit *s)
+bool sparsebit_any_set(const struct sparsebit *s)
 {
 	/*
 	 * Nodes only describe set bits.  If any nodes then there
@@ -1070,20 +1070,20 @@ bool sparsebit_any_set(struct sparsebit *s)
 }
 
 /* Returns whether all the bits in the sparsebit array are cleared.  */
-bool sparsebit_all_clear(struct sparsebit *s)
+bool sparsebit_all_clear(const struct sparsebit *s)
 {
 	return !sparsebit_any_set(s);
 }
 
 /* Returns whether all the bits in the sparsebit array are set.  */
-bool sparsebit_any_clear(struct sparsebit *s)
+bool sparsebit_any_clear(const struct sparsebit *s)
 {
 	return !sparsebit_all_set(s);
 }
 
 /* Returns the index of the first set bit.  Abort if no bits are set.
  */
-sparsebit_idx_t sparsebit_first_set(struct sparsebit *s)
+sparsebit_idx_t sparsebit_first_set(const struct sparsebit *s)
 {
 	struct node *nodep;
 
@@ -1097,7 +1097,7 @@ sparsebit_idx_t sparsebit_first_set(struct sparsebit *s)
 /* Returns the index of the first cleared bit.  Abort if
  * no bits are cleared.
  */
-sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s)
+sparsebit_idx_t sparsebit_first_clear(const struct sparsebit *s)
 {
 	struct node *nodep1, *nodep2;
 
@@ -1151,7 +1151,7 @@ sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s)
 /* Returns index of next bit set within s after the index given by prev.
  * Returns 0 if there are no bits after prev that are set.
  */
-sparsebit_idx_t sparsebit_next_set(struct sparsebit *s,
+sparsebit_idx_t sparsebit_next_set(const struct sparsebit *s,
 	sparsebit_idx_t prev)
 {
 	sparsebit_idx_t lowest_possible = prev + 1;
@@ -1244,7 +1244,7 @@ sparsebit_idx_t sparsebit_next_set(struct sparsebit *s,
 /* Returns index of next bit cleared within s after the index given by prev.
  * Returns 0 if there are no bits after prev that are cleared.
  */
-sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s,
+sparsebit_idx_t sparsebit_next_clear(const struct sparsebit *s,
 	sparsebit_idx_t prev)
 {
 	sparsebit_idx_t lowest_possible = prev + 1;
@@ -1300,7 +1300,7 @@ sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s,
  * and returns the index of the first sequence of num consecutively set
  * bits.  Returns a value of 0 of no such sequence exists.
  */
-sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s,
+sparsebit_idx_t sparsebit_next_set_num(const struct sparsebit *s,
 	sparsebit_idx_t start, sparsebit_num_t num)
 {
 	sparsebit_idx_t idx;
@@ -1335,7 +1335,7 @@ sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s,
  * and returns the index of the first sequence of num consecutively cleared
  * bits.  Returns a value of 0 of no such sequence exists.
  */
-sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s,
+sparsebit_idx_t sparsebit_next_clear_num(const struct sparsebit *s,
 	sparsebit_idx_t start, sparsebit_num_t num)
 {
 	sparsebit_idx_t idx;
@@ -1583,7 +1583,7 @@ static size_t display_range(FILE *stream, sparsebit_idx_t low,
  * contiguous bits.  This is done because '-' is used to specify command-line
  * options, and sometimes ranges are specified as command-line arguments.
  */
-void sparsebit_dump(FILE *stream, struct sparsebit *s,
+void sparsebit_dump(FILE *stream, const struct sparsebit *s,
 	unsigned int indent)
 {
 	size_t current_line_len = 0;
@@ -1681,7 +1681,7 @@ void sparsebit_dump(FILE *stream, struct sparsebit *s,
  * s.  On error, diagnostic information is printed to stderr and
  * abort is called.
  */
-void sparsebit_validate_internal(struct sparsebit *s)
+void sparsebit_validate_internal(const struct sparsebit *s)
 {
 	bool error_detected = false;
 	struct node *nodep, *prev = NULL;
diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c
index 816a3fa109bf..f5af65a41c29 100644
--- a/tools/testing/selftests/kvm/lib/ucall_common.c
+++ b/tools/testing/selftests/kvm/lib/ucall_common.c
@@ -29,7 +29,8 @@ void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
 	vm_vaddr_t vaddr;
 	int i;
 
-	vaddr = __vm_vaddr_alloc(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR, MEM_REGION_DATA);
+	vaddr = vm_vaddr_alloc_shared(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR,
+				      MEM_REGION_DATA);
 	hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr);
 	memset(hdr, 0, sizeof(*hdr));
 
diff --git a/tools/testing/selftests/kvm/lib/x86_64/pmu.c b/tools/testing/selftests/kvm/lib/x86_64/pmu.c
new file mode 100644
index 000000000000..f31f0427c17c
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/pmu.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023, Tencent, Inc.
+ */
+
+#include <stdint.h>
+
+#include <linux/kernel.h>
+
+#include "kvm_util.h"
+#include "pmu.h"
+
+const uint64_t intel_pmu_arch_events[] = {
+	INTEL_ARCH_CPU_CYCLES,
+	INTEL_ARCH_INSTRUCTIONS_RETIRED,
+	INTEL_ARCH_REFERENCE_CYCLES,
+	INTEL_ARCH_LLC_REFERENCES,
+	INTEL_ARCH_LLC_MISSES,
+	INTEL_ARCH_BRANCHES_RETIRED,
+	INTEL_ARCH_BRANCHES_MISPREDICTED,
+	INTEL_ARCH_TOPDOWN_SLOTS,
+};
+kvm_static_assert(ARRAY_SIZE(intel_pmu_arch_events) == NR_INTEL_ARCH_EVENTS);
+
+const uint64_t amd_pmu_zen_events[] = {
+	AMD_ZEN_CORE_CYCLES,
+	AMD_ZEN_INSTRUCTIONS_RETIRED,
+	AMD_ZEN_BRANCHES_RETIRED,
+	AMD_ZEN_BRANCHES_MISPREDICTED,
+};
+kvm_static_assert(ARRAY_SIZE(amd_pmu_zen_events) == NR_AMD_ZEN_EVENTS);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index f639b3e062e3..74a4c736c9ae 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -9,6 +9,7 @@
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
+#include "sev.h"
 
 #ifndef NUM_INTERRUPTS
 #define NUM_INTERRUPTS 256
@@ -157,6 +158,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 {
 	uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);
 
+	paddr = vm_untag_gpa(vm, paddr);
+
 	if (!(*pte & PTE_PRESENT_MASK)) {
 		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
 		if (current_level == target_level)
@@ -200,6 +203,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 		    "Physical address beyond maximum supported,\n"
 		    "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
 		    paddr, vm->max_gfn, vm->page_size);
+	TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr,
+		    "Unexpected bits in paddr: %lx", paddr);
 
 	/*
 	 * Allocate upper level page tables, if not already present.  Return
@@ -222,6 +227,15 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
 		    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
 	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
+
+	/*
+	 * Neither SEV nor TDX supports shared page tables, so only the final
+	 * leaf PTE needs manually set the C/S-bit.
+	 */
+	if (vm_is_gpa_protected(vm, paddr))
+		*pte |= vm->arch.c_bit;
+	else
+		*pte |= vm->arch.s_bit;
 }
 
 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -265,6 +279,9 @@ uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
 {
 	uint64_t *pml4e, *pdpe, *pde;
 
+	TEST_ASSERT(!vm->arch.is_pt_protected,
+		    "Walking page tables of protected guests is impossible");
+
 	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM,
 		    "Invalid PG_LEVEL_* '%d'", *level);
 
@@ -496,7 +513,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 	 * No need for a hugepage mask on the PTE, x86-64 requires the "unused"
 	 * address bits to be zero.
 	 */
-	return PTE_GET_PA(*pte) | (gva & ~HUGEPAGE_MASK(level));
+	return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level));
 }
 
 static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt)
@@ -560,10 +577,23 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm)
 	vm_create_irqchip(vm);
 	sync_global_to_guest(vm, host_cpu_is_intel);
 	sync_global_to_guest(vm, host_cpu_is_amd);
+
+	if (vm->subtype == VM_SUBTYPE_SEV)
+		sev_vm_init(vm);
+	else if (vm->subtype == VM_SUBTYPE_SEV_ES)
+		sev_es_vm_init(vm);
+}
+
+void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
+{
+	struct kvm_regs regs;
+
+	vcpu_regs_get(vcpu, &regs);
+	regs.rip = (unsigned long) guest_code;
+	vcpu_regs_set(vcpu, &regs);
 }
 
-struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
-				  void *guest_code)
+struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 {
 	struct kvm_mp_state mp_state;
 	struct kvm_regs regs;
@@ -597,7 +627,6 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
 	vcpu_regs_get(vcpu, &regs);
 	regs.rflags = regs.rflags | 0x2;
 	regs.rsp = stack_vaddr;
-	regs.rip = (unsigned long) guest_code;
 	vcpu_regs_set(vcpu, &regs);
 
 	/* Setup the MP state */
@@ -752,12 +781,21 @@ void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid)
 	vcpu_set_cpuid(vcpu);
 }
 
-void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr)
+void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
+			     struct kvm_x86_cpu_property property,
+			     uint32_t value)
 {
-	struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, 0x80000008);
+	struct kvm_cpuid_entry2 *entry;
+
+	entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index);
+
+	(&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit);
+	(&entry->eax)[property.reg] |= value << property.lo_bit;
 
-	entry->eax = (entry->eax & ~0xff) | maxphyaddr;
 	vcpu_set_cpuid(vcpu);
+
+	/* Sanity check that @value doesn't exceed the bounds in any way. */
+	TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value);
 }
 
 void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function)
@@ -1041,6 +1079,14 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
 	}
 }
 
+void kvm_init_vm_address_properties(struct kvm_vm *vm)
+{
+	if (vm->subtype == VM_SUBTYPE_SEV || vm->subtype == VM_SUBTYPE_SEV_ES) {
+		vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
+		vm->gpa_tag_mask = vm->arch.c_bit;
+	}
+}
+
 static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
 			  int dpl, unsigned short selector)
 {
diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c
new file mode 100644
index 000000000000..e248d3364b9c
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "sev.h"
+
+/*
+ * sparsebit_next_clear() can return 0 if [x, 2**64-1] are all set, and the
+ * -1 would then cause an underflow back to 2**64 - 1. This is expected and
+ * correct.
+ *
+ * If the last range in the sparsebit is [x, y] and we try to iterate,
+ * sparsebit_next_set() will return 0, and sparsebit_next_clear() will try
+ * and find the first range, but that's correct because the condition
+ * expression would cause us to quit the loop.
+ */
+static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region)
+{
+	const struct sparsebit *protected_phy_pages = region->protected_phy_pages;
+	const vm_paddr_t gpa_base = region->region.guest_phys_addr;
+	const sparsebit_idx_t lowest_page_in_region = gpa_base >> vm->page_shift;
+	sparsebit_idx_t i, j;
+
+	if (!sparsebit_any_set(protected_phy_pages))
+		return;
+
+	sev_register_encrypted_memory(vm, region);
+
+	sparsebit_for_each_set_range(protected_phy_pages, i, j) {
+		const uint64_t size = (j - i + 1) * vm->page_size;
+		const uint64_t offset = (i - lowest_page_in_region) * vm->page_size;
+
+		sev_launch_update_data(vm, gpa_base + offset, size);
+	}
+}
+
+void sev_vm_launch(struct kvm_vm *vm, uint32_t policy)
+{
+	struct kvm_sev_launch_start launch_start = {
+		.policy = policy,
+	};
+	struct userspace_mem_region *region;
+	struct kvm_sev_guest_status status;
+	int ctr;
+
+	vm_sev_ioctl(vm, KVM_SEV_LAUNCH_START, &launch_start);
+	vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status);
+
+	TEST_ASSERT_EQ(status.policy, policy);
+	TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE);
+
+	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node)
+		encrypt_region(vm, region);
+
+	if (policy & SEV_POLICY_ES)
+		vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+
+	vm->arch.is_pt_protected = true;
+}
+
+void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement)
+{
+	struct kvm_sev_launch_measure launch_measure;
+	struct kvm_sev_guest_status guest_status;
+
+	launch_measure.len = 256;
+	launch_measure.uaddr = (__u64)measurement;
+	vm_sev_ioctl(vm, KVM_SEV_LAUNCH_MEASURE, &launch_measure);
+
+	vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &guest_status);
+	TEST_ASSERT_EQ(guest_status.state, SEV_GUEST_STATE_LAUNCH_SECRET);
+}
+
+void sev_vm_launch_finish(struct kvm_vm *vm)
+{
+	struct kvm_sev_guest_status status;
+
+	vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status);
+	TEST_ASSERT(status.state == SEV_GUEST_STATE_LAUNCH_UPDATE ||
+		    status.state == SEV_GUEST_STATE_LAUNCH_SECRET,
+		    "Unexpected guest state: %d", status.state);
+
+	vm_sev_ioctl(vm, KVM_SEV_LAUNCH_FINISH, NULL);
+
+	vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status);
+	TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING);
+}
+
+struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t policy, void *guest_code,
+					   struct kvm_vcpu **cpu)
+{
+	struct vm_shape shape = {
+		.type = VM_TYPE_DEFAULT,
+		.mode = VM_MODE_DEFAULT,
+		.subtype = policy & SEV_POLICY_ES ? VM_SUBTYPE_SEV_ES :
+						    VM_SUBTYPE_SEV,
+	};
+	struct kvm_vm *vm;
+	struct kvm_vcpu *cpus[1];
+	uint8_t measurement[512];
+
+	vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus);
+	*cpu = cpus[0];
+
+	sev_vm_launch(vm, policy);
+
+	/* TODO: Validate the measurement is as expected. */
+	sev_vm_launch_measure(vm, measurement);
+
+	sev_vm_launch_finish(vm);
+
+	return vm;
+}
diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c
index 6628dc4dda89..1a6da7389bf1 100644
--- a/tools/testing/selftests/kvm/max_guest_memory_test.c
+++ b/tools/testing/selftests/kvm/max_guest_memory_test.c
@@ -22,10 +22,11 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 {
 	uint64_t gpa;
 
-	for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
-		*((volatile uint64_t *)gpa) = gpa;
-
-	GUEST_DONE();
+	for (;;) {
+		for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
+			*((volatile uint64_t *)gpa) = gpa;
+		GUEST_SYNC(0);
+	}
 }
 
 struct vcpu_info {
@@ -55,7 +56,7 @@ static void rendezvous_with_boss(void)
 static void run_vcpu(struct kvm_vcpu *vcpu)
 {
 	vcpu_run(vcpu);
-	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
 }
 
 static void *vcpu_worker(void *data)
@@ -64,17 +65,13 @@ static void *vcpu_worker(void *data)
 	struct kvm_vcpu *vcpu = info->vcpu;
 	struct kvm_vm *vm = vcpu->vm;
 	struct kvm_sregs sregs;
-	struct kvm_regs regs;
 
 	vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size);
 
-	/* Snapshot regs before the first run. */
-	vcpu_regs_get(vcpu, &regs);
 	rendezvous_with_boss();
 
 	run_vcpu(vcpu);
 	rendezvous_with_boss();
-	vcpu_regs_set(vcpu, &regs);
 	vcpu_sregs_get(vcpu, &sregs);
 #ifdef __x86_64__
 	/* Toggle CR0.WP to trigger a MMU context reset. */
diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c
new file mode 100644
index 000000000000..0f9cabd99fd4
--- /dev/null
+++ b/tools/testing/selftests/kvm/riscv/arch_timer.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * arch_timer.c - Tests the riscv64 sstc timer IRQ functionality
+ *
+ * The test validates the sstc timer IRQs using vstimecmp registers.
+ * It's ported from the aarch64 arch_timer test.
+ *
+ * Copyright (c) 2024, Intel Corporation.
+ */
+
+#define _GNU_SOURCE
+
+#include "arch_timer.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "timer_test.h"
+
+static int timer_irq = IRQ_S_TIMER;
+
+static void guest_irq_handler(struct ex_regs *regs)
+{
+	uint64_t xcnt, xcnt_diff_us, cmp;
+	unsigned int intid = regs->cause & ~CAUSE_IRQ_FLAG;
+	uint32_t cpu = guest_get_vcpuid();
+	struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu];
+
+	timer_irq_disable();
+
+	xcnt = timer_get_cycles();
+	cmp = timer_get_cmp();
+	xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt);
+
+	/* Make sure we are dealing with the correct timer IRQ */
+	GUEST_ASSERT_EQ(intid, timer_irq);
+
+	__GUEST_ASSERT(xcnt >= cmp,
+			"xcnt = 0x%"PRIx64", cmp = 0x%"PRIx64", xcnt_diff_us = 0x%" PRIx64,
+			xcnt, cmp, xcnt_diff_us);
+
+	WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1);
+}
+
+static void guest_run(struct test_vcpu_shared_data *shared_data)
+{
+	uint32_t irq_iter, config_iter;
+
+	shared_data->nr_iter = 0;
+	shared_data->guest_stage = 0;
+
+	for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) {
+		/* Setup the next interrupt */
+		timer_set_next_cmp_ms(test_args.timer_period_ms);
+		shared_data->xcnt = timer_get_cycles();
+		timer_irq_enable();
+
+		/* Setup a timeout for the interrupt to arrive */
+		udelay(msecs_to_usecs(test_args.timer_period_ms) +
+			test_args.timer_err_margin_us);
+
+		irq_iter = READ_ONCE(shared_data->nr_iter);
+		__GUEST_ASSERT(config_iter + 1 == irq_iter,
+				"config_iter + 1 = 0x%x, irq_iter = 0x%x.\n"
+				"  Guest timer interrupt was not triggered within the specified\n"
+				"  interval, try to increase the error margin by [-e] option.\n",
+				config_iter + 1, irq_iter);
+	}
+}
+
+static void guest_code(void)
+{
+	uint32_t cpu = guest_get_vcpuid();
+	struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu];
+
+	timer_irq_disable();
+	local_irq_enable();
+
+	guest_run(shared_data);
+
+	GUEST_DONE();
+}
+
+struct kvm_vm *test_vm_create(void)
+{
+	struct kvm_vm *vm;
+	int nr_vcpus = test_args.nr_vcpus;
+
+	vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);
+	__TEST_REQUIRE(__vcpu_has_ext(vcpus[0], RISCV_ISA_EXT_REG(KVM_RISCV_ISA_EXT_SSTC)),
+				   "SSTC not available, skipping test\n");
+
+	vm_init_vector_tables(vm);
+	vm_install_interrupt_handler(vm, guest_irq_handler);
+
+	for (int i = 0; i < nr_vcpus; i++)
+		vcpu_init_vector_tables(vcpus[i]);
+
+	/* Initialize guest timer frequency. */
+	vcpu_get_reg(vcpus[0], RISCV_TIMER_REG(frequency), &timer_freq);
+	sync_global_to_guest(vm, timer_freq);
+	pr_debug("timer_freq: %lu\n", timer_freq);
+
+	/* Make all the test's cmdline args visible to the guest */
+	sync_global_to_guest(vm, test_args);
+
+	return vm;
+}
+
+void test_vm_cleanup(struct kvm_vm *vm)
+{
+	kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index 6435e7a65642..b882b7b9b785 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -47,6 +47,7 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVPBMT:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZACAS:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBA:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBB:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBC:
@@ -73,6 +74,7 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKSED:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKSH:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKT:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZTSO:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVBB:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVBC:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVFH:
@@ -123,15 +125,6 @@ bool check_reject_set(int err)
 	return err == EINVAL;
 }
 
-static bool vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext_id)
-{
-	int ret;
-	unsigned long value;
-
-	ret = __vcpu_get_reg(vcpu, ext_id, &value);
-	return (ret) ? false : !!value;
-}
-
 void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c)
 {
 	unsigned long isa_ext_state[KVM_RISCV_ISA_EXT_MAX] = { 0 };
@@ -176,7 +169,7 @@ void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c)
 		__vcpu_set_reg(vcpu, feature, 1);
 
 		/* Double check whether the desired extension was enabled */
-		__TEST_REQUIRE(vcpu_has_ext(vcpu, feature),
+		__TEST_REQUIRE(__vcpu_has_ext(vcpu, feature),
 			       "%s not available, skipping tests", s->name);
 	}
 }
@@ -419,6 +412,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(SVINVAL),
 		KVM_ISA_EXT_ARR(SVNAPOT),
 		KVM_ISA_EXT_ARR(SVPBMT),
+		KVM_ISA_EXT_ARR(ZACAS),
 		KVM_ISA_EXT_ARR(ZBA),
 		KVM_ISA_EXT_ARR(ZBB),
 		KVM_ISA_EXT_ARR(ZBC),
@@ -445,6 +439,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(ZKSED),
 		KVM_ISA_EXT_ARR(ZKSH),
 		KVM_ISA_EXT_ARR(ZKT),
+		KVM_ISA_EXT_ARR(ZTSO),
 		KVM_ISA_EXT_ARR(ZVBB),
 		KVM_ISA_EXT_ARR(ZVBC),
 		KVM_ISA_EXT_ARR(ZVFH),
@@ -940,6 +935,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC);
 KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL);
 KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT);
 KVM_ISA_EXT_SIMPLE_CONFIG(svpbmt, SVPBMT);
+KVM_ISA_EXT_SIMPLE_CONFIG(zacas, ZACAS);
 KVM_ISA_EXT_SIMPLE_CONFIG(zba, ZBA);
 KVM_ISA_EXT_SIMPLE_CONFIG(zbb, ZBB);
 KVM_ISA_EXT_SIMPLE_CONFIG(zbc, ZBC);
@@ -966,6 +962,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zkr, ZKR);
 KVM_ISA_EXT_SIMPLE_CONFIG(zksed, ZKSED);
 KVM_ISA_EXT_SIMPLE_CONFIG(zksh, ZKSH);
 KVM_ISA_EXT_SIMPLE_CONFIG(zkt, ZKT);
+KVM_ISA_EXT_SIMPLE_CONFIG(ztso, ZTSO);
 KVM_ISA_EXT_SIMPLE_CONFIG(zvbb, ZVBB);
 KVM_ISA_EXT_SIMPLE_CONFIG(zvbc, ZVBC);
 KVM_ISA_EXT_SIMPLE_CONFIG(zvfh, ZVFH);
@@ -993,6 +990,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_svinval,
 	&config_svnapot,
 	&config_svpbmt,
+	&config_zacas,
 	&config_zba,
 	&config_zbb,
 	&config_zbc,
@@ -1019,6 +1017,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_zksed,
 	&config_zksh,
 	&config_zkt,
+	&config_ztso,
 	&config_zvbb,
 	&config_zvbc,
 	&config_zvfh,
diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index b6da8f71ea19..48cb910e660d 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -515,6 +515,8 @@ static __uint128_t rotate(int size, __uint128_t val, int amount)
 
 	amount = (amount + bits) % bits;
 	val = cut_to_size(size, val);
+	if (!amount)
+		return val;
 	return (val << (bits - amount)) | (val >> amount);
 }
 
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index 06b43ed23580..bd57d991e27d 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -333,7 +333,7 @@ static void test_invalid_memory_region_flags(void)
 	struct kvm_vm *vm;
 	int r, i;
 
-#if defined __aarch64__ || defined __x86_64__
+#if defined __aarch64__ || defined __riscv || defined __x86_64__
 	supported_flags |= KVM_MEM_READONLY;
 #endif
 
diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
index 0f728f05ea82..f3c2239228b1 100644
--- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
+++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
@@ -9,6 +9,7 @@
 #include <linux/stringify.h>
 #include <stdint.h>
 
+#include "kvm_test_harness.h"
 #include "apic.h"
 #include "test_util.h"
 #include "kvm_util.h"
@@ -83,6 +84,8 @@ static void guest_main(void)
 	GUEST_DONE();
 }
 
+KVM_ONE_VCPU_TEST_SUITE(fix_hypercall);
+
 static void enter_guest(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
@@ -103,14 +106,11 @@ static void enter_guest(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void test_fix_hypercall(bool disable_quirk)
+static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-
-	vm = vm_create_with_one_vcpu(&vcpu, guest_main);
+	struct kvm_vm *vm = vcpu->vm;
 
-	vm_init_descriptor_tables(vcpu->vm);
+	vm_init_descriptor_tables(vm);
 	vcpu_init_descriptor_tables(vcpu);
 	vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler);
 
@@ -126,10 +126,19 @@ static void test_fix_hypercall(bool disable_quirk)
 	enter_guest(vcpu);
 }
 
-int main(void)
+KVM_ONE_VCPU_TEST(fix_hypercall, enable_quirk, guest_main)
+{
+	test_fix_hypercall(vcpu, false);
+}
+
+KVM_ONE_VCPU_TEST(fix_hypercall, disable_quirk, guest_main)
+{
+	test_fix_hypercall(vcpu, true);
+}
+
+int main(int argc, char *argv[])
 {
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN);
 
-	test_fix_hypercall(false);
-	test_fix_hypercall(true);
+	return test_harness_run(argc, argv);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
index 9e2879af7c20..40cc59f4e650 100644
--- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
+++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
@@ -133,6 +133,43 @@ static void enter_guest(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void test_pv_unhalt(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct kvm_cpuid_entry2 *ent;
+	u32 kvm_sig_old;
+
+	pr_info("testing KVM_FEATURE_PV_UNHALT\n");
+
+	TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS);
+
+	/* KVM_PV_UNHALT test */
+	vm = vm_create_with_one_vcpu(&vcpu, guest_main);
+	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT);
+
+	TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT),
+		    "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect");
+
+	/* Make sure KVM clears vcpu->arch.kvm_cpuid */
+	ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE);
+	kvm_sig_old = ent->ebx;
+	ent->ebx = 0xdeadbeef;
+	vcpu_set_cpuid(vcpu);
+
+	vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT);
+	ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE);
+	ent->ebx = kvm_sig_old;
+	vcpu_set_cpuid(vcpu);
+
+	TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT),
+		    "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS");
+
+	/* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */
+
+	kvm_vm_free(vm);
+}
+
 int main(void)
 {
 	struct kvm_vcpu *vcpu;
@@ -151,4 +188,6 @@ int main(void)
 
 	enter_guest(vcpu);
 	kvm_vm_free(vm);
+
+	test_pv_unhalt();
 }
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
new file mode 100644
index 000000000000..26c85815f7e9
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
@@ -0,0 +1,638 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Tencent, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <x86intrin.h>
+
+#include "pmu.h"
+#include "processor.h"
+
+/* Number of LOOP instructions for the guest measurement payload. */
+#define NUM_BRANCHES		10
+/*
+ * Number of "extra" instructions that will be counted, i.e. the number of
+ * instructions that are needed to set up the loop and then disabled the
+ * counter.  1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
+ */
+#define NUM_EXTRA_INSNS		7
+#define NUM_INSNS_RETIRED	(NUM_BRANCHES + NUM_EXTRA_INSNS)
+
+static uint8_t kvm_pmu_version;
+static bool kvm_has_perf_caps;
+static bool is_forced_emulation_enabled;
+
+static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
+						  void *guest_code,
+						  uint8_t pmu_version,
+						  uint64_t perf_capabilities)
+{
+	struct kvm_vm *vm;
+
+	vm = vm_create_with_one_vcpu(vcpu, guest_code);
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+
+	sync_global_to_guest(vm, kvm_pmu_version);
+	sync_global_to_guest(vm, is_forced_emulation_enabled);
+
+	/*
+	 * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling
+	 * features via PERF_CAPABILITIES if the guest doesn't have a vPMU.
+	 */
+	if (kvm_has_perf_caps)
+		vcpu_set_msr(*vcpu, MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+	vcpu_set_cpuid_property(*vcpu, X86_PROPERTY_PMU_VERSION, pmu_version);
+	return vm;
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	do {
+		vcpu_run(vcpu);
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_SYNC:
+			break;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_PRINTF:
+			pr_info("%s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			break;
+		default:
+			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+		}
+	} while (uc.cmd != UCALL_DONE);
+}
+
+static uint8_t guest_get_pmu_version(void)
+{
+	/*
+	 * Return the effective PMU version, i.e. the minimum between what KVM
+	 * supports and what is enumerated to the guest.  The host deliberately
+	 * advertises a PMU version to the guest beyond what is actually
+	 * supported by KVM to verify KVM doesn't freak out and do something
+	 * bizarre with an architecturally valid, but unsupported, version.
+	 */
+	return min_t(uint8_t, kvm_pmu_version, this_cpu_property(X86_PROPERTY_PMU_VERSION));
+}
+
+/*
+ * If an architectural event is supported and guaranteed to generate at least
+ * one "hit, assert that its count is non-zero.  If an event isn't supported or
+ * the test can't guarantee the associated action will occur, then all bets are
+ * off regarding the count, i.e. no checks can be done.
+ *
+ * Sanity check that in all cases, the event doesn't count when it's disabled,
+ * and that KVM correctly emulates the write of an arbitrary value.
+ */
+static void guest_assert_event_count(uint8_t idx,
+				     struct kvm_x86_pmu_feature event,
+				     uint32_t pmc, uint32_t pmc_msr)
+{
+	uint64_t count;
+
+	count = _rdpmc(pmc);
+	if (!this_pmu_has(event))
+		goto sanity_checks;
+
+	switch (idx) {
+	case INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX:
+		GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED);
+		break;
+	case INTEL_ARCH_BRANCHES_RETIRED_INDEX:
+		GUEST_ASSERT_EQ(count, NUM_BRANCHES);
+		break;
+	case INTEL_ARCH_LLC_REFERENCES_INDEX:
+	case INTEL_ARCH_LLC_MISSES_INDEX:
+		if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+		    !this_cpu_has(X86_FEATURE_CLFLUSH))
+			break;
+		fallthrough;
+	case INTEL_ARCH_CPU_CYCLES_INDEX:
+	case INTEL_ARCH_REFERENCE_CYCLES_INDEX:
+		GUEST_ASSERT_NE(count, 0);
+		break;
+	case INTEL_ARCH_TOPDOWN_SLOTS_INDEX:
+		GUEST_ASSERT(count >= NUM_INSNS_RETIRED);
+		break;
+	default:
+		break;
+	}
+
+sanity_checks:
+	__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+	GUEST_ASSERT_EQ(_rdpmc(pmc), count);
+
+	wrmsr(pmc_msr, 0xdead);
+	GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead);
+}
+
+/*
+ * Enable and disable the PMC in a monolithic asm blob to ensure that the
+ * compiler can't insert _any_ code into the measured sequence.  Note, ECX
+ * doesn't need to be clobbered as the input value, @pmc_msr, is restored
+ * before the end of the sequence.
+ *
+ * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
+ * start of the loop to force LLC references and misses, i.e. to allow testing
+ * that those events actually count.
+ *
+ * If forced emulation is enabled (and specified), force emulation on a subset
+ * of the measured code to verify that KVM correctly emulates instructions and
+ * branches retired events in conjunction with hardware also counting said
+ * events.
+ */
+#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP)				\
+do {										\
+	__asm__ __volatile__("wrmsr\n\t"					\
+			     clflush "\n\t"					\
+			     "mfence\n\t"					\
+			     "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t"	\
+			     FEP "loop .\n\t"					\
+			     FEP "mov %%edi, %%ecx\n\t"				\
+			     FEP "xor %%eax, %%eax\n\t"				\
+			     FEP "xor %%edx, %%edx\n\t"				\
+			     "wrmsr\n\t"					\
+			     :: "a"((uint32_t)_value), "d"(_value >> 32),	\
+				"c"(_msr), "D"(_msr)				\
+	);									\
+} while (0)
+
+#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP)	\
+do {										\
+	wrmsr(pmc_msr, 0);							\
+										\
+	if (this_cpu_has(X86_FEATURE_CLFLUSHOPT))				\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP);	\
+	else if (this_cpu_has(X86_FEATURE_CLFLUSH))				\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP);	\
+	else									\
+		GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP);		\
+										\
+	guest_assert_event_count(_idx, _event, _pmc, _pmc_msr);			\
+} while (0)
+
+static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event,
+				    uint32_t pmc, uint32_t pmc_msr,
+				    uint32_t ctrl_msr, uint64_t ctrl_msr_value)
+{
+	GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, "");
+
+	if (is_forced_emulation_enabled)
+		GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP);
+}
+
+#define X86_PMU_FEATURE_NULL						\
+({									\
+	struct kvm_x86_pmu_feature feature = {};			\
+									\
+	feature;							\
+})
+
+static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event)
+{
+	return !(*(u64 *)&event);
+}
+
+static void guest_test_arch_event(uint8_t idx)
+{
+	const struct {
+		struct kvm_x86_pmu_feature gp_event;
+		struct kvm_x86_pmu_feature fixed_event;
+	} intel_event_to_feature[] = {
+		[INTEL_ARCH_CPU_CYCLES_INDEX]		 = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED },
+		[INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX]	 = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED },
+		/*
+		 * Note, the fixed counter for reference cycles is NOT the same
+		 * as the general purpose architectural event.  The fixed counter
+		 * explicitly counts at the same frequency as the TSC, whereas
+		 * the GP event counts at a fixed, but uarch specific, frequency.
+		 * Bundle them here for simplicity.
+		 */
+		[INTEL_ARCH_REFERENCE_CYCLES_INDEX]	 = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED },
+		[INTEL_ARCH_LLC_REFERENCES_INDEX]	 = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_LLC_MISSES_INDEX]		 = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_BRANCHES_RETIRED_INDEX]	 = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL },
+		[INTEL_ARCH_TOPDOWN_SLOTS_INDEX]	 = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED },
+	};
+
+	uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+	uint32_t pmu_version = guest_get_pmu_version();
+	/* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */
+	bool guest_has_perf_global_ctrl = pmu_version >= 2;
+	struct kvm_x86_pmu_feature gp_event, fixed_event;
+	uint32_t base_pmc_msr;
+	unsigned int i;
+
+	/* The host side shouldn't invoke this without a guest PMU. */
+	GUEST_ASSERT(pmu_version);
+
+	if (this_cpu_has(X86_FEATURE_PDCM) &&
+	    rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+		base_pmc_msr = MSR_IA32_PMC0;
+	else
+		base_pmc_msr = MSR_IA32_PERFCTR0;
+
+	gp_event = intel_event_to_feature[idx].gp_event;
+	GUEST_ASSERT_EQ(idx, gp_event.f.bit);
+
+	GUEST_ASSERT(nr_gp_counters);
+
+	for (i = 0; i < nr_gp_counters; i++) {
+		uint64_t eventsel = ARCH_PERFMON_EVENTSEL_OS |
+				    ARCH_PERFMON_EVENTSEL_ENABLE |
+				    intel_pmu_arch_events[idx];
+
+		wrmsr(MSR_P6_EVNTSEL0 + i, 0);
+		if (guest_has_perf_global_ctrl)
+			wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i));
+
+		__guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i,
+					MSR_P6_EVNTSEL0 + i, eventsel);
+	}
+
+	if (!guest_has_perf_global_ctrl)
+		return;
+
+	fixed_event = intel_event_to_feature[idx].fixed_event;
+	if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event))
+		return;
+
+	i = fixed_event.f.bit;
+
+	wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+
+	__guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED,
+				MSR_CORE_PERF_FIXED_CTR0 + i,
+				MSR_CORE_PERF_GLOBAL_CTRL,
+				FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+}
+
+static void guest_test_arch_events(void)
+{
+	uint8_t i;
+
+	for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++)
+		guest_test_arch_event(i);
+
+	GUEST_DONE();
+}
+
+static void test_arch_events(uint8_t pmu_version, uint64_t perf_capabilities,
+			     uint8_t length, uint8_t unavailable_mask)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	/* Testing arch events requires a vPMU (there are no negative tests). */
+	if (!pmu_version)
+		return;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_arch_events,
+					 pmu_version, perf_capabilities);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
+				length);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
+				unavailable_mask);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+/*
+ * Limit testing to MSRs that are actually defined by Intel (in the SDM).  MSRs
+ * that aren't defined counter MSRs *probably* don't exist, but there's no
+ * guarantee that currently undefined MSR indices won't be used for something
+ * other than PMCs in the future.
+ */
+#define MAX_NR_GP_COUNTERS	8
+#define MAX_NR_FIXED_COUNTERS	3
+
+#define GUEST_ASSERT_PMC_MSR_ACCESS(insn, msr, expect_gp, vector)		\
+__GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector,			\
+	       "Expected %s on " #insn "(0x%x), got vector %u",			\
+	       expect_gp ? "#GP" : "no fault", msr, vector)			\
+
+#define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected)			\
+	__GUEST_ASSERT(val == expected_val,					\
+		       "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx",	\
+		       msr, expected_val, val);
+
+static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success,
+			     uint64_t expected_val)
+{
+	uint8_t vector;
+	uint64_t val;
+
+	vector = rdpmc_safe(rdpmc_idx, &val);
+	GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector);
+	if (expect_success)
+		GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val);
+
+	if (!is_forced_emulation_enabled)
+		return;
+
+	vector = rdpmc_safe_fep(rdpmc_idx, &val);
+	GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector);
+	if (expect_success)
+		GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val);
+}
+
+static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters,
+				 uint8_t nr_counters, uint32_t or_mask)
+{
+	const bool pmu_has_fast_mode = !guest_get_pmu_version();
+	uint8_t i;
+
+	for (i = 0; i < nr_possible_counters; i++) {
+		/*
+		 * TODO: Test a value that validates full-width writes and the
+		 * width of the counters.
+		 */
+		const uint64_t test_val = 0xffff;
+		const uint32_t msr = base_msr + i;
+
+		/*
+		 * Fixed counters are supported if the counter is less than the
+		 * number of enumerated contiguous counters *or* the counter is
+		 * explicitly enumerated in the supported counters mask.
+		 */
+		const bool expect_success = i < nr_counters || (or_mask & BIT(i));
+
+		/*
+		 * KVM drops writes to MSR_P6_PERFCTR[0|1] if the counters are
+		 * unsupported, i.e. doesn't #GP and reads back '0'.
+		 */
+		const uint64_t expected_val = expect_success ? test_val : 0;
+		const bool expect_gp = !expect_success && msr != MSR_P6_PERFCTR0 &&
+				       msr != MSR_P6_PERFCTR1;
+		uint32_t rdpmc_idx;
+		uint8_t vector;
+		uint64_t val;
+
+		vector = wrmsr_safe(msr, test_val);
+		GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector);
+
+		vector = rdmsr_safe(msr, &val);
+		GUEST_ASSERT_PMC_MSR_ACCESS(RDMSR, msr, expect_gp, vector);
+
+		/* On #GP, the result of RDMSR is undefined. */
+		if (!expect_gp)
+			GUEST_ASSERT_PMC_VALUE(RDMSR, msr, val, expected_val);
+
+		/*
+		 * Redo the read tests with RDPMC, which has different indexing
+		 * semantics and additional capabilities.
+		 */
+		rdpmc_idx = i;
+		if (base_msr == MSR_CORE_PERF_FIXED_CTR0)
+			rdpmc_idx |= INTEL_RDPMC_FIXED;
+
+		guest_test_rdpmc(rdpmc_idx, expect_success, expected_val);
+
+		/*
+		 * KVM doesn't support non-architectural PMUs, i.e. it should
+		 * impossible to have fast mode RDPMC.  Verify that attempting
+		 * to use fast RDPMC always #GPs.
+		 */
+		GUEST_ASSERT(!expect_success || !pmu_has_fast_mode);
+		rdpmc_idx |= INTEL_RDPMC_FAST;
+		guest_test_rdpmc(rdpmc_idx, false, -1ull);
+
+		vector = wrmsr_safe(msr, 0);
+		GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector);
+	}
+}
+
+static void guest_test_gp_counters(void)
+{
+	uint8_t pmu_version = guest_get_pmu_version();
+	uint8_t nr_gp_counters = 0;
+	uint32_t base_msr;
+
+	if (pmu_version)
+		nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+
+	/*
+	 * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is
+	 * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number
+	 * of GP counters.  If there are no GP counters, require KVM to leave
+	 * PERF_GLOBAL_CTRL '0'.  This edge case isn't covered by the SDM, but
+	 * follow the spirit of the architecture and only globally enable GP
+	 * counters, of which there are none.
+	 */
+	if (pmu_version > 1) {
+		uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL);
+
+		if (nr_gp_counters)
+			GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0));
+		else
+			GUEST_ASSERT_EQ(global_ctrl, 0);
+	}
+
+	if (this_cpu_has(X86_FEATURE_PDCM) &&
+	    rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+		base_msr = MSR_IA32_PMC0;
+	else
+		base_msr = MSR_IA32_PERFCTR0;
+
+	guest_rd_wr_counters(base_msr, MAX_NR_GP_COUNTERS, nr_gp_counters, 0);
+	GUEST_DONE();
+}
+
+static void test_gp_counters(uint8_t pmu_version, uint64_t perf_capabilities,
+			     uint8_t nr_gp_counters)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_gp_counters,
+					 pmu_version, perf_capabilities);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_GP_COUNTERS,
+				nr_gp_counters);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+static void guest_test_fixed_counters(void)
+{
+	uint64_t supported_bitmask = 0;
+	uint8_t nr_fixed_counters = 0;
+	uint8_t i;
+
+	/* Fixed counters require Architectural vPMU Version 2+. */
+	if (guest_get_pmu_version() >= 2)
+		nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+
+	/*
+	 * The supported bitmask for fixed counters was introduced in PMU
+	 * version 5.
+	 */
+	if (guest_get_pmu_version() >= 5)
+		supported_bitmask = this_cpu_property(X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK);
+
+	guest_rd_wr_counters(MSR_CORE_PERF_FIXED_CTR0, MAX_NR_FIXED_COUNTERS,
+			     nr_fixed_counters, supported_bitmask);
+
+	for (i = 0; i < MAX_NR_FIXED_COUNTERS; i++) {
+		uint8_t vector;
+		uint64_t val;
+
+		if (i >= nr_fixed_counters && !(supported_bitmask & BIT_ULL(i))) {
+			vector = wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL,
+					    FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+			__GUEST_ASSERT(vector == GP_VECTOR,
+				       "Expected #GP for counter %u in FIXED_CTR_CTRL", i);
+
+			vector = wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL,
+					    FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+			__GUEST_ASSERT(vector == GP_VECTOR,
+				       "Expected #GP for counter %u in PERF_GLOBAL_CTRL", i);
+			continue;
+		}
+
+		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
+		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+		val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i);
+
+		GUEST_ASSERT_NE(val, 0);
+	}
+	GUEST_DONE();
+}
+
+static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities,
+				uint8_t nr_fixed_counters,
+				uint32_t supported_bitmask)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_fixed_counters,
+					 pmu_version, perf_capabilities);
+
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK,
+				supported_bitmask);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_FIXED_COUNTERS,
+				nr_fixed_counters);
+
+	run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+static void test_intel_counters(void)
+{
+	uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+	uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+	uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+	uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
+	unsigned int i;
+	uint8_t v, j;
+	uint32_t k;
+
+	const uint64_t perf_caps[] = {
+		0,
+		PMU_CAP_FW_WRITES,
+	};
+
+	/*
+	 * Test up to PMU v5, which is the current maximum version defined by
+	 * Intel, i.e. is the last version that is guaranteed to be backwards
+	 * compatible with KVM's existing behavior.
+	 */
+	uint8_t max_pmu_version = max_t(typeof(pmu_version), pmu_version, 5);
+
+	/*
+	 * Detect the existence of events that aren't supported by selftests.
+	 * This will (obviously) fail any time the kernel adds support for a
+	 * new event, but it's worth paying that price to keep the test fresh.
+	 */
+	TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
+		    "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
+		    nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
+
+	/*
+	 * Force iterating over known arch events regardless of whether or not
+	 * KVM/hardware supports a given event.
+	 */
+	nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS);
+
+	for (v = 0; v <= max_pmu_version; v++) {
+		for (i = 0; i < ARRAY_SIZE(perf_caps); i++) {
+			if (!kvm_has_perf_caps && perf_caps[i])
+				continue;
+
+			pr_info("Testing arch events, PMU version %u, perf_caps = %lx\n",
+				v, perf_caps[i]);
+			/*
+			 * To keep the total runtime reasonable, test every
+			 * possible non-zero, non-reserved bitmap combination
+			 * only with the native PMU version and the full bit
+			 * vector length.
+			 */
+			if (v == pmu_version) {
+				for (k = 1; k < (BIT(nr_arch_events) - 1); k++)
+					test_arch_events(v, perf_caps[i], nr_arch_events, k);
+			}
+			/*
+			 * Test single bits for all PMU version and lengths up
+			 * the number of events +1 (to verify KVM doesn't do
+			 * weird things if the guest length is greater than the
+			 * host length).  Explicitly test a mask of '0' and all
+			 * ones i.e. all events being available and unavailable.
+			 */
+			for (j = 0; j <= nr_arch_events + 1; j++) {
+				test_arch_events(v, perf_caps[i], j, 0);
+				test_arch_events(v, perf_caps[i], j, 0xff);
+
+				for (k = 0; k < nr_arch_events; k++)
+					test_arch_events(v, perf_caps[i], j, BIT(k));
+			}
+
+			pr_info("Testing GP counters, PMU version %u, perf_caps = %lx\n",
+				v, perf_caps[i]);
+			for (j = 0; j <= nr_gp_counters; j++)
+				test_gp_counters(v, perf_caps[i], j);
+
+			pr_info("Testing fixed counters, PMU version %u, perf_caps = %lx\n",
+				v, perf_caps[i]);
+			for (j = 0; j <= nr_fixed_counters; j++) {
+				for (k = 0; k <= (BIT(nr_fixed_counters) - 1); k++)
+					test_fixed_counters(v, perf_caps[i], j, k);
+			}
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(kvm_is_pmu_enabled());
+
+	TEST_REQUIRE(host_cpu_is_intel);
+	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
+	TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
+
+	kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
+	kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM);
+	is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();
+
+	test_intel_counters();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index a3bd54b925ab..3c85d1ae9893 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -11,72 +11,18 @@
  */
 
 #define _GNU_SOURCE /* for program_invocation_short_name */
-#include "test_util.h"
+
 #include "kvm_util.h"
+#include "pmu.h"
 #include "processor.h"
-
-/*
- * In lieu of copying perf_event.h into tools...
- */
-#define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
-#define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
-
-/* End of stuff taken from perf_event.h. */
-
-/* Oddly, this isn't in perf_event.h. */
-#define ARCH_PERFMON_BRANCHES_RETIRED		5
+#include "test_util.h"
 
 #define NUM_BRANCHES 42
-#define INTEL_PMC_IDX_FIXED		32
-
-/* Matches KVM_PMU_EVENT_FILTER_MAX_EVENTS in pmu.c */
-#define MAX_FILTER_EVENTS		300
 #define MAX_TEST_EVENTS		10
 
 #define PMU_EVENT_FILTER_INVALID_ACTION		(KVM_PMU_EVENT_DENY + 1)
 #define PMU_EVENT_FILTER_INVALID_FLAGS			(KVM_PMU_EVENT_FLAGS_VALID_MASK << 1)
-#define PMU_EVENT_FILTER_INVALID_NEVENTS		(MAX_FILTER_EVENTS + 1)
-
-/*
- * This is how the event selector and unit mask are stored in an AMD
- * core performance event-select register. Intel's format is similar,
- * but the event selector is only 8 bits.
- */
-#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \
-			      (umask & 0xff) << 8)
-
-/*
- * "Branch instructions retired", from the Intel SDM, volume 3,
- * "Pre-defined Architectural Performance Events."
- */
-
-#define INTEL_BR_RETIRED EVENT(0xc4, 0)
-
-/*
- * "Retired branch instructions", from Processor Programming Reference
- * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
- * Preliminary Processor Programming Reference (PPR) for AMD Family
- * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
- * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
- * B1 Processors Volume 1 of 2.
- */
-
-#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
-
-
-/*
- * "Retired instructions", from Processor Programming Reference
- * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
- * Preliminary Processor Programming Reference (PPR) for AMD Family
- * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
- * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
- * B1 Processors Volume 1 of 2.
- *                      --- and ---
- * "Instructions retired", from the Intel SDM, volume 3,
- * "Pre-defined Architectural Performance Events."
- */
-
-#define INST_RETIRED EVENT(0xc0, 0)
+#define PMU_EVENT_FILTER_INVALID_NEVENTS		(KVM_PMU_EVENT_FILTER_MAX_EVENTS + 1)
 
 struct __kvm_pmu_event_filter {
 	__u32 action;
@@ -84,26 +30,28 @@ struct __kvm_pmu_event_filter {
 	__u32 fixed_counter_bitmap;
 	__u32 flags;
 	__u32 pad[4];
-	__u64 events[MAX_FILTER_EVENTS];
+	__u64 events[KVM_PMU_EVENT_FILTER_MAX_EVENTS];
 };
 
 /*
- * This event list comprises Intel's eight architectural events plus
- * AMD's "retired branch instructions" for Zen[123] (and possibly
- * other AMD CPUs).
+ * This event list comprises Intel's known architectural events, plus AMD's
+ * "retired branch instructions" for Zen1-Zen3 (and* possibly other AMD CPUs).
+ * Note, AMD and Intel use the same encoding for instructions retired.
  */
+kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED);
+
 static const struct __kvm_pmu_event_filter base_event_filter = {
 	.nevents = ARRAY_SIZE(base_event_filter.events),
 	.events = {
-		EVENT(0x3c, 0),
-		INST_RETIRED,
-		EVENT(0x3c, 1),
-		EVENT(0x2e, 0x4f),
-		EVENT(0x2e, 0x41),
-		EVENT(0xc4, 0),
-		EVENT(0xc5, 0),
-		EVENT(0xa4, 1),
-		AMD_ZEN_BR_RETIRED,
+		INTEL_ARCH_CPU_CYCLES,
+		INTEL_ARCH_INSTRUCTIONS_RETIRED,
+		INTEL_ARCH_REFERENCE_CYCLES,
+		INTEL_ARCH_LLC_REFERENCES,
+		INTEL_ARCH_LLC_MISSES,
+		INTEL_ARCH_BRANCHES_RETIRED,
+		INTEL_ARCH_BRANCHES_MISPREDICTED,
+		INTEL_ARCH_TOPDOWN_SLOTS,
+		AMD_ZEN_BRANCHES_RETIRED,
 	},
 };
 
@@ -165,9 +113,9 @@ static void intel_guest_code(void)
 	for (;;) {
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 		wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_BRANCHES_RETIRED);
 		wrmsr(MSR_P6_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | INST_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_INSTRUCTIONS_RETIRED);
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
 
 		run_and_measure_loop(MSR_IA32_PMC0);
@@ -189,9 +137,9 @@ static void amd_guest_code(void)
 	for (;;) {
 		wrmsr(MSR_K7_EVNTSEL0, 0);
 		wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BRANCHES_RETIRED);
 		wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
-		      ARCH_PERFMON_EVENTSEL_OS | INST_RETIRED);
+		      ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_INSTRUCTIONS_RETIRED);
 
 		run_and_measure_loop(MSR_K7_PERFCTR0);
 		GUEST_SYNC(0);
@@ -312,7 +260,7 @@ static void test_amd_deny_list(struct kvm_vcpu *vcpu)
 		.action = KVM_PMU_EVENT_DENY,
 		.nevents = 1,
 		.events = {
-			EVENT(0x1C2, 0),
+			RAW_EVENT(0x1C2, 0),
 		},
 	};
 
@@ -347,9 +295,9 @@ static void test_not_member_deny_list(struct kvm_vcpu *vcpu)
 
 	f.action = KVM_PMU_EVENT_DENY;
 
-	remove_event(&f, INST_RETIRED);
-	remove_event(&f, INTEL_BR_RETIRED);
-	remove_event(&f, AMD_ZEN_BR_RETIRED);
+	remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED);
+	remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED);
+	remove_event(&f, AMD_ZEN_BRANCHES_RETIRED);
 	test_with_filter(vcpu, &f);
 
 	ASSERT_PMC_COUNTING_INSTRUCTIONS();
@@ -361,9 +309,9 @@ static void test_not_member_allow_list(struct kvm_vcpu *vcpu)
 
 	f.action = KVM_PMU_EVENT_ALLOW;
 
-	remove_event(&f, INST_RETIRED);
-	remove_event(&f, INTEL_BR_RETIRED);
-	remove_event(&f, AMD_ZEN_BR_RETIRED);
+	remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED);
+	remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED);
+	remove_event(&f, AMD_ZEN_BRANCHES_RETIRED);
 	test_with_filter(vcpu, &f);
 
 	ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS();
@@ -452,9 +400,9 @@ static bool use_amd_pmu(void)
  *  - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake.
  */
 #define MEM_INST_RETIRED		0xD0
-#define MEM_INST_RETIRED_LOAD		EVENT(MEM_INST_RETIRED, 0x81)
-#define MEM_INST_RETIRED_STORE		EVENT(MEM_INST_RETIRED, 0x82)
-#define MEM_INST_RETIRED_LOAD_STORE	EVENT(MEM_INST_RETIRED, 0x83)
+#define MEM_INST_RETIRED_LOAD		RAW_EVENT(MEM_INST_RETIRED, 0x81)
+#define MEM_INST_RETIRED_STORE		RAW_EVENT(MEM_INST_RETIRED, 0x82)
+#define MEM_INST_RETIRED_LOAD_STORE	RAW_EVENT(MEM_INST_RETIRED, 0x83)
 
 static bool supports_event_mem_inst_retired(void)
 {
@@ -486,9 +434,9 @@ static bool supports_event_mem_inst_retired(void)
  * B1 Processors Volume 1 of 2.
  */
 #define LS_DISPATCH		0x29
-#define LS_DISPATCH_LOAD	EVENT(LS_DISPATCH, BIT(0))
-#define LS_DISPATCH_STORE	EVENT(LS_DISPATCH, BIT(1))
-#define LS_DISPATCH_LOAD_STORE	EVENT(LS_DISPATCH, BIT(2))
+#define LS_DISPATCH_LOAD	RAW_EVENT(LS_DISPATCH, BIT(0))
+#define LS_DISPATCH_STORE	RAW_EVENT(LS_DISPATCH, BIT(1))
+#define LS_DISPATCH_LOAD_STORE	RAW_EVENT(LS_DISPATCH, BIT(2))
 
 #define INCLUDE_MASKED_ENTRY(event_select, mask, match) \
 	KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false)
@@ -729,14 +677,14 @@ static void add_dummy_events(uint64_t *events, int nevents)
 
 static void test_masked_events(struct kvm_vcpu *vcpu)
 {
-	int nevents = MAX_FILTER_EVENTS - MAX_TEST_EVENTS;
-	uint64_t events[MAX_FILTER_EVENTS];
+	int nevents = KVM_PMU_EVENT_FILTER_MAX_EVENTS - MAX_TEST_EVENTS;
+	uint64_t events[KVM_PMU_EVENT_FILTER_MAX_EVENTS];
 
 	/* Run the test cases against a sparse PMU event filter. */
 	run_masked_events_tests(vcpu, events, 0);
 
 	/* Run the test cases against a dense PMU event filter. */
-	add_dummy_events(events, MAX_FILTER_EVENTS);
+	add_dummy_events(events, KVM_PMU_EVENT_FILTER_MAX_EVENTS);
 	run_masked_events_tests(vcpu, events, nevents);
 }
 
@@ -809,20 +757,19 @@ static void test_filter_ioctl(struct kvm_vcpu *vcpu)
 	TEST_ASSERT(!r, "Masking non-existent fixed counters should be allowed");
 }
 
-static void intel_run_fixed_counter_guest_code(uint8_t fixed_ctr_idx)
+static void intel_run_fixed_counter_guest_code(uint8_t idx)
 {
 	for (;;) {
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + fixed_ctr_idx, 0);
+		wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0);
 
 		/* Only OS_EN bit is enabled for fixed counter[idx]. */
-		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, BIT_ULL(4 * fixed_ctr_idx));
-		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL,
-		      BIT_ULL(INTEL_PMC_IDX_FIXED + fixed_ctr_idx));
+		wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL));
+		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx));
 		__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
 		wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
-		GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + fixed_ctr_idx));
+		GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + idx));
 	}
 }
 
@@ -920,7 +867,7 @@ int main(int argc, char *argv[])
 	struct kvm_vcpu *vcpu, *vcpu2 = NULL;
 	struct kvm_vm *vm;
 
-	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
+	TEST_REQUIRE(kvm_is_pmu_enabled());
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER));
 	TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS));
 
diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index 65ad38b6be1f..e0f642d2a3c4 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -434,6 +434,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t
 
 	r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size);
 	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
+
+	close(memfd);
 }
 
 static void usage(const char *cmd)
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
index a49828adf294..0a6dfba3905b 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -10,11 +10,9 @@
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
-#include "svm_util.h"
+#include "sev.h"
 #include "kselftest.h"
 
-#define SEV_POLICY_ES 0b100
-
 #define NR_MIGRATE_TEST_VCPUS 4
 #define NR_MIGRATE_TEST_VMS 3
 #define NR_LOCK_TESTING_THREADS 3
@@ -22,46 +20,24 @@
 
 bool have_sev_es;
 
-static int __sev_ioctl(int vm_fd, int cmd_id, void *data, __u32 *fw_error)
-{
-	struct kvm_sev_cmd cmd = {
-		.id = cmd_id,
-		.data = (uint64_t)data,
-		.sev_fd = open_sev_dev_path_or_exit(),
-	};
-	int ret;
-
-	ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
-	*fw_error = cmd.error;
-	return ret;
-}
-
-static void sev_ioctl(int vm_fd, int cmd_id, void *data)
-{
-	int ret;
-	__u32 fw_error;
-
-	ret = __sev_ioctl(vm_fd, cmd_id, data, &fw_error);
-	TEST_ASSERT(ret == 0 && fw_error == SEV_RET_SUCCESS,
-		    "%d failed: return code: %d, errno: %d, fw error: %d",
-		    cmd_id, ret, errno, fw_error);
-}
-
 static struct kvm_vm *sev_vm_create(bool es)
 {
 	struct kvm_vm *vm;
-	struct kvm_sev_launch_start start = { 0 };
 	int i;
 
 	vm = vm_create_barebones();
-	sev_ioctl(vm->fd, es ? KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL);
+	if (!es)
+		sev_vm_init(vm);
+	else
+		sev_es_vm_init(vm);
+
 	for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
 		__vm_vcpu_add(vm, i);
+
+	sev_vm_launch(vm, es ? SEV_POLICY_ES : 0);
+
 	if (es)
-		start.policy |= SEV_POLICY_ES;
-	sev_ioctl(vm->fd, KVM_SEV_LAUNCH_START, &start);
-	if (es)
-		sev_ioctl(vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+		vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
 	return vm;
 }
 
@@ -181,7 +157,7 @@ static void test_sev_migrate_parameters(void)
 	sev_vm = sev_vm_create(/* es= */ false);
 	sev_es_vm = sev_vm_create(/* es= */ true);
 	sev_es_vm_no_vmsa = vm_create_barebones();
-	sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
+	sev_es_vm_init(sev_es_vm_no_vmsa);
 	__vm_vcpu_add(sev_es_vm_no_vmsa, 1);
 
 	ret = __sev_migrate_from(sev_vm, sev_es_vm);
@@ -230,13 +206,13 @@ static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src)
 	TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno);
 }
 
-static void verify_mirror_allowed_cmds(int vm_fd)
+static void verify_mirror_allowed_cmds(struct kvm_vm *vm)
 {
 	struct kvm_sev_guest_status status;
+	int cmd_id;
 
-	for (int cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) {
+	for (cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) {
 		int ret;
-		__u32 fw_error;
 
 		/*
 		 * These commands are allowed for mirror VMs, all others are
@@ -256,14 +232,14 @@ static void verify_mirror_allowed_cmds(int vm_fd)
 		 * These commands should be disallowed before the data
 		 * parameter is examined so NULL is OK here.
 		 */
-		ret = __sev_ioctl(vm_fd, cmd_id, NULL, &fw_error);
+		ret = __vm_sev_ioctl(vm, cmd_id, NULL);
 		TEST_ASSERT(
 			ret == -1 && errno == EINVAL,
 			"Should not be able call command: %d. ret: %d, errno: %d",
 			cmd_id, ret, errno);
 	}
 
-	sev_ioctl(vm_fd, KVM_SEV_GUEST_STATUS, &status);
+	vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status);
 }
 
 static void test_sev_mirror(bool es)
@@ -281,9 +257,9 @@ static void test_sev_mirror(bool es)
 		__vm_vcpu_add(dst_vm, i);
 
 	if (es)
-		sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+		vm_sev_ioctl(dst_vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
 
-	verify_mirror_allowed_cmds(dst_vm->fd);
+	verify_mirror_allowed_cmds(dst_vm);
 
 	kvm_vm_free(src_vm);
 	kvm_vm_free(dst_vm);
diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
new file mode 100644
index 000000000000..026779f3ed06
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "linux/psp-sev.h"
+#include "sev.h"
+
+
+static void guest_sev_es_code(void)
+{
+	/* TODO: Check CPUID after GHCB-based hypercall support is added. */
+	GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED);
+	GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ES_ENABLED);
+
+	/*
+	 * TODO: Add GHCB and ucall support for SEV-ES guests.  For now, simply
+	 * force "termination" to signal "done" via the GHCB MSR protocol.
+	 */
+	wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ);
+	__asm__ __volatile__("rep; vmmcall");
+}
+
+static void guest_sev_code(void)
+{
+	GUEST_ASSERT(this_cpu_has(X86_FEATURE_SEV));
+	GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED);
+
+	GUEST_DONE();
+}
+
+static void test_sev(void *guest_code, uint64_t policy)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+
+	vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu);
+
+	for (;;) {
+		vcpu_run(vcpu);
+
+		if (policy & SEV_POLICY_ES) {
+			TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT,
+				    "Wanted SYSTEM_EVENT, got %s",
+				    exit_reason_str(vcpu->run->exit_reason));
+			TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM);
+			TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1);
+			TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ);
+			break;
+		}
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_SYNC:
+			continue;
+		case UCALL_DONE:
+			return;
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+		default:
+			TEST_FAIL("Unexpected exit: %s",
+				  exit_reason_str(vcpu->run->exit_reason));
+		}
+	}
+
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
+
+	test_sev(guest_sev_code, SEV_POLICY_NO_DBG);
+	test_sev(guest_sev_code, 0);
+
+	if (kvm_cpu_has(X86_FEATURE_SEV_ES)) {
+		test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG);
+		test_sev(guest_sev_es_code, SEV_POLICY_ES);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
index 1a46dd7bb391..416207c38a17 100644
--- a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
+++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
@@ -63,7 +63,7 @@ int main(int argc, char *argv[])
 	vm_init_descriptor_tables(vm);
 	vcpu_init_descriptor_tables(vcpu);
 
-	vcpu_set_cpuid_maxphyaddr(vcpu, MAXPHYADDR);
+	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR);
 
 	rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE);
 	TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable");
diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
index a91b5b145fa3..adb5593daf48 100644
--- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
@@ -17,6 +17,7 @@
 #include <sys/ioctl.h>
 #include <pthread.h>
 
+#include "kvm_test_harness.h"
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
@@ -41,6 +42,8 @@ void guest_code(void)
 		     : "rax", "rbx");
 }
 
+KVM_ONE_VCPU_TEST_SUITE(sync_regs_test);
+
 static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
 {
 #define REG_COMPARE(reg) \
@@ -152,18 +155,15 @@ static noinline void *race_sregs_cr4(void *arg)
 	return NULL;
 }
 
-static void race_sync_regs(void *racer)
+static void race_sync_regs(struct kvm_vcpu *vcpu, void *racer)
 {
 	const time_t TIMEOUT = 2; /* seconds, roughly */
 	struct kvm_x86_state *state;
 	struct kvm_translation tr;
-	struct kvm_vcpu *vcpu;
 	struct kvm_run *run;
-	struct kvm_vm *vm;
 	pthread_t thread;
 	time_t t;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 	run = vcpu->run;
 
 	run->kvm_valid_regs = KVM_SYNC_X86_SREGS;
@@ -205,26 +205,12 @@ static void race_sync_regs(void *racer)
 	TEST_ASSERT_EQ(pthread_join(thread, NULL), 0);
 
 	kvm_x86_state_cleanup(state);
-	kvm_vm_free(vm);
 }
 
-int main(int argc, char *argv[])
+KVM_ONE_VCPU_TEST(sync_regs_test, read_invalid, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-	struct kvm_run *run;
-	struct kvm_regs regs;
-	struct kvm_sregs sregs;
-	struct kvm_vcpu_events events;
-	int rv, cap;
-
-	cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
-	TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS);
-	TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD));
-
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-
-	run = vcpu->run;
+	struct kvm_run *run = vcpu->run;
+	int rv;
 
 	/* Request reading invalid register set from VCPU. */
 	run->kvm_valid_regs = INVALID_SYNC_FIELD;
@@ -240,6 +226,12 @@ int main(int argc, char *argv[])
 		    "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d",
 		    rv);
 	run->kvm_valid_regs = 0;
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, set_invalid, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	int rv;
 
 	/* Request setting invalid register set into VCPU. */
 	run->kvm_dirty_regs = INVALID_SYNC_FIELD;
@@ -255,11 +247,19 @@ int main(int argc, char *argv[])
 		    "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d",
 		    rv);
 	run->kvm_dirty_regs = 0;
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_vcpu_events events;
+	struct kvm_sregs sregs;
+	struct kvm_regs regs;
 
 	/* Request and verify all valid register sets. */
 	/* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
 	run->kvm_valid_regs = TEST_SYNC_FIELDS;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	vcpu_regs_get(vcpu, &regs);
@@ -270,6 +270,19 @@ int main(int argc, char *argv[])
 
 	vcpu_events_get(vcpu, &events);
 	compare_vcpu_events(&events, &run->s.regs.events);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, set_and_verify_various, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_vcpu_events events;
+	struct kvm_sregs sregs;
+	struct kvm_regs regs;
+
+	/* Run once to get register set */
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	/* Set and verify various register values. */
 	run->s.regs.regs.rbx = 0xBAD1DEA;
@@ -278,7 +291,7 @@ int main(int argc, char *argv[])
 
 	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1,
 		    "rbx sync regs value incorrect 0x%llx.",
@@ -295,6 +308,11 @@ int main(int argc, char *argv[])
 
 	vcpu_events_get(vcpu, &events);
 	compare_vcpu_events(&events, &run->s.regs.events);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_dirty_regs_bits, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
 
 	/* Clear kvm_dirty_regs bits, verify new s.regs values are
 	 * overwritten with existing guest values.
@@ -302,11 +320,22 @@ int main(int argc, char *argv[])
 	run->kvm_valid_regs = TEST_SYNC_FIELDS;
 	run->kvm_dirty_regs = 0;
 	run->s.regs.regs.rbx = 0xDEADBEEF;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF,
 		    "rbx sync regs value incorrect 0x%llx.",
 		    run->s.regs.regs.rbx);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_and_dirty_regs, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_regs regs;
+
+	/* Run once to get register set */
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	/* Clear kvm_valid_regs bits and kvm_dirty_bits.
 	 * Verify s.regs values are not overwritten with existing guest values
@@ -315,9 +344,10 @@ int main(int argc, char *argv[])
 	run->kvm_valid_regs = 0;
 	run->kvm_dirty_regs = 0;
 	run->s.regs.regs.rbx = 0xAAAA;
+	vcpu_regs_get(vcpu, &regs);
 	regs.rbx = 0xBAC0;
 	vcpu_regs_set(vcpu, &regs);
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA,
 		    "rbx sync regs value incorrect 0x%llx.",
@@ -326,6 +356,17 @@ int main(int argc, char *argv[])
 	TEST_ASSERT(regs.rbx == 0xBAC0 + 1,
 		    "rbx guest value incorrect 0x%llx.",
 		    regs.rbx);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_regs_bits, guest_code)
+{
+	struct kvm_run *run = vcpu->run;
+	struct kvm_regs regs;
+
+	/* Run once to get register set */
+	run->kvm_valid_regs = TEST_SYNC_FIELDS;
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 	/* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
 	 * with existing guest values but that guest values are overwritten
@@ -334,7 +375,7 @@ int main(int argc, char *argv[])
 	run->kvm_valid_regs = 0;
 	run->kvm_dirty_regs = TEST_SYNC_FIELDS;
 	run->s.regs.regs.rbx = 0xBBBB;
-	rv = _vcpu_run(vcpu);
+	vcpu_run(vcpu);
 	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 	TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB,
 		    "rbx sync regs value incorrect 0x%llx.",
@@ -343,12 +384,30 @@ int main(int argc, char *argv[])
 	TEST_ASSERT(regs.rbx == 0xBBBB + 1,
 		    "rbx guest value incorrect 0x%llx.",
 		    regs.rbx);
+}
 
-	kvm_vm_free(vm);
+KVM_ONE_VCPU_TEST(sync_regs_test, race_cr4, guest_code)
+{
+	race_sync_regs(vcpu, race_sregs_cr4);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, race_exc, guest_code)
+{
+	race_sync_regs(vcpu, race_events_exc);
+}
 
-	race_sync_regs(race_sregs_cr4);
-	race_sync_regs(race_events_exc);
-	race_sync_regs(race_events_inj_pen);
+KVM_ONE_VCPU_TEST(sync_regs_test, race_inj_pen, guest_code)
+{
+	race_sync_regs(vcpu, race_events_inj_pen);
+}
+
+int main(int argc, char *argv[])
+{
+	int cap;
+
+	cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+	TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS);
+	TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD));
 
-	return 0;
+	return test_harness_run(argc, argv);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
index 3533dc2fbfee..f4f61a2d2464 100644
--- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
+++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
@@ -8,14 +8,12 @@
 #define _GNU_SOURCE /* for program_invocation_short_name */
 #include <sys/ioctl.h>
 
+#include "kvm_test_harness.h"
 #include "test_util.h"
 #include "kvm_util.h"
 #include "vmx.h"
 
-/* Forced emulation prefix, used to invoke the emulator unconditionally. */
-#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
-#define KVM_FEP_LENGTH 5
-static int fep_available = 1;
+static bool fep_available;
 
 #define MSR_NON_EXISTENT 0x474f4f00
 
@@ -260,13 +258,6 @@ static void guest_code_filter_allow(void)
 	GUEST_ASSERT(data == 2);
 	GUEST_ASSERT(guest_exception_count == 0);
 
-	/*
-	 * Test to see if the instruction emulator is available (ie: the module
-	 * parameter 'kvm.force_emulation_prefix=1' is set).  This instruction
-	 * will #UD if it isn't available.
-	 */
-	__asm__ __volatile__(KVM_FEP "nop");
-
 	if (fep_available) {
 		/* Let userspace know we aren't done. */
 		GUEST_SYNC(0);
@@ -388,12 +379,6 @@ static void guest_fep_gp_handler(struct ex_regs *regs)
 			   &em_wrmsr_start, &em_wrmsr_end);
 }
 
-static void guest_ud_handler(struct ex_regs *regs)
-{
-	fep_available = 0;
-	regs->rip += KVM_FEP_LENGTH;
-}
-
 static void check_for_guest_assert(struct kvm_vcpu *vcpu)
 {
 	struct ucall uc;
@@ -527,13 +512,15 @@ static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu)
 	process_ucall_done(vcpu);
 }
 
-static void test_msr_filter_allow(void)
+KVM_ONE_VCPU_TEST_SUITE(user_msr);
+
+KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
+	struct kvm_vm *vm = vcpu->vm;
+	uint64_t cmd;
 	int rc;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code_filter_allow);
+	sync_global_to_guest(vm, fep_available);
 
 	rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
 	TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
@@ -561,11 +548,11 @@ static void test_msr_filter_allow(void)
 	run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT);
 	run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT);
 
-	vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
 	vcpu_run(vcpu);
-	vm_install_exception_handler(vm, UD_VECTOR, NULL);
+	cmd = process_ucall(vcpu);
 
-	if (process_ucall(vcpu) != UCALL_DONE) {
+	if (fep_available) {
+		TEST_ASSERT_EQ(cmd, UCALL_SYNC);
 		vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler);
 
 		/* Process emulated rdmsr and wrmsr instructions. */
@@ -583,10 +570,9 @@ static void test_msr_filter_allow(void)
 		/* Confirm the guest completed without issues. */
 		run_guest_then_process_ucall_done(vcpu);
 	} else {
+		TEST_ASSERT_EQ(cmd, UCALL_DONE);
 		printf("To run the instruction emulated tests set the module parameter 'kvm.force_emulation_prefix=1'\n");
 	}
-
-	kvm_vm_free(vm);
 }
 
 static int handle_ucall(struct kvm_vcpu *vcpu)
@@ -646,16 +632,12 @@ static void handle_wrmsr(struct kvm_run *run)
 	}
 }
 
-static void test_msr_filter_deny(void)
+KVM_ONE_VCPU_TEST(user_msr, msr_filter_deny, guest_code_filter_deny)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-	struct kvm_run *run;
+	struct kvm_vm *vm = vcpu->vm;
+	struct kvm_run *run = vcpu->run;
 	int rc;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code_filter_deny);
-	run = vcpu->run;
-
 	rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
 	TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
 	vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_INVAL |
@@ -689,18 +671,13 @@ static void test_msr_filter_deny(void)
 done:
 	TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
 	TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
-
-	kvm_vm_free(vm);
 }
 
-static void test_msr_permission_bitmap(void)
+KVM_ONE_VCPU_TEST(user_msr, msr_permission_bitmap, guest_code_permission_bitmap)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
+	struct kvm_vm *vm = vcpu->vm;
 	int rc;
 
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code_permission_bitmap);
-
 	rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
 	TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
 	vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER);
@@ -715,8 +692,6 @@ static void test_msr_permission_bitmap(void)
 	vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs);
 	run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE);
 	run_guest_then_process_ucall_done(vcpu);
-
-	kvm_vm_free(vm);
 }
 
 #define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask)	\
@@ -786,31 +761,20 @@ static void run_msr_filter_flag_test(struct kvm_vm *vm)
 }
 
 /* Test that attempts to write to the unused bits in a flag fails. */
-static void test_user_exit_msr_flags(void)
+KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+	struct kvm_vm *vm = vcpu->vm;
 
 	/* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */
 	run_user_space_msr_flag_test(vm);
 
 	/* Test flags and range flags for KVM_X86_SET_MSR_FILTER. */
 	run_msr_filter_flag_test(vm);
-
-	kvm_vm_free(vm);
 }
 
 int main(int argc, char *argv[])
 {
-	test_msr_filter_allow();
-
-	test_msr_filter_deny();
-
-	test_msr_permission_bitmap();
+	fep_available = kvm_is_forced_emulation_enabled();
 
-	test_user_exit_msr_flags();
-
-	return 0;
+	return test_harness_run(argc, argv);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
index 7f6f5f23fb9b..977948fd52e6 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
@@ -28,16 +28,16 @@
 #define NESTED_TEST_MEM1		0xc0001000
 #define NESTED_TEST_MEM2		0xc0002000
 
-static void l2_guest_code(void)
+static void l2_guest_code(u64 *a, u64 *b)
 {
-	*(volatile uint64_t *)NESTED_TEST_MEM1;
-	*(volatile uint64_t *)NESTED_TEST_MEM1 = 1;
+	READ_ONCE(*a);
+	WRITE_ONCE(*a, 1);
 	GUEST_SYNC(true);
 	GUEST_SYNC(false);
 
-	*(volatile uint64_t *)NESTED_TEST_MEM2 = 1;
+	WRITE_ONCE(*b, 1);
 	GUEST_SYNC(true);
-	*(volatile uint64_t *)NESTED_TEST_MEM2 = 1;
+	WRITE_ONCE(*b, 1);
 	GUEST_SYNC(true);
 	GUEST_SYNC(false);
 
@@ -45,17 +45,33 @@ static void l2_guest_code(void)
 	vmcall();
 }
 
+static void l2_guest_code_ept_enabled(void)
+{
+	l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2);
+}
+
+static void l2_guest_code_ept_disabled(void)
+{
+	/* Access the same L1 GPAs as l2_guest_code_ept_enabled() */
+	l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM);
+}
+
 void l1_guest_code(struct vmx_pages *vmx)
 {
 #define L2_GUEST_STACK_SIZE 64
 	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	void *l2_rip;
 
 	GUEST_ASSERT(vmx->vmcs_gpa);
 	GUEST_ASSERT(prepare_for_vmx_operation(vmx));
 	GUEST_ASSERT(load_vmcs(vmx));
 
-	prepare_vmcs(vmx, l2_guest_code,
-		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+	if (vmx->eptp_gpa)
+		l2_rip = l2_guest_code_ept_enabled;
+	else
+		l2_rip = l2_guest_code_ept_disabled;
+
+	prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
 
 	GUEST_SYNC(false);
 	GUEST_ASSERT(!vmlaunch());
@@ -64,7 +80,7 @@ void l1_guest_code(struct vmx_pages *vmx)
 	GUEST_DONE();
 }
 
-int main(int argc, char *argv[])
+static void test_vmx_dirty_log(bool enable_ept)
 {
 	vm_vaddr_t vmx_pages_gva = 0;
 	struct vmx_pages *vmx;
@@ -76,8 +92,7 @@ int main(int argc, char *argv[])
 	struct ucall uc;
 	bool done = false;
 
-	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
-	TEST_REQUIRE(kvm_cpu_has_ept());
+	pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled");
 
 	/* Create VM */
 	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
@@ -103,11 +118,16 @@ int main(int argc, char *argv[])
 	 *
 	 * Note that prepare_eptp should be called only L1's GPA map is done,
 	 * meaning after the last call to virt_map.
+	 *
+	 * When EPT is disabled, the L2 guest code will still access the same L1
+	 * GPAs as the EPT enabled case.
 	 */
-	prepare_eptp(vmx, vm, 0);
-	nested_map_memslot(vmx, vm, 0);
-	nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096);
-	nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096);
+	if (enable_ept) {
+		prepare_eptp(vmx, vm, 0);
+		nested_map_memslot(vmx, vm, 0);
+		nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096);
+		nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096);
+	}
 
 	bmap = bitmap_zalloc(TEST_MEM_PAGES);
 	host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
@@ -148,3 +168,15 @@ int main(int argc, char *argv[])
 		}
 	}
 }
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+	test_vmx_dirty_log(/*enable_ept=*/false);
+
+	if (kvm_cpu_has_ept())
+		test_vmx_dirty_log(/*enable_ept=*/true);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
index 2a8d4ac2f020..ea0cb3cae0f7 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
@@ -15,10 +15,11 @@
 
 #include <linux/bitmap.h>
 
+#include "kvm_test_harness.h"
 #include "kvm_util.h"
 #include "vmx.h"
 
-union perf_capabilities {
+static union perf_capabilities {
 	struct {
 		u64	lbr_format:6;
 		u64	pebs_trap:1;
@@ -32,7 +33,7 @@ union perf_capabilities {
 		u64	anythread_deprecated:1;
 	};
 	u64	capabilities;
-};
+} host_cap;
 
 /*
  * The LBR format and most PEBS features are immutable, all other features are
@@ -73,19 +74,19 @@ static void guest_code(uint64_t current_val)
 	GUEST_DONE();
 }
 
+KVM_ONE_VCPU_TEST_SUITE(vmx_pmu_caps);
+
 /*
  * Verify that guest WRMSRs to PERF_CAPABILITIES #GP regardless of the value
  * written, that the guest always sees the userspace controlled value, and that
  * PERF_CAPABILITIES is immutable after KVM_RUN.
  */
-static void test_guest_wrmsr_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 	struct ucall uc;
 	int r, i;
 
-	vm_init_descriptor_tables(vm);
+	vm_init_descriptor_tables(vcpu->vm);
 	vcpu_init_descriptor_tables(vcpu);
 
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
@@ -117,31 +118,21 @@ static void test_guest_wrmsr_perf_capabilities(union perf_capabilities host_cap)
 		TEST_ASSERT(!r, "Post-KVM_RUN write '0x%llx'didn't fail",
 			    host_cap.capabilities ^ BIT_ULL(i));
 	}
-
-	kvm_vm_free(vm);
 }
 
 /*
  * Verify KVM allows writing PERF_CAPABILITIES with all KVM-supported features
  * enabled, as well as '0' (to disable all features).
  */
-static void test_basic_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, basic_perf_capabilities, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, NULL);
-
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0);
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
-
-	kvm_vm_free(vm);
 }
 
-static void test_fungible_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, fungible_perf_capabilities, guest_code)
 {
 	const uint64_t fungible_caps = host_cap.capabilities & ~immutable_caps.capabilities;
-
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, NULL);
 	int bit;
 
 	for_each_set_bit(bit, &fungible_caps, 64) {
@@ -150,8 +141,6 @@ static void test_fungible_perf_capabilities(union perf_capabilities host_cap)
 			     host_cap.capabilities & ~BIT_ULL(bit));
 	}
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
-
-	kvm_vm_free(vm);
 }
 
 /*
@@ -160,14 +149,11 @@ static void test_fungible_perf_capabilities(union perf_capabilities host_cap)
  * separately as they are multi-bit values, e.g. toggling or setting a single
  * bit can generate a false positive without dedicated safeguards.
  */
-static void test_immutable_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, immutable_perf_capabilities, guest_code)
 {
 	const uint64_t reserved_caps = (~host_cap.capabilities |
 					immutable_caps.capabilities) &
 				       ~format_caps.capabilities;
-
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, NULL);
 	union perf_capabilities val = host_cap;
 	int r, bit;
 
@@ -201,8 +187,6 @@ static void test_immutable_perf_capabilities(union perf_capabilities host_cap)
 		TEST_ASSERT(!r, "Bad PEBS FMT = 0x%x didn't fail, host = 0x%x",
 			    val.pebs_format, host_cap.pebs_format);
 	}
-
-	kvm_vm_free(vm);
 }
 
 /*
@@ -211,17 +195,13 @@ static void test_immutable_perf_capabilities(union perf_capabilities host_cap)
  * LBR_TOS as those bits are writable across all uarch implementations (arch
  * LBRs will need to poke a different MSR).
  */
-static void test_lbr_perf_capabilities(union perf_capabilities host_cap)
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code)
 {
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
 	int r;
 
 	if (!host_cap.lbr_format)
 		return;
 
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
-
 	vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
 	vcpu_set_msr(vcpu, MSR_LBR_TOS, 7);
 
@@ -229,15 +209,11 @@ static void test_lbr_perf_capabilities(union perf_capabilities host_cap)
 
 	r = _vcpu_set_msr(vcpu, MSR_LBR_TOS, 7);
 	TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU");
-
-	kvm_vm_free(vm);
 }
 
 int main(int argc, char *argv[])
 {
-	union perf_capabilities host_cap;
-
-	TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
+	TEST_REQUIRE(kvm_is_pmu_enabled());
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
 
 	TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
@@ -248,9 +224,5 @@ int main(int argc, char *argv[])
 	TEST_ASSERT(host_cap.full_width_write,
 		    "Full-width writes should always be supported");
 
-	test_basic_perf_capabilities(host_cap);
-	test_fungible_perf_capabilities(host_cap);
-	test_immutable_perf_capabilities(host_cap);
-	test_guest_wrmsr_perf_capabilities(host_cap);
-	test_lbr_perf_capabilities(host_cap);
+	return test_harness_run(argc, argv);
 }
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index 9ec9ab60b63e..d2ea0435f4f7 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -62,6 +62,7 @@ enum {
 	TEST_POLL_TIMEOUT,
 	TEST_POLL_MASKED,
 	TEST_POLL_WAKE,
+	SET_VCPU_INFO,
 	TEST_TIMER_PAST,
 	TEST_LOCKING_SEND_RACE,
 	TEST_LOCKING_POLL_RACE,
@@ -321,6 +322,10 @@ static void guest_code(void)
 
 	GUEST_SYNC(TEST_POLL_WAKE);
 
+	/* Set the vcpu_info to point at exactly the place it already is to
+	 * make sure the attribute is functional. */
+	GUEST_SYNC(SET_VCPU_INFO);
+
 	/* A timer wake an *unmasked* port which should wake us with an
 	 * actual interrupt, while we're polling on a different port. */
 	ports[0]++;
@@ -389,6 +394,7 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
 		return 0;
 }
 
+static struct shared_info *shinfo;
 static struct vcpu_info *vinfo;
 static struct kvm_vcpu *vcpu;
 
@@ -404,20 +410,38 @@ static void *juggle_shinfo_state(void *arg)
 {
 	struct kvm_vm *vm = (struct kvm_vm *)arg;
 
-	struct kvm_xen_hvm_attr cache_activate = {
+	struct kvm_xen_hvm_attr cache_activate_gfn = {
 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
 		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
 	};
 
-	struct kvm_xen_hvm_attr cache_deactivate = {
+	struct kvm_xen_hvm_attr cache_deactivate_gfn = {
 		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
 		.u.shared_info.gfn = KVM_XEN_INVALID_GFN
 	};
 
+	struct kvm_xen_hvm_attr cache_activate_hva = {
+		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA,
+		.u.shared_info.hva = (unsigned long)shinfo
+	};
+
+	struct kvm_xen_hvm_attr cache_deactivate_hva = {
+		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+		.u.shared_info.hva = 0
+	};
+
+	int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+
 	for (;;) {
-		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate);
-		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate);
+		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn);
 		pthread_testcancel();
+		__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn);
+
+		if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) {
+			__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva);
+			pthread_testcancel();
+			__vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva);
+		}
 	}
 
 	return NULL;
@@ -442,6 +466,7 @@ int main(int argc, char *argv[])
 	bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG);
 	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
 	bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
+	bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA);
 
 	clock_gettime(CLOCK_REALTIME, &min_ts);
 
@@ -452,7 +477,7 @@ int main(int argc, char *argv[])
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0);
 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3);
 
-	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+	shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
 
 	int zero_fd = open("/dev/zero", O_RDONLY);
 	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
@@ -488,10 +513,16 @@ int main(int argc, char *argv[])
 			    "Failed to read back RUNSTATE_UPDATE_FLAG attr");
 	}
 
-	struct kvm_xen_hvm_attr ha = {
-		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
-		.u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
-	};
+	struct kvm_xen_hvm_attr ha = {};
+
+	if (has_shinfo_hva) {
+		ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA;
+		ha.u.shared_info.hva = (unsigned long)shinfo;
+	} else {
+		ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO;
+		ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE;
+	}
+
 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
 
 	/*
@@ -862,6 +893,16 @@ int main(int argc, char *argv[])
 				alarm(1);
 				break;
 
+			case SET_VCPU_INFO:
+				if (has_shinfo_hva) {
+					struct kvm_xen_vcpu_attr vih = {
+						.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA,
+						.u.hva = (unsigned long)vinfo
+					};
+					vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih);
+				}
+				break;
+
 			case TEST_TIMER_PAST:
 				TEST_ASSERT(!evtchn_irq_expected,
 					    "Expected event channel IRQ but it didn't happen");
diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
index 401e2eb092a3..7e2b431b9f90 100644
--- a/tools/testing/selftests/landlock/common.h
+++ b/tools/testing/selftests/landlock/common.h
@@ -74,31 +74,19 @@ static void _init_caps(struct __test_metadata *const _metadata, bool drop_all)
 		EXPECT_EQ(0, cap_set_secbits(noroot));
 
 	cap_p = cap_get_proc();
-	EXPECT_NE(NULL, cap_p)
-	{
-		TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_clear(cap_p))
-	{
-		TH_LOG("Failed to cap_clear: %s", strerror(errno));
-	}
+	EXPECT_NE(NULL, cap_p);
+	EXPECT_NE(-1, cap_clear(cap_p));
 	if (!drop_all) {
 		EXPECT_NE(-1, cap_set_flag(cap_p, CAP_PERMITTED,
-					   ARRAY_SIZE(caps), caps, CAP_SET))
-		{
-			TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
-		}
+					   ARRAY_SIZE(caps), caps, CAP_SET));
 	}
 
 	/* Automatically resets ambient capabilities. */
 	EXPECT_NE(-1, cap_set_proc(cap_p))
 	{
-		TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_free(cap_p))
-	{
-		TH_LOG("Failed to cap_free: %s", strerror(errno));
+		TH_LOG("Failed to set capabilities: %s", strerror(errno));
 	}
+	EXPECT_NE(-1, cap_free(cap_p));
 
 	/* Quickly checks that ambient capabilities are cleared. */
 	EXPECT_NE(-1, cap_get_ambient(caps[0]));
@@ -122,22 +110,13 @@ static void _change_cap(struct __test_metadata *const _metadata,
 	cap_t cap_p;
 
 	cap_p = cap_get_proc();
-	EXPECT_NE(NULL, cap_p)
-	{
-		TH_LOG("Failed to cap_get_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value))
-	{
-		TH_LOG("Failed to cap_set_flag: %s", strerror(errno));
-	}
+	EXPECT_NE(NULL, cap_p);
+	EXPECT_NE(-1, cap_set_flag(cap_p, flag, 1, &cap, value));
 	EXPECT_NE(-1, cap_set_proc(cap_p))
 	{
-		TH_LOG("Failed to cap_set_proc: %s", strerror(errno));
-	}
-	EXPECT_NE(-1, cap_free(cap_p))
-	{
-		TH_LOG("Failed to cap_free: %s", strerror(errno));
+		TH_LOG("Failed to set capability %d: %s", cap, strerror(errno));
 	}
+	EXPECT_NE(-1, cap_free(cap_p));
 }
 
 static void __maybe_unused set_cap(struct __test_metadata *const _metadata,
diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 9a6036fbf289..27744524df51 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -9,6 +9,7 @@
 
 #define _GNU_SOURCE
 #include <fcntl.h>
+#include <libgen.h>
 #include <linux/landlock.h>
 #include <linux/magic.h>
 #include <sched.h>
@@ -285,15 +286,21 @@ static void prepare_layout_opt(struct __test_metadata *const _metadata,
 
 static void prepare_layout(struct __test_metadata *const _metadata)
 {
-	_metadata->teardown_parent = true;
-
 	prepare_layout_opt(_metadata, &mnt_tmp);
 }
 
 static void cleanup_layout(struct __test_metadata *const _metadata)
 {
 	set_cap(_metadata, CAP_SYS_ADMIN);
-	EXPECT_EQ(0, umount(TMP_DIR));
+	if (umount(TMP_DIR)) {
+		/*
+		 * According to the test environment, the mount point of the
+		 * current directory may be shared or not, which changes the
+		 * visibility of the nested TMP_DIR mount point for the test's
+		 * parent process doing this cleanup.
+		 */
+		ASSERT_EQ(EINVAL, errno);
+	}
 	clear_cap(_metadata, CAP_SYS_ADMIN);
 	EXPECT_EQ(0, remove_path(TMP_DIR));
 }
@@ -307,7 +314,7 @@ FIXTURE_SETUP(layout0)
 	prepare_layout(_metadata);
 }
 
-FIXTURE_TEARDOWN(layout0)
+FIXTURE_TEARDOWN_PARENT(layout0)
 {
 	cleanup_layout(_metadata);
 }
@@ -370,7 +377,7 @@ FIXTURE_SETUP(layout1)
 	create_layout1(_metadata);
 }
 
-FIXTURE_TEARDOWN(layout1)
+FIXTURE_TEARDOWN_PARENT(layout1)
 {
 	remove_layout1(_metadata);
 
@@ -3683,7 +3690,7 @@ FIXTURE_SETUP(ftruncate)
 	create_file(_metadata, file1_s1d1);
 }
 
-FIXTURE_TEARDOWN(ftruncate)
+FIXTURE_TEARDOWN_PARENT(ftruncate)
 {
 	EXPECT_EQ(0, remove_path(file1_s1d1));
 	cleanup_layout(_metadata);
@@ -3861,7 +3868,7 @@ FIXTURE_SETUP(layout1_bind)
 	clear_cap(_metadata, CAP_SYS_ADMIN);
 }
 
-FIXTURE_TEARDOWN(layout1_bind)
+FIXTURE_TEARDOWN_PARENT(layout1_bind)
 {
 	/* umount(dir_s2d2)) is handled by namespace lifetime. */
 
@@ -4266,7 +4273,7 @@ FIXTURE_SETUP(layout2_overlay)
 	clear_cap(_metadata, CAP_SYS_ADMIN);
 }
 
-FIXTURE_TEARDOWN(layout2_overlay)
+FIXTURE_TEARDOWN_PARENT(layout2_overlay)
 {
 	if (self->skip_test)
 		SKIP(return, "overlayfs is not supported (teardown)");
@@ -4616,7 +4623,6 @@ FIXTURE(layout3_fs)
 {
 	bool has_created_dir;
 	bool has_created_file;
-	char *dir_path;
 	bool skip_test;
 };
 
@@ -4675,11 +4681,24 @@ FIXTURE_VARIANT_ADD(layout3_fs, hostfs) {
 	.cwd_fs_magic = HOSTFS_SUPER_MAGIC,
 };
 
+static char *dirname_alloc(const char *path)
+{
+	char *dup;
+
+	if (!path)
+		return NULL;
+
+	dup = strdup(path);
+	if (!dup)
+		return NULL;
+
+	return dirname(dup);
+}
+
 FIXTURE_SETUP(layout3_fs)
 {
 	struct stat statbuf;
-	const char *slash;
-	size_t dir_len;
+	char *dir_path = dirname_alloc(variant->file_path);
 
 	if (!supports_filesystem(variant->mnt.type) ||
 	    !cwd_matches_fs(variant->cwd_fs_magic)) {
@@ -4687,27 +4706,15 @@ FIXTURE_SETUP(layout3_fs)
 		SKIP(return, "this filesystem is not supported (setup)");
 	}
 
-	_metadata->teardown_parent = true;
-
-	slash = strrchr(variant->file_path, '/');
-	ASSERT_NE(slash, NULL);
-	dir_len = (size_t)slash - (size_t)variant->file_path;
-	ASSERT_LT(0, dir_len);
-	self->dir_path = malloc(dir_len + 1);
-	self->dir_path[dir_len] = '\0';
-	strncpy(self->dir_path, variant->file_path, dir_len);
-
 	prepare_layout_opt(_metadata, &variant->mnt);
 
 	/* Creates directory when required. */
-	if (stat(self->dir_path, &statbuf)) {
+	if (stat(dir_path, &statbuf)) {
 		set_cap(_metadata, CAP_DAC_OVERRIDE);
-		EXPECT_EQ(0, mkdir(self->dir_path, 0700))
+		EXPECT_EQ(0, mkdir(dir_path, 0700))
 		{
 			TH_LOG("Failed to create directory \"%s\": %s",
-			       self->dir_path, strerror(errno));
-			free(self->dir_path);
-			self->dir_path = NULL;
+			       dir_path, strerror(errno));
 		}
 		self->has_created_dir = true;
 		clear_cap(_metadata, CAP_DAC_OVERRIDE);
@@ -4728,9 +4735,11 @@ FIXTURE_SETUP(layout3_fs)
 		self->has_created_file = true;
 		clear_cap(_metadata, CAP_DAC_OVERRIDE);
 	}
+
+	free(dir_path);
 }
 
-FIXTURE_TEARDOWN(layout3_fs)
+FIXTURE_TEARDOWN_PARENT(layout3_fs)
 {
 	if (self->skip_test)
 		SKIP(return, "this filesystem is not supported (teardown)");
@@ -4746,16 +4755,17 @@ FIXTURE_TEARDOWN(layout3_fs)
 	}
 
 	if (self->has_created_dir) {
+		char *dir_path = dirname_alloc(variant->file_path);
+
 		set_cap(_metadata, CAP_DAC_OVERRIDE);
 		/*
 		 * Don't check for error because the directory might already
 		 * have been removed (cf. release_inode test).
 		 */
-		rmdir(self->dir_path);
+		rmdir(dir_path);
 		clear_cap(_metadata, CAP_DAC_OVERRIDE);
+		free(dir_path);
 	}
-	free(self->dir_path);
-	self->dir_path = NULL;
 
 	cleanup_layout(_metadata);
 }
@@ -4822,7 +4832,10 @@ TEST_F_FORK(layout3_fs, tag_inode_dir_mnt)
 
 TEST_F_FORK(layout3_fs, tag_inode_dir_child)
 {
-	layer3_fs_tag_inode(_metadata, self, variant, self->dir_path);
+	char *dir_path = dirname_alloc(variant->file_path);
+
+	layer3_fs_tag_inode(_metadata, self, variant, dir_path);
+	free(dir_path);
 }
 
 TEST_F_FORK(layout3_fs, tag_inode_file)
@@ -4849,9 +4862,13 @@ TEST_F_FORK(layout3_fs, release_inodes)
 	if (self->has_created_file)
 		EXPECT_EQ(0, remove_path(variant->file_path));
 
-	if (self->has_created_dir)
+	if (self->has_created_dir) {
+		char *dir_path = dirname_alloc(variant->file_path);
+
 		/* Don't check for error because of cgroup specificities. */
-		remove_path(self->dir_path);
+		remove_path(dir_path);
+		free(dir_path);
+	}
 
 	ruleset_fd =
 		create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_DIR, layer1);
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index da2cade3bab0..33c24ceddfb7 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -7,6 +7,8 @@ else ifneq ($(filter -%,$(LLVM)),)
 LLVM_SUFFIX := $(LLVM)
 endif
 
+CLANG := $(LLVM_PREFIX)clang$(LLVM_SUFFIX)
+
 CLANG_TARGET_FLAGS_arm          := arm-linux-gnueabi
 CLANG_TARGET_FLAGS_arm64        := aarch64-linux-gnu
 CLANG_TARGET_FLAGS_hexagon      := hexagon-linux-musl
@@ -18,7 +20,13 @@ CLANG_TARGET_FLAGS_riscv        := riscv64-linux-gnu
 CLANG_TARGET_FLAGS_s390         := s390x-linux-gnu
 CLANG_TARGET_FLAGS_x86          := x86_64-linux-gnu
 CLANG_TARGET_FLAGS_x86_64       := x86_64-linux-gnu
-CLANG_TARGET_FLAGS              := $(CLANG_TARGET_FLAGS_$(ARCH))
+
+# Default to host architecture if ARCH is not explicitly given.
+ifeq ($(ARCH),)
+CLANG_TARGET_FLAGS := $(shell $(CLANG) -print-target-triple)
+else
+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH))
+endif
 
 ifeq ($(CROSS_COMPILE),)
 ifeq ($(CLANG_TARGET_FLAGS),)
@@ -30,7 +38,7 @@ else
 CLANG_FLAGS     += --target=$(notdir $(CROSS_COMPILE:%-=%))
 endif # CROSS_COMPILE
 
-CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as
+CC := $(CLANG) $(CLANG_FLAGS) -fintegrated-as
 else
 CC := $(CROSS_COMPILE)gcc
 endif # LLVM
@@ -44,8 +52,22 @@ endif
 selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST))))
 top_srcdir = $(selfdir)/../../..
 
+# msg: emit succinct information message describing current building step
+# $1 - generic step name (e.g., CC, LINK, etc);
+# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor];
+# $3 - target (assumed to be file); only file name will be emitted;
+# $4 - optional extra arg, emitted as-is, if provided.
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q = @
+msg = @printf '  %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))";
+MAKEFLAGS += --no-print-directory
+endif
+
 ifeq ($(KHDR_INCLUDES),)
-KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include
+KHDR_INCLUDES := -D_GNU_SOURCE -isystem $(top_srcdir)/usr/include
 endif
 
 # The following are built by lib.mk common compile rules.
@@ -176,7 +198,8 @@ endif
 ifeq ($(OVERRIDE_TARGETS),)
 LOCAL_HDRS += $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h
 $(OUTPUT)/%:%.c $(LOCAL_HDRS)
-	$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@
+	$(call msg,CC,,$@)
+	$(Q)$(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@
 
 $(OUTPUT)/%.o:%.S
 	$(COMPILE.S) $^ -o $@
diff --git a/tools/testing/selftests/lsm/common.h b/tools/testing/selftests/lsm/common.h
index d404329e5eeb..06d12110d241 100644
--- a/tools/testing/selftests/lsm/common.h
+++ b/tools/testing/selftests/lsm/common.h
@@ -7,7 +7,7 @@
 
 #ifndef lsm_get_self_attr
 static inline int lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
-				    size_t *size, __u32 flags)
+				    __u32 *size, __u32 flags)
 {
 	return syscall(__NR_lsm_get_self_attr, attr, ctx, size, flags);
 }
@@ -15,14 +15,14 @@ static inline int lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 
 #ifndef lsm_set_self_attr
 static inline int lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
-				    size_t size, __u32 flags)
+				    __u32 size, __u32 flags)
 {
 	return syscall(__NR_lsm_set_self_attr, attr, ctx, size, flags);
 }
 #endif
 
 #ifndef lsm_list_modules
-static inline int lsm_list_modules(__u64 *ids, size_t *size, __u32 flags)
+static inline int lsm_list_modules(__u64 *ids, __u32 *size, __u32 flags)
 {
 	return syscall(__NR_lsm_list_modules, ids, size, flags);
 }
diff --git a/tools/testing/selftests/lsm/lsm_get_self_attr_test.c b/tools/testing/selftests/lsm/lsm_get_self_attr_test.c
index e0e313d9047a..df215e4aa63f 100644
--- a/tools/testing/selftests/lsm/lsm_get_self_attr_test.c
+++ b/tools/testing/selftests/lsm/lsm_get_self_attr_test.c
@@ -40,7 +40,7 @@ TEST(size_null_lsm_get_self_attr)
 TEST(ctx_null_lsm_get_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
-	size_t size = page_size;
+	__u32 size = page_size;
 	int rc;
 
 	rc = lsm_get_self_attr(LSM_ATTR_CURRENT, NULL, &size, 0);
@@ -57,7 +57,7 @@ TEST(size_too_small_lsm_get_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	struct lsm_ctx *ctx = calloc(page_size, 1);
-	size_t size = 1;
+	__u32 size = 1;
 
 	ASSERT_NE(NULL, ctx);
 	errno = 0;
@@ -77,7 +77,7 @@ TEST(flags_zero_lsm_get_self_attr)
 	const long page_size = sysconf(_SC_PAGESIZE);
 	struct lsm_ctx *ctx = calloc(page_size, 1);
 	__u64 *syscall_lsms = calloc(page_size, 1);
-	size_t size;
+	__u32 size;
 	int lsmcount;
 	int i;
 
@@ -117,7 +117,7 @@ TEST(flags_overset_lsm_get_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	struct lsm_ctx *ctx = calloc(page_size, 1);
-	size_t size;
+	__u32 size;
 
 	ASSERT_NE(NULL, ctx);
 
@@ -140,7 +140,7 @@ TEST(flags_overset_lsm_get_self_attr)
 TEST(basic_lsm_get_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
-	size_t size = page_size;
+	__u32 size = page_size;
 	struct lsm_ctx *ctx = calloc(page_size, 1);
 	struct lsm_ctx *tctx = NULL;
 	__u64 *syscall_lsms = calloc(page_size, 1);
diff --git a/tools/testing/selftests/lsm/lsm_list_modules_test.c b/tools/testing/selftests/lsm/lsm_list_modules_test.c
index 4d5d4cee2586..06d24d4679a6 100644
--- a/tools/testing/selftests/lsm/lsm_list_modules_test.c
+++ b/tools/testing/selftests/lsm/lsm_list_modules_test.c
@@ -31,7 +31,7 @@ TEST(size_null_lsm_list_modules)
 TEST(ids_null_lsm_list_modules)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
-	size_t size = page_size;
+	__u32 size = page_size;
 
 	errno = 0;
 	ASSERT_EQ(-1, lsm_list_modules(NULL, &size, 0));
@@ -43,7 +43,7 @@ TEST(size_too_small_lsm_list_modules)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	__u64 *syscall_lsms = calloc(page_size, 1);
-	size_t size = 1;
+	__u32 size = 1;
 
 	ASSERT_NE(NULL, syscall_lsms);
 	errno = 0;
@@ -58,7 +58,7 @@ TEST(flags_set_lsm_list_modules)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	__u64 *syscall_lsms = calloc(page_size, 1);
-	size_t size = page_size;
+	__u32 size = page_size;
 
 	ASSERT_NE(NULL, syscall_lsms);
 	errno = 0;
@@ -72,7 +72,7 @@ TEST(flags_set_lsm_list_modules)
 TEST(correct_lsm_list_modules)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
-	size_t size = page_size;
+	__u32 size = page_size;
 	__u64 *syscall_lsms = calloc(page_size, 1);
 	char *sysfs_lsms = calloc(page_size, 1);
 	char *name;
diff --git a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c
index e9712c6cf596..66dec47e3ca3 100644
--- a/tools/testing/selftests/lsm/lsm_set_self_attr_test.c
+++ b/tools/testing/selftests/lsm/lsm_set_self_attr_test.c
@@ -25,7 +25,7 @@ TEST(size_too_small_lsm_set_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	struct lsm_ctx *ctx = calloc(page_size, 1);
-	size_t size = page_size;
+	__u32 size = page_size;
 
 	ASSERT_NE(NULL, ctx);
 	if (attr_lsm_count()) {
@@ -41,7 +41,7 @@ TEST(flags_zero_lsm_set_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	struct lsm_ctx *ctx = calloc(page_size, 1);
-	size_t size = page_size;
+	__u32 size = page_size;
 
 	ASSERT_NE(NULL, ctx);
 	if (attr_lsm_count()) {
@@ -57,7 +57,7 @@ TEST(flags_overset_lsm_set_self_attr)
 {
 	const long page_size = sysconf(_SC_PAGESIZE);
 	char *ctx = calloc(page_size, 1);
-	size_t size = page_size;
+	__u32 size = page_size;
 	struct lsm_ctx *tctx = (struct lsm_ctx *)ctx;
 
 	ASSERT_NE(NULL, ctx);
diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
index a9cc17facfb3..4e14dba81234 100644
--- a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
@@ -69,5 +69,5 @@ int main(int argc, char **argv)
 	/* Multi-threaded */
 	test_mt_membarrier();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
index 4cdc8b1d124c..fa3f1d6c37a0 100644
--- a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
@@ -24,5 +24,5 @@ int main(int argc, char **argv)
 
 	test_membarrier_get_registrations(/*cmd=*/0);
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 3df008677239..18f585684e20 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -44,8 +44,6 @@
  */
 static size_t mfd_def_size = MFD_DEF_SIZE;
 static const char *memfd_str = MEMFD_STR;
-static int newpid_thread_fn2(void *arg);
-static void join_newpid_thread(pid_t pid);
 
 static ssize_t fd2name(int fd, char *buf, size_t bufsize)
 {
@@ -194,7 +192,6 @@ static unsigned int mfd_assert_get_seals(int fd)
 static void mfd_assert_has_seals(int fd, unsigned int seals)
 {
 	char buf[PATH_MAX];
-	int nbytes;
 	unsigned int s;
 	fd2name(fd, buf, PATH_MAX);
 
@@ -696,7 +693,6 @@ static void mfd_assert_mode(int fd, int mode)
 {
 	struct stat st;
 	char buf[PATH_MAX];
-	int nbytes;
 
 	fd2name(fd, buf, PATH_MAX);
 
@@ -715,7 +711,6 @@ static void mfd_assert_mode(int fd, int mode)
 static void mfd_assert_chmod(int fd, int mode)
 {
 	char buf[PATH_MAX];
-	int nbytes;
 
 	fd2name(fd, buf, PATH_MAX);
 
@@ -731,7 +726,6 @@ static void mfd_fail_chmod(int fd, int mode)
 {
 	struct stat st;
 	char buf[PATH_MAX];
-	int nbytes;
 
 	fd2name(fd, buf, PATH_MAX);
 
@@ -1254,9 +1248,6 @@ static void test_sysctl_set_sysctl2(void)
 
 static int sysctl_simple_child(void *arg)
 {
-	int fd;
-	int pid;
-
 	printf("%s sysctl 0\n", memfd_str);
 	test_sysctl_set_sysctl0();
 
@@ -1321,7 +1312,6 @@ static void test_sysctl_sysctl2_failset(void)
 
 static int sysctl_nested_child(void *arg)
 {
-	int fd;
 	int pid;
 
 	printf("%s nested sysctl 0\n", memfd_str);
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 4ff10ea61461..d26e962f2ac4 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -46,3 +46,4 @@ gup_longterm
 mkdirty
 va_high_addr_switch
 hugetlb_fault_after_madv
+hugetlb_madv_vs_map
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 2453add65d12..410495e0a611 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -12,7 +12,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not)
 else
 uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+')
 endif
-ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
+ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/powerpc/')
 endif
 
 # Without this, failed build products remain, with up-to-date timestamps,
@@ -70,6 +70,7 @@ TEST_GEN_FILES += ksm_tests
 TEST_GEN_FILES += ksm_functional_tests
 TEST_GEN_FILES += mdwe_test
 TEST_GEN_FILES += hugetlb_fault_after_madv
+TEST_GEN_FILES += hugetlb_madv_vs_map
 
 ifneq ($(ARCH),arm64)
 TEST_GEN_FILES += soft-dirty
@@ -97,13 +98,13 @@ TEST_GEN_FILES += $(BINARIES_64)
 endif
 else
 
-ifneq (,$(findstring $(ARCH),ppc64))
+ifneq (,$(findstring $(ARCH),powerpc))
 TEST_GEN_FILES += protection_keys
 endif
 
 endif
 
-ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64))
+ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64))
 TEST_GEN_FILES += va_high_addr_switch
 TEST_GEN_FILES += virtual_address_range
 TEST_GEN_FILES += write_to_hugetlbfs
@@ -114,6 +115,11 @@ TEST_PROGS := run_vmtests.sh
 TEST_FILES := test_vmalloc.sh
 TEST_FILES += test_hmm.sh
 TEST_FILES += va_high_addr_switch.sh
+TEST_FILES += charge_reserved_hugetlb.sh
+TEST_FILES += hugetlb_reparenting_test.sh
+
+# required by charge_reserved_hugetlb.sh
+TEST_FILES += write_hugetlb_memory.sh
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index e14bdd4455f2..d680c00d2853 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -11,6 +11,8 @@ if [[ $(id -u) -ne 0 ]]; then
   exit $ksft_skip
 fi
 
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+
 fault_limit_file=limit_in_bytes
 reservation_limit_file=rsvd.limit_in_bytes
 fault_usage_file=usage_in_bytes
@@ -582,3 +584,5 @@ if [[ $do_umount ]]; then
   umount $cgroup_path
   rmdir $cgroup_path
 fi
+
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
index 656afba02dbc..4f42eb7d7636 100644
--- a/tools/testing/selftests/mm/compaction_test.c
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -95,21 +95,22 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
 
 	fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
 	if (fd < 0) {
-		ksft_test_result_fail("Failed to open /proc/sys/vm/nr_hugepages: %s\n",
-				      strerror(errno));
-		return -1;
+		ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n",
+			       strerror(errno));
+		ret = -1;
+		goto out;
 	}
 
 	if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
-		ksft_test_result_fail("Failed to read from /proc/sys/vm/nr_hugepages: %s\n",
-				      strerror(errno));
+		ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n",
+			       strerror(errno));
 		goto close_fd;
 	}
 
 	/* Start with the initial condition of 0 huge pages*/
 	if (write(fd, "0", sizeof(char)) != sizeof(char)) {
-		ksft_test_result_fail("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n",
-				      strerror(errno));
+		ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n",
+			       strerror(errno));
 		goto close_fd;
 	}
 
@@ -118,16 +119,16 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
 	/* Request a large number of huge pages. The Kernel will allocate
 	   as much as it can */
 	if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
-		ksft_test_result_fail("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
-				      strerror(errno));
+		ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
+			       strerror(errno));
 		goto close_fd;
 	}
 
 	lseek(fd, 0, SEEK_SET);
 
 	if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
-		ksft_test_result_fail("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n",
-				      strerror(errno));
+		ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n",
+			       strerror(errno));
 		goto close_fd;
 	}
 
@@ -139,24 +140,26 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
 
 	if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
 	    != strlen(initial_nr_hugepages)) {
-		ksft_test_result_fail("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n",
-				      strerror(errno));
+		ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n",
+			       strerror(errno));
 		goto close_fd;
 	}
 
+	ksft_print_msg("Number of huge pages allocated = %d\n",
+		       atoi(nr_hugepages));
+
 	if (compaction_index > 3) {
 		ksft_print_msg("ERROR: Less that 1/%d of memory is available\n"
 			       "as huge pages\n", compaction_index);
-		ksft_test_result_fail("No of huge pages allocated = %d\n", (atoi(nr_hugepages)));
 		goto close_fd;
 	}
 
-	ksft_test_result_pass("Memory compaction succeeded. No of huge pages allocated = %d\n",
-			      (atoi(nr_hugepages)));
 	ret = 0;
 
  close_fd:
 	close(fd);
+ out:
+	ksft_test_result(ret == 0, "check_compaction\n");
 	return ret;
 }
 
@@ -174,7 +177,7 @@ int main(int argc, char **argv)
 	ksft_print_header();
 
 	if (prereq() || geteuid())
-		return ksft_exit_pass();
+		ksft_exit_skip("Prerequisites unsatisfied\n");
 
 	ksft_set_plan(1);
 
@@ -222,7 +225,7 @@ int main(int argc, char **argv)
 	}
 
 	if (check_compaction(mem_free, hugepage_size) == 0)
-		return ksft_exit_pass();
+		ksft_exit_pass();
 
-	return ksft_exit_fail();
+	ksft_exit_fail();
 }
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index 363bf5f801be..fe078d6e1806 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -1779,5 +1779,5 @@ int main(int argc, char **argv)
 	if (err)
 		ksft_exit_fail_msg("%d out of %d tests failed\n",
 				   err, ksft_test_num());
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c
index ad168d35b23b..d7eaca5bbe9b 100644
--- a/tools/testing/selftests/mm/gup_longterm.c
+++ b/tools/testing/selftests/mm/gup_longterm.c
@@ -456,5 +456,5 @@ int main(int argc, char **argv)
 	if (err)
 		ksft_exit_fail_msg("%d out of %d tests failed\n",
 				   err, ksft_test_num());
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
index cbe99594d319..bd335cf9bc0e 100644
--- a/tools/testing/selftests/mm/gup_test.c
+++ b/tools/testing/selftests/mm/gup_test.c
@@ -203,7 +203,7 @@ int main(int argc, char **argv)
 	ksft_print_header();
 	ksft_set_plan(nthreads);
 
-	filed = open(file, O_RDWR|O_CREAT);
+	filed = open(file, O_RDWR|O_CREAT, 0664);
 	if (filed < 0)
 		ksft_exit_fail_msg("Unable to open %s: %s\n", file, strerror(errno));
 
@@ -228,7 +228,7 @@ int main(int argc, char **argv)
 			break;
 		}
 		ksft_test_result_skip("Please run this test as root\n");
-		return ksft_exit_pass();
+		ksft_exit_pass();
 	}
 
 	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
@@ -267,5 +267,5 @@ int main(int argc, char **argv)
 
 	free(tid);
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
index f32d99565c5e..e74107185324 100644
--- a/tools/testing/selftests/mm/hugetlb-madvise.c
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -19,6 +19,7 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include "vm_util.h"
+#include "../kselftest.h"
 
 #define MIN_FREE_PAGES	20
 #define NR_HUGE_PAGES	10	/* common number of pages to map/allocate */
@@ -78,7 +79,7 @@ int main(int argc, char **argv)
 	free_hugepages = get_free_hugepages();
 	if (free_hugepages < MIN_FREE_PAGES) {
 		printf("Not enough free huge pages to test, exiting!\n");
-		exit(1);
+		exit(KSFT_SKIP);
 	}
 
 	fd = memfd_create(argv[0], MFD_HUGETLB);
diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
new file mode 100644
index 000000000000..d01e8d4901d0
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case that must run on a system with one and only one huge page available.
+ *	# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ *
+ * During setup, the test allocates the only available page, and starts three threads:
+ *  - thread1:
+ *	* madvise(MADV_DONTNEED) on the allocated huge page
+ *  - thread 2:
+ *	* Write to the allocated huge page
+ *  - thread 3:
+ *	* Try to allocated an extra huge page (which must not available)
+ *
+ *  The test fails if thread3 is able to allocate a page.
+ *
+ *  Touching the first page after thread3's allocation will raise a SIGBUS
+ *
+ *  Author: Breno Leitao <leitao@debian.org>
+ */
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define MMAP_SIZE (1 << 21)
+#define INLOOP_ITER 100
+
+char *huge_ptr;
+
+/* Touch the memory while it is being madvised() */
+void *touch(void *unused)
+{
+	for (int i = 0; i < INLOOP_ITER; i++)
+		huge_ptr[0] = '.';
+
+	return NULL;
+}
+
+void *madv(void *unused)
+{
+	for (int i = 0; i < INLOOP_ITER; i++)
+		madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED);
+
+	return NULL;
+}
+
+/*
+ * We got here, and there must be no huge page available for mapping
+ * The other hugepage should be flipping from used <-> reserved, because
+ * of madvise(DONTNEED).
+ */
+void *map_extra(void *unused)
+{
+	void *ptr;
+
+	for (int i = 0; i < INLOOP_ITER; i++) {
+		ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+			   -1, 0);
+
+		if ((long)ptr != -1) {
+			/* Touching the other page now will cause a SIGBUG
+			 * huge_ptr[0] = '1';
+			 */
+			return ptr;
+		}
+	}
+
+	return NULL;
+}
+
+int main(void)
+{
+	pthread_t thread1, thread2, thread3;
+	unsigned long free_hugepages;
+	void *ret;
+
+	/*
+	 * On kernel 6.7, we are able to reproduce the problem with ~10
+	 * interactions
+	 */
+	int max = 10;
+
+	free_hugepages = get_free_hugepages();
+
+	if (free_hugepages != 1) {
+		ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
+			       free_hugepages);
+	}
+
+	while (max--) {
+		huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE,
+				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+				-1, 0);
+
+		if ((unsigned long)huge_ptr == -1) {
+			ksft_exit_skip("Failed to allocated huge page\n");
+			return KSFT_SKIP;
+		}
+
+		pthread_create(&thread1, NULL, madv, NULL);
+		pthread_create(&thread2, NULL, touch, NULL);
+		pthread_create(&thread3, NULL, map_extra, NULL);
+
+		pthread_join(thread1, NULL);
+		pthread_join(thread2, NULL);
+		pthread_join(thread3, &ret);
+
+		if (ret) {
+			ksft_test_result_fail("Unexpected huge page allocation\n");
+			return KSFT_FAIL;
+		}
+
+		/* Unmap and restart */
+		munmap(huge_ptr, MMAP_SIZE);
+	}
+
+	return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 14d26075c863..11f9bbe7dc22 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -11,6 +11,7 @@ if [[ $(id -u) -ne 0 ]]; then
   exit $ksft_skip
 fi
 
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
 usage_file=usage_in_bytes
 
 if [[ "$1" == "-cgroup-v2" ]]; then
@@ -248,5 +249,9 @@ cleanup
 
 echo ALL PASS
 
-umount $CGROUP_ROOT
-rm -rf $CGROUP_ROOT
+if [[ $do_umount ]]; then
+  umount $CGROUP_ROOT
+  rm -rf $CGROUP_ROOT
+fi
+
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index fbff0dd09191..508287560c45 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -155,12 +155,12 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot,
 	/* Stabilize accounting by disabling KSM completely. */
 	if (ksm_unmerge()) {
 		ksft_test_result_fail("Disabling (unmerging) KSM failed\n");
-		goto unmap;
+		return MAP_FAILED;
 	}
 
 	if (get_my_merging_pages() > 0) {
 		ksft_test_result_fail("Still pages merged\n");
-		goto unmap;
+		return MAP_FAILED;
 	}
 
 	map = mmap(NULL, size, PROT_READ|PROT_WRITE,
@@ -646,5 +646,5 @@ int main(int argc, char **argv)
 	if (err)
 		ksft_exit_fail_msg("%d out of %d tests failed\n",
 				   err, ksft_test_num());
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
index 17bcb07f19f3..ef7d911da13e 100644
--- a/tools/testing/selftests/mm/madv_populate.c
+++ b/tools/testing/selftests/mm/madv_populate.c
@@ -307,5 +307,5 @@ int main(int argc, char **argv)
 	if (err)
 		ksft_exit_fail_msg("%d out of %d tests failed\n",
 				   err, ksft_test_num());
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
index 598159f3df1f..b74813fdc951 100644
--- a/tools/testing/selftests/mm/map_fixed_noreplace.c
+++ b/tools/testing/selftests/mm/map_fixed_noreplace.c
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include "../kselftest.h"
 
 static void dump_maps(void)
 {
@@ -28,15 +29,12 @@ static unsigned long find_base_addr(unsigned long size)
 
 	flags = MAP_PRIVATE | MAP_ANONYMOUS;
 	addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
-	if (addr == MAP_FAILED) {
-		printf("Error: couldn't map the space we need for the test\n");
-		return 0;
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n");
+
+	if (munmap(addr, size) != 0)
+		ksft_exit_fail_msg("Error: munmap failed\n");
 
-	if (munmap(addr, size) != 0) {
-		printf("Error: couldn't map the space we need for the test\n");
-		return 0;
-	}
 	return (unsigned long)addr;
 }
 
@@ -46,51 +44,39 @@ int main(void)
 	unsigned long flags, addr, size, page_size;
 	char *p;
 
+	ksft_print_header();
+	ksft_set_plan(9);
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
-	//let's find a base addr that is free before we start the tests
+	/* let's find a base addr that is free before we start the tests */
 	size = 5 * page_size;
 	base_addr = find_base_addr(size);
-	if (!base_addr) {
-		printf("Error: couldn't map the space we need for the test\n");
-		return 1;
-	}
 
 	flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
 
-	// Check we can map all the areas we need below
-	errno = 0;
+	/* Check we can map all the areas we need below */
 	addr = base_addr;
 	size = 5 * page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p == MAP_FAILED) {
 		dump_maps();
-		printf("Error: couldn't map the space we need for the test\n");
-		return 1;
+		ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n");
 	}
-
-	errno = 0;
 	if (munmap((void *)addr, 5 * page_size) != 0) {
 		dump_maps();
-		printf("Error: munmap failed!?\n");
-		return 1;
+		ksft_exit_fail_msg("Error: munmap failed!?\n");
 	}
-	printf("unmap() successful\n");
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
-	errno = 0;
 	addr = base_addr + page_size;
 	size = 3 * page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p == MAP_FAILED) {
 		dump_maps();
-		printf("Error: first mmap() failed unexpectedly\n");
-		return 1;
+		ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	/*
 	 * Exact same mapping again:
@@ -100,17 +86,14 @@ int main(void)
 	 *     +3 | mapped | new
 	 *     +4 |  free  | new
 	 */
-	errno = 0;
 	addr = base_addr;
 	size = 5 * page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p != MAP_FAILED) {
 		dump_maps();
-		printf("Error:1: mmap() succeeded when it shouldn't have\n");
-		return 1;
+		ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	/*
 	 * Second mapping contained within first:
@@ -121,17 +104,14 @@ int main(void)
 	 *     +3 | mapped |
 	 *     +4 |  free  |
 	 */
-	errno = 0;
 	addr = base_addr + (2 * page_size);
 	size = page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p != MAP_FAILED) {
 		dump_maps();
-		printf("Error:2: mmap() succeeded when it shouldn't have\n");
-		return 1;
+		ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	/*
 	 * Overlap end of existing mapping:
@@ -141,17 +121,14 @@ int main(void)
 	 *     +3 | mapped | new
 	 *     +4 |  free  | new
 	 */
-	errno = 0;
 	addr = base_addr + (3 * page_size);
 	size = 2 * page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p != MAP_FAILED) {
 		dump_maps();
-		printf("Error:3: mmap() succeeded when it shouldn't have\n");
-		return 1;
+		ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	/*
 	 * Overlap start of existing mapping:
@@ -161,17 +138,14 @@ int main(void)
 	 *     +3 | mapped |
 	 *     +4 |  free  |
 	 */
-	errno = 0;
 	addr = base_addr;
 	size = 2 * page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p != MAP_FAILED) {
 		dump_maps();
-		printf("Error:4: mmap() succeeded when it shouldn't have\n");
-		return 1;
+		ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	/*
 	 * Adjacent to start of existing mapping:
@@ -181,17 +155,14 @@ int main(void)
 	 *     +3 | mapped |
 	 *     +4 |  free  |
 	 */
-	errno = 0;
 	addr = base_addr;
 	size = page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p == MAP_FAILED) {
 		dump_maps();
-		printf("Error:5: mmap() failed when it shouldn't have\n");
-		return 1;
+		ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	/*
 	 * Adjacent to end of existing mapping:
@@ -201,27 +172,22 @@ int main(void)
 	 *     +3 | mapped |
 	 *     +4 |  free  |  new
 	 */
-	errno = 0;
 	addr = base_addr + (4 * page_size);
 	size = page_size;
 	p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-	printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
 	if (p == MAP_FAILED) {
 		dump_maps();
-		printf("Error:6: mmap() failed when it shouldn't have\n");
-		return 1;
+		ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n");
 	}
+	ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
 
 	addr = base_addr;
 	size = 5 * page_size;
 	if (munmap((void *)addr, size) != 0) {
 		dump_maps();
-		printf("Error: munmap failed!?\n");
-		return 1;
+		ksft_exit_fail_msg("Error: munmap failed!?\n");
 	}
-	printf("unmap() successful\n");
+	ksft_test_result_pass("Base Address unmap() successful\n");
 
-	printf("OK\n");
-	return 0;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
index 86e8f2048a40..a1f005a90a4f 100644
--- a/tools/testing/selftests/mm/map_hugetlb.c
+++ b/tools/testing/selftests/mm/map_hugetlb.c
@@ -16,6 +16,7 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include "vm_util.h"
+#include "../kselftest.h"
 
 #define LENGTH (256UL*1024*1024)
 #define PROTECTION (PROT_READ | PROT_WRITE)
@@ -31,7 +32,7 @@
 
 static void check_bytes(char *addr)
 {
-	printf("First hex is %x\n", *((unsigned int *)addr));
+	ksft_print_msg("First hex is %x\n", *((unsigned int *)addr));
 }
 
 static void write_bytes(char *addr, size_t length)
@@ -42,23 +43,21 @@ static void write_bytes(char *addr, size_t length)
 		*(addr + i) = (char)i;
 }
 
-static int read_bytes(char *addr, size_t length)
+static void read_bytes(char *addr, size_t length)
 {
 	unsigned long i;
 
 	check_bytes(addr);
 	for (i = 0; i < length; i++)
-		if (*(addr + i) != (char)i) {
-			printf("Mismatch at %lu\n", i);
-			return 1;
-		}
-	return 0;
+		if (*(addr + i) != (char)i)
+			ksft_exit_fail_msg("Mismatch at %lu\n", i);
+
+	ksft_test_result_pass("Read correct data\n");
 }
 
 int main(int argc, char **argv)
 {
 	void *addr;
-	int ret;
 	size_t hugepage_size;
 	size_t length = LENGTH;
 	int flags = FLAGS;
@@ -69,6 +68,9 @@ int main(int argc, char **argv)
 	if (hugepage_size > length)
 		length = hugepage_size;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	if (argc > 1)
 		length = atol(argv[1]) << 20;
 	if (argc > 2) {
@@ -78,27 +80,23 @@ int main(int argc, char **argv)
 	}
 
 	if (shift)
-		printf("%u kB hugepages\n", 1 << (shift - 10));
+		ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10));
 	else
-		printf("Default size hugepages\n");
-	printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
+		ksft_print_msg("Default size hugepages\n");
+	ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
 
 	addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
-	printf("Returned address is %p\n", addr);
+	ksft_print_msg("Returned address is %p\n", addr);
 	check_bytes(addr);
 	write_bytes(addr, length);
-	ret = read_bytes(addr, length);
+	read_bytes(addr, length);
 
 	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
-	if (munmap(addr, length)) {
-		perror("munmap");
-		exit(1);
-	}
+	if (munmap(addr, length))
+		ksft_exit_fail_msg("munmap: %s\n", strerror(errno));
 
-	return ret;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c
index 7945d0754875..5c8a53869b1b 100644
--- a/tools/testing/selftests/mm/map_populate.c
+++ b/tools/testing/selftests/mm/map_populate.c
@@ -16,19 +16,21 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include "../kselftest.h"
 
 #define MMAP_SZ		4096
 
-#define BUG_ON(condition, description)					\
-	do {								\
-		if (condition) {					\
-			fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
-				__LINE__, (description), strerror(errno)); \
-			exit(1);					\
-		}							\
+#define BUG_ON(condition, description)						\
+	do {									\
+		if (condition)							\
+			ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n",		\
+					   __func__, __LINE__, (description),	\
+					   strerror(errno));			\
 	} while (0)
 
-static int parent_f(int sock, unsigned long *smap, int child)
+#define TESTS_IN_CHILD 2
+
+static void parent_f(int sock, unsigned long *smap, int child)
 {
 	int status, ret;
 
@@ -43,9 +45,10 @@ static int parent_f(int sock, unsigned long *smap, int child)
 	BUG_ON(ret <= 0, "write(sock)");
 
 	waitpid(child, &status, 0);
-	BUG_ON(!WIFEXITED(status), "child in unexpected state");
 
-	return WEXITSTATUS(status);
+	/* The ksft macros don't keep counters between processes */
+	ksft_cnt.ksft_pass = WEXITSTATUS(status);
+	ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status);
 }
 
 static int child_f(int sock, unsigned long *smap, int fd)
@@ -64,10 +67,11 @@ static int child_f(int sock, unsigned long *smap, int fd)
 	ret = read(sock, &buf, sizeof(int));
 	BUG_ON(ret <= 0, "read(sock)");
 
-	BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
-	BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+	ksft_test_result(*smap != 0x22222BAD, "MAP_POPULATE COW private page\n");
+	ksft_test_result(*smap == 0xdeadbabe, "The mapping state\n");
 
-	return 0;
+	/* The ksft macros don't keep counters between processes */
+	return ksft_cnt.ksft_pass;
 }
 
 int main(int argc, char **argv)
@@ -76,6 +80,9 @@ int main(int argc, char **argv)
 	FILE *ftmp;
 	unsigned long *smap;
 
+	ksft_print_header();
+	ksft_set_plan(TESTS_IN_CHILD);
+
 	ftmp = tmpfile();
 	BUG_ON(!ftmp, "tmpfile()");
 
@@ -101,7 +108,9 @@ int main(int argc, char **argv)
 		ret = close(sock[0]);
 		BUG_ON(ret, "close()");
 
-		return parent_f(sock[1], smap, child);
+		parent_f(sock[1], smap, child);
+
+		ksft_finished();
 	}
 
 	ret = close(sock[1]);
diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c
index 200bedcdc32e..1e01d3ddc11c 100644
--- a/tools/testing/selftests/mm/mdwe_test.c
+++ b/tools/testing/selftests/mm/mdwe_test.c
@@ -7,6 +7,7 @@
 #include <linux/mman.h>
 #include <linux/prctl.h>
 
+#define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/auxv.h>
diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c
index 301abb99e027..b8a7efe9204e 100644
--- a/tools/testing/selftests/mm/mkdirty.c
+++ b/tools/testing/selftests/mm/mkdirty.c
@@ -375,5 +375,5 @@ int main(void)
 	if (err)
 		ksft_exit_fail_msg("%d out of %d tests failed\n",
 				   err, ksft_test_num());
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c
index 1fba77df7f62..1cd80b0f76c3 100644
--- a/tools/testing/selftests/mm/mlock-random-test.c
+++ b/tools/testing/selftests/mm/mlock-random-test.c
@@ -13,6 +13,7 @@
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <time.h>
+#include "../kselftest.h"
 #include "mlock2.h"
 
 #define CHUNK_UNIT (128 * 1024)
@@ -31,14 +32,14 @@ int set_cap_limits(rlim_t max)
 	new.rlim_cur = max;
 	new.rlim_max = max;
 	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
-		perror("setrlimit() returns error\n");
+		ksft_perror("setrlimit() returns error\n");
 		return -1;
 	}
 
 	/* drop capabilities including CAP_IPC_LOCK */
 	if (cap_set_proc(cap)) {
-		perror("cap_set_proc() returns error\n");
-		return -2;
+		ksft_perror("cap_set_proc() returns error\n");
+		return -1;
 	}
 
 	return 0;
@@ -52,27 +53,24 @@ int get_proc_locked_vm_size(void)
 	unsigned long lock_size = 0;
 
 	f = fopen("/proc/self/status", "r");
-	if (!f) {
-		perror("fopen");
-		return -1;
-	}
+	if (!f)
+		ksft_exit_fail_msg("fopen: %s\n", strerror(errno));
 
 	while (fgets(line, 1024, f)) {
 		if (strstr(line, "VmLck")) {
 			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
 			if (ret <= 0) {
-				printf("sscanf() on VmLck error: %s: %d\n",
-						line, ret);
 				fclose(f);
-				return -1;
+				ksft_exit_fail_msg("sscanf() on VmLck error: %s: %d\n",
+						   line, ret);
 			}
 			fclose(f);
 			return (int)(lock_size << 10);
 		}
 	}
 
-	perror("cannot parse VmLck in /proc/self/status\n");
 	fclose(f);
+	ksft_exit_fail_msg("cannot parse VmLck in /proc/self/status: %s\n", strerror(errno));
 	return -1;
 }
 
@@ -91,10 +89,8 @@ int get_proc_page_size(unsigned long addr)
 	size_t size;
 
 	smaps = seek_to_smaps_entry(addr);
-	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
-		return 0;
-	}
+	if (!smaps)
+		ksft_exit_fail_msg("Unable to parse /proc/self/smaps\n");
 
 	while (getline(&line, &size, smaps) > 0) {
 		if (!strstr(line, "MMUPageSize")) {
@@ -105,12 +101,9 @@ int get_proc_page_size(unsigned long addr)
 		}
 
 		/* found the MMUPageSize of this section */
-		if (sscanf(line, "MMUPageSize:    %8lu kB",
-					&mmupage_size) < 1) {
-			printf("Unable to parse smaps entry for Size:%s\n",
-					line);
-			break;
-		}
+		if (sscanf(line, "MMUPageSize:    %8lu kB", &mmupage_size) < 1)
+			ksft_exit_fail_msg("Unable to parse smaps entry for Size:%s\n",
+					   line);
 
 	}
 	free(line);
@@ -136,7 +129,7 @@ int get_proc_page_size(unsigned long addr)
  *    return value: 0 - success
  *    else: failure
  */
-int test_mlock_within_limit(char *p, int alloc_size)
+static void test_mlock_within_limit(char *p, int alloc_size)
 {
 	int i;
 	int ret = 0;
@@ -145,11 +138,9 @@ int test_mlock_within_limit(char *p, int alloc_size)
 	int page_size = 0;
 
 	getrlimit(RLIMIT_MEMLOCK, &cur);
-	if (cur.rlim_cur < alloc_size) {
-		printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
-				alloc_size, (unsigned int)cur.rlim_cur);
-		return -1;
-	}
+	if (cur.rlim_cur < alloc_size)
+		ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+				   alloc_size, (unsigned int)cur.rlim_cur);
 
 	srand(time(NULL));
 	for (i = 0; i < TEST_LOOP; i++) {
@@ -169,13 +160,11 @@ int test_mlock_within_limit(char *p, int alloc_size)
 			ret = mlock2_(p + start_offset, lock_size,
 				       MLOCK_ONFAULT);
 
-		if (ret) {
-			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
-					is_mlock ? "mlock" : "mlock2",
-					p, alloc_size,
-					p + start_offset, lock_size);
-			return ret;
-		}
+		if (ret)
+			ksft_exit_fail_msg("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+					   is_mlock ? "mlock" : "mlock2",
+					   p, alloc_size,
+					   p + start_offset, lock_size);
 	}
 
 	/*
@@ -183,18 +172,12 @@ int test_mlock_within_limit(char *p, int alloc_size)
 	 */
 	locked_vm_size = get_proc_locked_vm_size();
 	page_size = get_proc_page_size((unsigned long)p);
-	if (page_size == 0) {
-		printf("cannot get proc MMUPageSize\n");
-		return -1;
-	}
 
-	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
-		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
-				locked_vm_size, alloc_size);
-		return -1;
-	}
+	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size)
+		ksft_exit_fail_msg("%s left VmLck:%d on %d chunk\n",
+				   __func__, locked_vm_size, alloc_size);
 
-	return 0;
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 
@@ -213,7 +196,7 @@ int test_mlock_within_limit(char *p, int alloc_size)
  *    return value: 0 - success
  *    else: failure
  */
-int test_mlock_outof_limit(char *p, int alloc_size)
+static void test_mlock_outof_limit(char *p, int alloc_size)
 {
 	int i;
 	int ret = 0;
@@ -221,11 +204,9 @@ int test_mlock_outof_limit(char *p, int alloc_size)
 	struct rlimit cur;
 
 	getrlimit(RLIMIT_MEMLOCK, &cur);
-	if (cur.rlim_cur >= alloc_size) {
-		printf("alloc_size[%d] >%u rlimit, violates test condition\n",
-				alloc_size, (unsigned int)cur.rlim_cur);
-		return -1;
-	}
+	if (cur.rlim_cur >= alloc_size)
+		ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n",
+				   alloc_size, (unsigned int)cur.rlim_cur);
 
 	old_locked_vm_size = get_proc_locked_vm_size();
 	srand(time(NULL));
@@ -240,56 +221,47 @@ int test_mlock_outof_limit(char *p, int alloc_size)
 		else
 			ret = mlock2_(p + start_offset, lock_size,
 					MLOCK_ONFAULT);
-		if (ret == 0) {
-			printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
-					is_mlock ? "mlock" : "mlock2",
-					p, alloc_size,
-					p + start_offset, lock_size);
-			return -1;
-		}
+		if (ret == 0)
+			ksft_exit_fail_msg("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+					   is_mlock ? "mlock" : "mlock2",
+					   p, alloc_size, p + start_offset, lock_size);
 	}
 
 	locked_vm_size = get_proc_locked_vm_size();
-	if (locked_vm_size != old_locked_vm_size) {
-		printf("tests leads to new mlocked page: old[%d], new[%d]\n",
-				old_locked_vm_size,
-				locked_vm_size);
-		return -1;
-	}
+	if (locked_vm_size != old_locked_vm_size)
+		ksft_exit_fail_msg("tests leads to new mlocked page: old[%d], new[%d]\n",
+				   old_locked_vm_size,
+				   locked_vm_size);
 
-	return 0;
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 int main(int argc, char **argv)
 {
 	char *p = NULL;
-	int ret = 0;
+
+	ksft_print_header();
 
 	if (set_cap_limits(MLOCK_RLIMIT_SIZE))
-		return -1;
+		ksft_finished();
+
+	ksft_set_plan(2);
 
 	p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
-	if (p == NULL) {
-		perror("malloc() failure\n");
-		return -1;
-	}
-	ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
-	if (ret)
-		return ret;
+	if (p == NULL)
+		ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno));
+
+	test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
 	munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
 	free(p);
 
-
 	p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
-	if (p == NULL) {
-		perror("malloc() failure\n");
-		return -1;
-	}
-	ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
-	if (ret)
-		return ret;
+	if (p == NULL)
+		ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno));
+
+	test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
 	munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
 	free(p);
 
-	return 0;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index 80cddc0de206..26f744188ad0 100644
--- a/tools/testing/selftests/mm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -7,9 +7,8 @@
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <stdbool.h>
-#include "mlock2.h"
-
 #include "../kselftest.h"
+#include "mlock2.h"
 
 struct vm_boundaries {
 	unsigned long start;
@@ -40,14 +39,14 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
 	while(fgets(line, 1024, file)) {
 		end_addr = strchr(line, '-');
 		if (!end_addr) {
-			printf("cannot parse /proc/self/maps\n");
+			ksft_print_msg("cannot parse /proc/self/maps\n");
 			goto out;
 		}
 		*end_addr = '\0';
 		end_addr++;
 		stop = strchr(end_addr, ' ');
 		if (!stop) {
-			printf("cannot parse /proc/self/maps\n");
+			ksft_print_msg("cannot parse /proc/self/maps\n");
 			goto out;
 		}
 
@@ -78,7 +77,7 @@ static bool is_vmflag_set(unsigned long addr, const char *vmflag)
 
 	smaps = seek_to_smaps_entry(addr);
 	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
+		ksft_print_msg("Unable to parse /proc/self/smaps\n");
 		goto out;
 	}
 
@@ -115,7 +114,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name)
 
 	smaps = seek_to_smaps_entry(addr);
 	if (!smaps) {
-		printf("Unable to parse /proc/self/smaps\n");
+		ksft_print_msg("Unable to parse /proc/self/smaps\n");
 		goto out;
 	}
 
@@ -129,7 +128,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name)
 
 		value_ptr = line + strlen(name);
 		if (sscanf(value_ptr, "%lu kB", &value) < 1) {
-			printf("Unable to parse smaps entry for Size\n");
+			ksft_print_msg("Unable to parse smaps entry for Size\n");
 			goto out;
 		}
 		break;
@@ -180,57 +179,45 @@ static int lock_check(unsigned long addr)
 static int unlock_lock_check(char *map)
 {
 	if (is_vmflag_set((unsigned long)map, LOCKED)) {
-		printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+		ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED);
 		return 1;
 	}
 
 	return 0;
 }
 
-static int test_mlock_lock()
+static void test_mlock_lock(void)
 {
 	char *map;
-	int ret = 1;
 	unsigned long page_size = getpagesize();
 
 	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
 		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("test_mlock_locked mmap");
-		goto out;
-	}
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s", strerror(errno));
 
 	if (mlock2_(map, 2 * page_size, 0)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(0)");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("mlock2(0): %s\n", strerror(errno));
 	}
 
-	if (!lock_check((unsigned long)map))
-		goto unmap;
+	ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__);
 
 	/* Now unlock and recheck attributes */
 	if (munlock(map, 2 * page_size)) {
-		perror("munlock()");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
 	}
 
-	ret = unlock_lock_check(map);
-
-unmap:
+	ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__);
 	munmap(map, 2 * page_size);
-out:
-	return ret;
 }
 
 static int onfault_check(char *map)
 {
 	*map = 'a';
 	if (!is_vma_lock_on_fault((unsigned long)map)) {
-		printf("VMA is not marked for lock on fault\n");
+		ksft_print_msg("VMA is not marked for lock on fault\n");
 		return 1;
 	}
 
@@ -243,172 +230,131 @@ static int unlock_onfault_check(char *map)
 
 	if (is_vma_lock_on_fault((unsigned long)map) ||
 	    is_vma_lock_on_fault((unsigned long)map + page_size)) {
-		printf("VMA is still lock on fault after unlock\n");
+		ksft_print_msg("VMA is still lock on fault after unlock\n");
 		return 1;
 	}
 
 	return 0;
 }
 
-static int test_mlock_onfault()
+static void test_mlock_onfault(void)
 {
 	char *map;
-	int ret = 1;
 	unsigned long page_size = getpagesize();
 
 	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
 		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("test_mlock_locked mmap");
-		goto out;
-	}
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s", strerror(errno));
 
 	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(MLOCK_ONFAULT)");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("mlock2(MLOCK_ONFAULT): %s\n", strerror(errno));
 	}
 
-	if (onfault_check(map))
-		goto unmap;
+	ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__);
 
 	/* Now unlock and recheck attributes */
 	if (munlock(map, 2 * page_size)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("munlock()");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
 	}
 
-	ret = unlock_onfault_check(map);
-unmap:
+	ksft_test_result(!unlock_onfault_check(map), "VMA open lock after fault\n");
 	munmap(map, 2 * page_size);
-out:
-	return ret;
 }
 
-static int test_lock_onfault_of_present()
+static void test_lock_onfault_of_present(void)
 {
 	char *map;
-	int ret = 1;
 	unsigned long page_size = getpagesize();
 
 	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
 		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("test_mlock_locked mmap");
-		goto out;
-	}
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s", strerror(errno));
 
 	*map = 'a';
 
 	if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock2(MLOCK_ONFAULT)");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_test_result_fail("mlock2(MLOCK_ONFAULT) error: %s", strerror(errno));
 	}
 
-	if (!is_vma_lock_on_fault((unsigned long)map) ||
-	    !is_vma_lock_on_fault((unsigned long)map + page_size)) {
-		printf("VMA with present pages is not marked lock on fault\n");
-		goto unmap;
-	}
-	ret = 0;
-unmap:
+	ksft_test_result(is_vma_lock_on_fault((unsigned long)map) ||
+			 is_vma_lock_on_fault((unsigned long)map + page_size),
+			 "VMA with present pages is not marked lock on fault\n");
 	munmap(map, 2 * page_size);
-out:
-	return ret;
 }
 
-static int test_munlockall()
+static void test_munlockall0(void)
 {
 	char *map;
-	int ret = 1;
 	unsigned long page_size = getpagesize();
 
 	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
 		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	if (map == MAP_FAILED) {
-		perror("test_munlockall mmap");
-		goto out;
-	}
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s\n", strerror(errno));
 
 	if (mlockall(MCL_CURRENT)) {
-		perror("mlockall(MCL_CURRENT)");
-		goto out;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("mlockall(MCL_CURRENT): %s\n", strerror(errno));
 	}
 
-	if (!lock_check((unsigned long)map))
-		goto unmap;
+	ksft_test_result(lock_check((unsigned long)map), "%s: Locked memory area\n", __func__);
 
 	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
 	}
 
-	if (unlock_lock_check(map))
-		goto unmap;
-
+	ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
 	munmap(map, 2 * page_size);
+}
+
+static void test_munlockall1(void)
+{
+	char *map;
+	unsigned long page_size = getpagesize();
 
 	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
 		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-	if (map == MAP_FAILED) {
-		perror("test_munlockall second mmap");
-		goto out;
-	}
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s", strerror(errno));
 
 	if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
-		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_ONFAULT): %s\n", strerror(errno));
 	}
 
-	if (onfault_check(map))
-		goto unmap;
+	ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__);
 
 	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
 	}
 
-	if (unlock_onfault_check(map))
-		goto unmap;
+	ksft_test_result(!unlock_onfault_check(map), "%s: Unlocked\n", __func__);
 
 	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
-		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
-		goto out;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno));
 	}
 
-	if (!lock_check((unsigned long)map))
-		goto unmap;
+	ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__);
 
 	if (munlockall()) {
-		perror("munlockall()");
-		goto unmap;
+		munmap(map, 2 * page_size);
+		ksft_exit_fail_msg("munlockall() %s\n", strerror(errno));
 	}
 
-	ret = unlock_lock_check(map);
-
-unmap:
+	ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
 	munmap(map, 2 * page_size);
-out:
-	munlockall();
-	return ret;
 }
 
-static int test_vma_management(bool call_mlock)
+static void test_vma_management(bool call_mlock)
 {
-	int ret = 1;
 	void *map;
 	unsigned long page_size = getpagesize();
 	struct vm_boundaries page1;
@@ -417,25 +363,19 @@ static int test_vma_management(bool call_mlock)
 
 	map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
 		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (map == MAP_FAILED) {
-		perror("mmap()");
-		return ret;
-	}
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s", strerror(errno));
 
 	if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
-		if (errno == ENOSYS) {
-			printf("Cannot call new mlock family, skipping test\n");
-			_exit(KSFT_SKIP);
-		}
-		perror("mlock(ONFAULT)\n");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("mlock error: %s", strerror(errno));
 	}
 
 	if (get_vm_area((unsigned long)map, &page1) ||
 	    get_vm_area((unsigned long)map + page_size, &page2) ||
 	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("couldn't find mapping in /proc/self/maps");
 	}
 
 	/*
@@ -444,76 +384,86 @@ static int test_vma_management(bool call_mlock)
 	 * not a failure)
 	 */
 	if (page1.start != page2.start || page2.start != page3.start) {
-		printf("VMAs are not merged to start, aborting test\n");
-		ret = 0;
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("VMAs are not merged to start, aborting test");
 	}
 
 	if (munlock(map + page_size, page_size)) {
-		perror("munlock()");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("munlock(): %s", strerror(errno));
 	}
 
 	if (get_vm_area((unsigned long)map, &page1) ||
 	    get_vm_area((unsigned long)map + page_size, &page2) ||
 	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("couldn't find mapping in /proc/self/maps");
 	}
 
 	/* All three VMAs should be different */
 	if (page1.start == page2.start || page2.start == page3.start) {
-		printf("failed to split VMA for munlock\n");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("failed to split VMA for munlock");
 	}
 
 	/* Now unlock the first and third page and check the VMAs again */
 	if (munlock(map, page_size * 3)) {
-		perror("munlock()");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("munlock(): %s", strerror(errno));
 	}
 
 	if (get_vm_area((unsigned long)map, &page1) ||
 	    get_vm_area((unsigned long)map + page_size, &page2) ||
 	    get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-		printf("couldn't find mapping in /proc/self/maps\n");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("couldn't find mapping in /proc/self/maps");
 	}
 
 	/* Now all three VMAs should be the same */
 	if (page1.start != page2.start || page2.start != page3.start) {
-		printf("failed to merge VMAs after munlock\n");
-		goto out;
+		munmap(map, 3 * page_size);
+		ksft_test_result_fail("failed to merge VMAs after munlock");
 	}
 
-	ret = 0;
-out:
+	ksft_test_result_pass("%s call_mlock %d\n", __func__, call_mlock);
 	munmap(map, 3 * page_size);
-	return ret;
 }
 
-static int test_mlockall(int (test_function)(bool call_mlock))
+static void test_mlockall(void)
 {
-	int ret = 1;
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE))
+		ksft_exit_fail_msg("mlockall failed: %s\n", strerror(errno));
 
-	if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
-		perror("mlockall");
-		return ret;
-	}
-
-	ret = test_function(false);
+	test_vma_management(false);
 	munlockall();
-	return ret;
 }
 
 int main(int argc, char **argv)
 {
-	int ret = 0;
-	ret += test_mlock_lock();
-	ret += test_mlock_onfault();
-	ret += test_munlockall();
-	ret += test_lock_onfault_of_present();
-	ret += test_vma_management(true);
-	ret += test_mlockall(test_vma_management);
-	return ret;
+	int ret, size = 3 * getpagesize();
+	void *map;
+
+	ksft_print_header();
+
+	map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+	ret = mlock2_(map, size, MLOCK_ONFAULT);
+	if (ret && errno == ENOSYS)
+		ksft_finished();
+
+	munmap(map, size);
+
+	ksft_set_plan(13);
+
+	test_mlock_lock();
+	test_mlock_onfault();
+	test_munlockall0();
+	test_munlockall1();
+	test_lock_onfault_of_present();
+	test_vma_management(true);
+	test_mlockall();
+
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
index 8e02991b313c..4417eaa5cfb7 100644
--- a/tools/testing/selftests/mm/mlock2.h
+++ b/tools/testing/selftests/mm/mlock2.h
@@ -6,12 +6,7 @@
 
 static int mlock2_(void *start, size_t len, int flags)
 {
-#ifdef __NR_mlock2
 	return syscall(__NR_mlock2, start, len, flags);
-#else
-	errno = ENOSYS;
-	return -1;
-#endif
 }
 
 static FILE *seek_to_smaps_entry(unsigned long addr)
@@ -27,10 +22,8 @@ static FILE *seek_to_smaps_entry(unsigned long addr)
 	char path[BUFSIZ];
 
 	file = fopen("/proc/self/smaps", "r");
-	if (!file) {
-		perror("fopen smaps");
-		_exit(1);
-	}
+	if (!file)
+		ksft_exit_fail_msg("fopen smaps: %s\n", strerror(errno));
 
 	while (getline(&line, &size, file) > 0) {
 		if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
index d822004a374e..100370a7111d 100644
--- a/tools/testing/selftests/mm/mrelease_test.c
+++ b/tools/testing/selftests/mm/mrelease_test.c
@@ -26,19 +26,15 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
 
 	buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE,
 			   MAP_PRIVATE | MAP_ANON, 0, 0);
-	if (buf == MAP_FAILED) {
-		perror("mmap failed, halting the test");
-		return KSFT_FAIL;
-	}
+	if (buf == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed, halting the test: %s\n", strerror(errno));
 
 	for (i = 0; i < nr_pages; i++)
 		*((unsigned long *)(buf + (i * psize()))) = i;
 
 	/* Signal the parent that the child is ready */
-	if (write(pipefd, "", 1) < 0) {
-		perror("write");
-		return KSFT_FAIL;
-	}
+	if (write(pipefd, "", 1) < 0)
+		ksft_exit_fail_msg("write: %s\n", strerror(errno));
 
 	/* Wait to be killed (when reparenting happens) */
 	while (getppid() == ppid && timeout > 0) {
@@ -54,23 +50,17 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
 /* The process_mrelease calls in this test are expected to fail */
 static void run_negative_tests(int pidfd)
 {
-	int res;
 	/* Test invalid flags. Expect to fail with EINVAL error code. */
 	if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
 			errno != EINVAL) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("process_mrelease with wrong flags");
-		exit(res);
+		ksft_exit_fail_msg("process_mrelease with wrong flags: %s\n", strerror(errno));
 	}
 	/*
 	 * Test reaping while process is alive with no pending SIGKILL.
 	 * Expect to fail with EINVAL error code.
 	 */
-	if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("process_mrelease on a live process");
-		exit(res);
-	}
+	if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL)
+		ksft_exit_fail_msg("process_mrelease on a live process: %s\n", strerror(errno));
 }
 
 static int child_main(int pipefd[], size_t size)
@@ -93,11 +83,18 @@ int main(void)
 	char byte;
 	int res;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	/* Test a wrong pidfd */
 	if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("process_mrelease with wrong pidfd");
-		exit(res);
+		if (errno == ENOSYS) {
+			ksft_test_result_skip("process_mrelease not implemented\n");
+			ksft_finished();
+		} else {
+			ksft_exit_fail_msg("process_mrelease with wrong pidfd: %s",
+					   strerror(errno));
+		}
 	}
 
 	/* Start the test with 1MB child memory allocation */
@@ -107,16 +104,14 @@ retry:
 	 * Pipe for the child to signal when it's done allocating
 	 * memory
 	 */
-	if (pipe(pipefd)) {
-		perror("pipe");
-		exit(KSFT_FAIL);
-	}
+	if (pipe(pipefd))
+		ksft_exit_fail_msg("pipe: %s\n", strerror(errno));
+
 	pid = fork();
 	if (pid < 0) {
-		perror("fork");
 		close(pipefd[0]);
 		close(pipefd[1]);
-		exit(KSFT_FAIL);
+		ksft_exit_fail_msg("fork: %s\n", strerror(errno));
 	}
 
 	if (pid == 0) {
@@ -134,28 +129,23 @@ retry:
 	res = read(pipefd[0], &byte, 1);
 	close(pipefd[0]);
 	if (res < 0) {
-		perror("read");
 		if (!kill(pid, SIGKILL))
 			waitpid(pid, NULL, 0);
-		exit(KSFT_FAIL);
+		ksft_exit_fail_msg("read: %s\n", strerror(errno));
 	}
 
 	pidfd = syscall(__NR_pidfd_open, pid, 0);
 	if (pidfd < 0) {
-		perror("pidfd_open");
 		if (!kill(pid, SIGKILL))
 			waitpid(pid, NULL, 0);
-		exit(KSFT_FAIL);
+		ksft_exit_fail_msg("pidfd_open: %s\n", strerror(errno));
 	}
 
 	/* Run negative tests which require a live child */
 	run_negative_tests(pidfd);
 
-	if (kill(pid, SIGKILL)) {
-		res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-		perror("kill");
-		exit(res);
-	}
+	if (kill(pid, SIGKILL))
+		ksft_exit_fail_msg("kill: %s\n", strerror(errno));
 
 	success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
 	if (!success) {
@@ -169,18 +159,15 @@ retry:
 		if (errno == ESRCH) {
 			retry = (size <= MAX_SIZE_MB);
 		} else {
-			res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-			perror("process_mrelease");
 			waitpid(pid, NULL, 0);
-			exit(res);
+			ksft_exit_fail_msg("process_mrelease: %s\n", strerror(errno));
 		}
 	}
 
 	/* Cleanup to prevent zombies */
-	if (waitpid(pid, NULL, 0) < 0) {
-		perror("waitpid");
-		exit(KSFT_FAIL);
-	}
+	if (waitpid(pid, NULL, 0) < 0)
+		ksft_exit_fail_msg("waitpid: %s\n", strerror(errno));
+
 	close(pidfd);
 
 	if (!success) {
@@ -188,11 +175,10 @@ retry:
 			size *= 2;
 			goto retry;
 		}
-		printf("All process_mrelease attempts failed!\n");
-		exit(KSFT_FAIL);
+		ksft_exit_fail_msg("All process_mrelease attempts failed!\n");
 	}
 
-	printf("Success reaping a child with %zuMB of memory allocations\n",
-	       size);
-	return KSFT_PASS;
+	ksft_test_result_pass("Success reaping a child with %zuMB of memory allocations\n",
+			      size);
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c
index a06e73ec8568..1d75084b9ca5 100644
--- a/tools/testing/selftests/mm/mremap_dontunmap.c
+++ b/tools/testing/selftests/mm/mremap_dontunmap.c
@@ -27,14 +27,14 @@ static void dump_maps(void)
 	system(cmd);
 }
 
-#define BUG_ON(condition, description)					      \
-	do {								      \
-		if (condition) {					      \
-			fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
-				__LINE__, (description), strerror(errno));    \
-			dump_maps();					  \
-			exit(1);					      \
-		} 							      \
+#define BUG_ON(condition, description)						\
+	do {									\
+		if (condition) {						\
+			dump_maps();						\
+			ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n",		\
+					   __func__, __LINE__, (description),	\
+					   strerror(errno));			\
+		}								\
 	} while (0)
 
 // Try a simple operation for to "test" for kernel support this prevents
@@ -122,6 +122,7 @@ static void mremap_dontunmap_simple()
 	       "unable to unmap destination mapping");
 	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
 	       "unable to unmap source mapping");
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 // This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected.
@@ -173,6 +174,7 @@ static void mremap_dontunmap_simple_shmem()
 	       "unable to unmap destination mapping");
 	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
 	       "unable to unmap source mapping");
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 // This test validates MREMAP_DONTUNMAP will move page tables to a specific
@@ -219,6 +221,7 @@ static void mremap_dontunmap_simple_fixed()
 	       "unable to unmap destination mapping");
 	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
 	       "unable to unmap source mapping");
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 // This test validates that we can MREMAP_DONTUNMAP for a portion of an
@@ -269,6 +272,7 @@ static void mremap_dontunmap_partial_mapping()
 	       "unable to unmap destination mapping");
 	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
 	       "unable to unmap source mapping");
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 // This test validates that we can remap over only a portion of a mapping.
@@ -328,19 +332,24 @@ static void mremap_dontunmap_partial_mapping_overwrite(void)
 	       "unable to unmap destination mapping");
 	BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
 	       "unable to unmap source mapping");
+	ksft_test_result_pass("%s\n", __func__);
 }
 
 int main(void)
 {
+	ksft_print_header();
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	// test for kernel support for MREMAP_DONTUNMAP skipping the test if
 	// not.
 	if (kernel_support_for_mremap_dontunmap() != 0) {
-		printf("No kernel support for MREMAP_DONTUNMAP\n");
-		return KSFT_SKIP;
+		ksft_print_msg("No kernel support for MREMAP_DONTUNMAP\n");
+		ksft_finished();
 	}
 
+	ksft_set_plan(5);
+
 	// Keep a page sized buffer around for when we need it.
 	page_buffer =
 	    mmap(NULL, page_size, PROT_READ | PROT_WRITE,
@@ -356,6 +365,5 @@ int main(void)
 	BUG_ON(munmap(page_buffer, page_size) == -1,
 	       "unable to unmap page buffer");
 
-	printf("OK\n");
-	return 0;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c
index b5888d613f34..431c1277d83a 100644
--- a/tools/testing/selftests/mm/on-fault-limit.c
+++ b/tools/testing/selftests/mm/on-fault-limit.c
@@ -5,40 +5,38 @@
 #include <string.h>
 #include <sys/time.h>
 #include <sys/resource.h>
+#include "../kselftest.h"
 
-static int test_limit(void)
+static void test_limit(void)
 {
-	int ret = 1;
 	struct rlimit lims;
 	void *map;
 
-	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
-		perror("getrlimit");
-		return ret;
-	}
+	if (getrlimit(RLIMIT_MEMLOCK, &lims))
+		ksft_exit_fail_msg("getrlimit: %s\n", strerror(errno));
 
-	if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
-		perror("mlockall");
-		return ret;
-	}
+	if (mlockall(MCL_ONFAULT | MCL_FUTURE))
+		ksft_exit_fail_msg("mlockall: %s\n", strerror(errno));
 
 	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
 		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+
+	ksft_test_result(map == MAP_FAILED, "The map failed respecting mlock limits\n");
+
 	if (map != MAP_FAILED)
-		printf("mmap should have failed, but didn't\n");
-	else {
-		ret = 0;
 		munmap(map, 2 * lims.rlim_max);
-	}
-
 	munlockall();
-	return ret;
 }
 
 int main(int argc, char **argv)
 {
-	int ret = 0;
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	if (!getuid())
+		ksft_test_result_skip("The test must be run from a normal user\n");
+	else
+		test_limit();
 
-	ret += test_limit();
-	return ret;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index d59517ed3d48..2d785aca72a5 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -1484,7 +1484,7 @@ int main(int argc, char *argv[])
 	ksft_print_header();
 
 	if (init_uffd())
-		return ksft_exit_pass();
+		ksft_exit_pass();
 
 	ksft_set_plan(115);
 
@@ -1660,5 +1660,5 @@ int main(int argc, char *argv[])
 	userfaultfd_tests();
 
 	close(pagemap_fd);
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 246d53a5d7f2..4bdb3a0c7a60 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -15,10 +15,11 @@ usage() {
 	cat <<EOF
 usage: ${BASH_SOURCE[0]:-$0} [ options ]
 
-  -a: run all tests, including extra ones
+  -a: run all tests, including extra ones (other than destructive ones)
   -t: specify specific categories to tests to run
   -h: display this message
   -n: disable TAP output
+  -d: run destructive tests
 
 The default behavior is to run required tests only.  If -a is specified,
 will run all tests.
@@ -64,6 +65,8 @@ separated by spaces:
 	test copy-on-write semantics
 - thp
 	test transparent huge pages
+- hugetlb
+	test hugetlbfs huge pages
 - migration
 	invoke move_pages(2) to exercise the migration entry code
 	paths in the kernel
@@ -79,6 +82,7 @@ EOF
 }
 
 RUN_ALL=false
+RUN_DESTRUCTIVE=false
 TAP_PREFIX="# "
 
 while getopts "aht:n" OPT; do
@@ -87,6 +91,7 @@ while getopts "aht:n" OPT; do
 		"h") usage ;;
 		"t") VM_SELFTEST_ITEMS=${OPTARG} ;;
 		"n") TAP_PREFIX= ;;
+		"d") RUN_DESTRUCTIVE=true ;;
 	esac
 done
 shift $((OPTIND -1))
@@ -173,7 +178,6 @@ if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
 	if [ "$freepgs" -lt "$needpgs" ]; then
 		printf "Not enough huge pages available (%d < %d)\n" \
 		       "$freepgs" "$needpgs"
-		exit 1
 	fi
 else
 	echo "no hugetlbfs support in kernel?"
@@ -206,6 +210,15 @@ pretty_name() {
 # Usage: run_test [test binary] [arbitrary test arguments...]
 run_test() {
 	if test_selected ${CATEGORY}; then
+		# On memory constrainted systems some tests can fail to allocate hugepages.
+		# perform some cleanup before the test for a higher success rate.
+		if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then
+			echo 3 > /proc/sys/vm/drop_caches
+			sleep 2
+			echo 1 > /proc/sys/vm/compact_memory
+			sleep 2
+		fi
+
 		local test=$(pretty_name "$*")
 		local title="running $*"
 		local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
@@ -253,6 +266,7 @@ nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
 # For this test, we need one and just one huge page
 echo 1 > /proc/sys/vm/nr_hugepages
 CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
+CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map
 # Restore the previous number of huge pages, since further tests rely on it
 echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages
 
@@ -291,7 +305,12 @@ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
 
 CATEGORY="compaction" run_test ./compaction_test
 
-CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+if command -v sudo &> /dev/null;
+then
+	CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+else
+	echo "# SKIP ./on-fault-limit"
+fi
 
 CATEGORY="mmap" run_test ./map_populate
 
@@ -304,6 +323,11 @@ CATEGORY="process_mrelease" run_test ./mrelease_test
 CATEGORY="mremap" run_test ./mremap_test
 
 CATEGORY="hugetlb" run_test ./thuge-gen
+CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2
+CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2
+if $RUN_DESTRUCTIVE; then
+CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison
+fi
 
 if [ $VADDR64 -ne 0 ]; then
 
@@ -361,6 +385,7 @@ CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
 CATEGORY="ksm" run_test ./ksm_functional_tests
 
 # protection_keys tests
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
 if [ -x ./protection_keys_32 ]
 then
 	CATEGORY="pkey" run_test ./protection_keys_32
@@ -370,6 +395,7 @@ if [ -x ./protection_keys_64 ]
 then
 	CATEGORY="pkey" run_test ./protection_keys_64
 fi
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
 
 if [ -x ./soft-dirty ]
 then
@@ -387,7 +413,27 @@ CATEGORY="thp" run_test ./khugepaged -s 2
 
 CATEGORY="thp" run_test ./transhuge-stress -d 20
 
-CATEGORY="thp" run_test ./split_huge_page_test
+# Try to create XFS if not provided
+if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then
+    if test_selected "thp"; then
+        if grep xfs /proc/filesystems &>/dev/null; then
+            XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX)
+            SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX)
+            truncate -s 314572800 ${XFS_IMG}
+            mkfs.xfs -q ${XFS_IMG}
+            mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+            MOUNTED_XFS=1
+        fi
+    fi
+fi
+
+CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+
+if [ -n "${MOUNTED_XFS}" ]; then
+    umount ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+    rmdir ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+    rm -f ${XFS_IMG}
+fi
 
 CATEGORY="migration" run_test ./migration
 
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index cc5f144430d4..d9dbf879748b 100644
--- a/tools/testing/selftests/mm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -137,7 +137,7 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
 		if (!map)
 			ksft_exit_fail_msg("anon mmap failed\n");
 	} else {
-		test_fd = open(fname, O_RDWR | O_CREAT);
+		test_fd = open(fname, O_RDWR | O_CREAT, 0664);
 		if (test_fd < 0) {
 			ksft_test_result_skip("Test %s open() file failed\n", __func__);
 			return;
@@ -209,5 +209,5 @@ int main(int argc, char **argv)
 
 	close(pagemap_fd);
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 0e74635c8c3d..d3c7f5fb3e7b 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -16,17 +16,20 @@
 #include <sys/mount.h>
 #include <malloc.h>
 #include <stdbool.h>
+#include <time.h>
 #include "vm_util.h"
+#include "../kselftest.h"
 
 uint64_t pagesize;
 unsigned int pageshift;
 uint64_t pmd_pagesize;
 
 #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define SMAP_PATH "/proc/self/smaps"
 #define INPUT_MAX 80
 
-#define PID_FMT "%d,0x%lx,0x%lx"
-#define PATH_FMT "%s,0x%lx,0x%lx"
+#define PID_FMT "%d,0x%lx,0x%lx,%d"
+#define PATH_FMT "%s,0x%lx,0x%lx,%d"
 
 #define PFN_MASK     ((1UL<<55)-1)
 #define KPF_THP      (1UL<<22)
@@ -50,21 +53,19 @@ int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
 	return 0;
 }
 
-static int write_file(const char *path, const char *buf, size_t buflen)
+static void write_file(const char *path, const char *buf, size_t buflen)
 {
 	int fd;
 	ssize_t numwritten;
 
 	fd = open(path, O_WRONLY);
 	if (fd == -1)
-		return 0;
+		ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
 
 	numwritten = write(fd, buf, buflen - 1);
 	close(fd);
 	if (numwritten < 1)
-		return 0;
-
-	return (unsigned int) numwritten;
+		ksft_exit_fail_msg("Write failed\n");
 }
 
 static void write_debugfs(const char *fmt, ...)
@@ -77,15 +78,10 @@ static void write_debugfs(const char *fmt, ...)
 	ret = vsnprintf(input, INPUT_MAX, fmt, argp);
 	va_end(argp);
 
-	if (ret >= INPUT_MAX) {
-		printf("%s: Debugfs input is too long\n", __func__);
-		exit(EXIT_FAILURE);
-	}
+	if (ret >= INPUT_MAX)
+		ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__);
 
-	if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
-		perror(SPLIT_DEBUGFS);
-		exit(EXIT_FAILURE);
-	}
+	write_file(SPLIT_DEBUGFS, input, ret + 1);
 }
 
 void split_pmd_thp(void)
@@ -95,39 +91,30 @@ void split_pmd_thp(void)
 	size_t i;
 
 	one_page = memalign(pmd_pagesize, len);
-
-	if (!one_page) {
-		printf("Fail to allocate memory\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!one_page)
+		ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
 
 	madvise(one_page, len, MADV_HUGEPAGE);
 
 	for (i = 0; i < len; i++)
 		one_page[i] = (char)i;
 
-	if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
-		printf("No THP is allocated\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!check_huge_anon(one_page, 4, pmd_pagesize))
+		ksft_exit_fail_msg("No THP is allocated\n");
 
 	/* split all THPs */
 	write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
-		(uint64_t)one_page + len);
+		(uint64_t)one_page + len, 0);
 
 	for (i = 0; i < len; i++)
-		if (one_page[i] != (char)i) {
-			printf("%ld byte corrupted\n", i);
-			exit(EXIT_FAILURE);
-		}
+		if (one_page[i] != (char)i)
+			ksft_exit_fail_msg("%ld byte corrupted\n", i);
 
 
-	if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
-		printf("Still AnonHugePages not split\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!check_huge_anon(one_page, 0, pmd_pagesize))
+		ksft_exit_fail_msg("Still AnonHugePages not split\n");
 
-	printf("Split huge pages successful\n");
+	ksft_test_result_pass("Split huge pages successful\n");
 	free(one_page);
 }
 
@@ -143,36 +130,29 @@ void split_pte_mapped_thp(void)
 	int pagemap_fd;
 	int kpageflags_fd;
 
-	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
-		perror("get pagemap proc error");
-		exit(EXIT_FAILURE);
-	}
-	pagemap_fd = open(pagemap_proc, O_RDONLY);
+	if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0)
+		ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno));
 
-	if (pagemap_fd == -1) {
-		perror("read pagemap:");
-		exit(EXIT_FAILURE);
-	}
+	pagemap_fd = open(pagemap_proc, O_RDONLY);
+	if (pagemap_fd == -1)
+		ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno));
 
 	kpageflags_fd = open(kpageflags_proc, O_RDONLY);
-
-	if (kpageflags_fd == -1) {
-		perror("read kpageflags:");
-		exit(EXIT_FAILURE);
-	}
+	if (kpageflags_fd == -1)
+		ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno));
 
 	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
 			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (one_page == MAP_FAILED)
+		ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
 
 	madvise(one_page, len, MADV_HUGEPAGE);
 
 	for (i = 0; i < len; i++)
 		one_page[i] = (char)i;
 
-	if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
-		printf("No THP is allocated\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!check_huge_anon(one_page, 4, pmd_pagesize))
+		ksft_exit_fail_msg("No THP is allocated\n");
 
 	/* remap the first pagesize of first THP */
 	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
@@ -183,10 +163,8 @@ void split_pte_mapped_thp(void)
 				     pagesize, pagesize,
 				     MREMAP_MAYMOVE|MREMAP_FIXED,
 				     pte_mapped + pagesize * i);
-		if (pte_mapped2 == (char *)-1) {
-			perror("mremap failed");
-			exit(EXIT_FAILURE);
-		}
+		if (pte_mapped2 == MAP_FAILED)
+			ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno));
 	}
 
 	/* smap does not show THPs after mremap, use kpageflags instead */
@@ -196,33 +174,28 @@ void split_pte_mapped_thp(void)
 		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
 			thp_size++;
 
-	if (thp_size != 4) {
-		printf("Some THPs are missing during mremap\n");
-		exit(EXIT_FAILURE);
-	}
+	if (thp_size != 4)
+		ksft_exit_fail_msg("Some THPs are missing during mremap\n");
 
 	/* split all remapped THPs */
 	write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
-		      (uint64_t)pte_mapped + pagesize * 4);
+		      (uint64_t)pte_mapped + pagesize * 4, 0);
 
 	/* smap does not show THPs after mremap, use kpageflags instead */
 	thp_size = 0;
 	for (i = 0; i < pagesize * 4; i++) {
-		if (pte_mapped[i] != (char)i) {
-			printf("%ld byte corrupted\n", i);
-			exit(EXIT_FAILURE);
-		}
+		if (pte_mapped[i] != (char)i)
+			ksft_exit_fail_msg("%ld byte corrupted\n", i);
+
 		if (i % pagesize == 0 &&
 		    is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
 			thp_size++;
 	}
 
-	if (thp_size) {
-		printf("Still %ld THPs not split\n", thp_size);
-		exit(EXIT_FAILURE);
-	}
+	if (thp_size)
+		ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size);
 
-	printf("Split PTE-mapped huge pages successful\n");
+	ksft_test_result_pass("Split PTE-mapped huge pages successful\n");
 	munmap(one_page, len);
 	close(pagemap_fd);
 	close(kpageflags_fd);
@@ -238,24 +211,21 @@ void split_file_backed_thp(void)
 	char testfile[INPUT_MAX];
 	uint64_t pgoff_start = 0, pgoff_end = 1024;
 
-	printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
+	ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n");
 
 	status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
 
-	if (status) {
-		printf("Unable to create a tmpfs for testing\n");
-		exit(EXIT_FAILURE);
-	}
+	if (status)
+		ksft_exit_fail_msg("Unable to create a tmpfs for testing\n");
 
 	status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
 	if (status >= INPUT_MAX) {
-		printf("Fail to create file-backed THP split testing file\n");
-		goto cleanup;
+		ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n");
 	}
 
-	fd = open(testfile, O_CREAT|O_WRONLY);
+	fd = open(testfile, O_CREAT|O_WRONLY, 0664);
 	if (fd == -1) {
-		perror("Cannot open testing file\n");
+		ksft_perror("Cannot open testing file");
 		goto cleanup;
 	}
 
@@ -264,50 +234,213 @@ void split_file_backed_thp(void)
 	close(fd);
 
 	if (num_written < 1) {
-		printf("Fail to write data to testing file\n");
+		ksft_perror("Fail to write data to testing file");
 		goto cleanup;
 	}
 
 	/* split the file-backed THP */
-	write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
+	write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0);
 
 	status = unlink(testfile);
-	if (status)
-		perror("Cannot remove testing file\n");
+	if (status) {
+		ksft_perror("Cannot remove testing file");
+		goto cleanup;
+	}
 
-cleanup:
 	status = umount(tmpfs_loc);
 	if (status) {
-		printf("Unable to umount %s\n", tmpfs_loc);
-		exit(EXIT_FAILURE);
+		rmdir(tmpfs_loc);
+		ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc);
 	}
+
 	status = rmdir(tmpfs_loc);
-	if (status) {
-		perror("cannot remove tmp dir");
-		exit(EXIT_FAILURE);
+	if (status)
+		ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno));
+
+	ksft_print_msg("Please check dmesg for more information\n");
+	ksft_test_result_pass("File-backed THP split test done\n");
+	return;
+
+cleanup:
+	umount(tmpfs_loc);
+	rmdir(tmpfs_loc);
+	ksft_exit_fail_msg("Error occurred\n");
+}
+
+bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
+		const char **thp_fs_loc)
+{
+	if (xfs_path) {
+		*thp_fs_loc = xfs_path;
+		return false;
+	}
+
+	*thp_fs_loc = mkdtemp(thp_fs_template);
+
+	if (!*thp_fs_loc)
+		ksft_exit_fail_msg("cannot create temp folder\n");
+
+	return true;
+}
+
+void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
+{
+	int status;
+
+	if (!created_tmp)
+		return;
+
+	status = rmdir(thp_fs_loc);
+	if (status)
+		ksft_exit_fail_msg("cannot remove tmp dir: %s\n",
+				   strerror(errno));
+}
+
+int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
+		char **addr)
+{
+	size_t i;
+	int __attribute__((unused)) dummy = 0;
+
+	srand(time(NULL));
+
+	*fd = open(testfile, O_CREAT | O_RDWR, 0664);
+	if (*fd == -1)
+		ksft_exit_fail_msg("Failed to create a file at %s\n", testfile);
+
+	for (i = 0; i < fd_size; i++) {
+		unsigned char byte = (unsigned char)i;
+
+		write(*fd, &byte, sizeof(byte));
+	}
+	close(*fd);
+	sync();
+	*fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+	if (*fd == -1) {
+		ksft_perror("open drop_caches");
+		goto err_out_unlink;
+	}
+	if (write(*fd, "3", 1) != 1) {
+		ksft_perror("write to drop_caches");
+		goto err_out_unlink;
+	}
+	close(*fd);
+
+	*fd = open(testfile, O_RDWR);
+	if (*fd == -1) {
+		ksft_perror("Failed to open testfile\n");
+		goto err_out_unlink;
+	}
+
+	*addr = mmap(NULL, fd_size, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
+	if (*addr == (char *)-1) {
+		ksft_perror("cannot mmap");
+		goto err_out_close;
+	}
+	madvise(*addr, fd_size, MADV_HUGEPAGE);
+
+	for (size_t i = 0; i < fd_size; i++)
+		dummy += *(*addr + i);
+
+	if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) {
+		ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n");
+		munmap(*addr, fd_size);
+		close(*fd);
+		unlink(testfile);
+		ksft_test_result_skip("Pagecache folio split skipped\n");
+		return -2;
+	}
+	return 0;
+err_out_close:
+	close(*fd);
+err_out_unlink:
+	unlink(testfile);
+	ksft_exit_fail_msg("Failed to create large pagecache folios\n");
+	return -1;
+}
+
+void split_thp_in_pagecache_to_order(size_t fd_size, int order, const char *fs_loc)
+{
+	int fd;
+	char *addr;
+	size_t i;
+	char testfile[INPUT_MAX];
+	int err = 0;
+
+	err = snprintf(testfile, INPUT_MAX, "%s/test", fs_loc);
+
+	if (err < 0)
+		ksft_exit_fail_msg("cannot generate right test file name\n");
+
+	err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr);
+	if (err)
+		return;
+	err = 0;
+
+	write_debugfs(PID_FMT, getpid(), (uint64_t)addr, (uint64_t)addr + fd_size, order);
+
+	for (i = 0; i < fd_size; i++)
+		if (*(addr + i) != (char)i) {
+			ksft_print_msg("%lu byte corrupted in the file\n", i);
+			err = EXIT_FAILURE;
+			goto out;
+		}
+
+	if (!check_huge_file(addr, 0, pmd_pagesize)) {
+		ksft_print_msg("Still FilePmdMapped not split\n");
+		err = EXIT_FAILURE;
+		goto out;
 	}
 
-	printf("file-backed THP split test done, please check dmesg for more information\n");
+out:
+	munmap(addr, fd_size);
+	close(fd);
+	unlink(testfile);
+	if (err)
+		ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d failed\n", order);
+	ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d passed\n", order);
 }
 
 int main(int argc, char **argv)
 {
+	int i;
+	size_t fd_size;
+	char *optional_xfs_path = NULL;
+	char fs_loc_template[] = "/tmp/thp_fs_XXXXXX";
+	const char *fs_loc;
+	bool created_tmp;
+
+	ksft_print_header();
+
 	if (geteuid() != 0) {
-		printf("Please run the benchmark as root\n");
-		exit(EXIT_FAILURE);
+		ksft_print_msg("Please run the benchmark as root\n");
+		ksft_finished();
 	}
 
+	if (argc > 1)
+		optional_xfs_path = argv[1];
+
+	ksft_set_plan(3+9);
+
 	pagesize = getpagesize();
 	pageshift = ffs(pagesize) - 1;
 	pmd_pagesize = read_pmd_pagesize();
-	if (!pmd_pagesize) {
-		printf("Reading PMD pagesize failed\n");
-		exit(EXIT_FAILURE);
-	}
+	if (!pmd_pagesize)
+		ksft_exit_fail_msg("Reading PMD pagesize failed\n");
+
+	fd_size = 2 * pmd_pagesize;
 
 	split_pmd_thp();
 	split_pte_mapped_thp();
 	split_file_backed_thp();
 
+	created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template,
+			&fs_loc);
+	for (i = 8; i >= 0; i--)
+		split_thp_in_pagecache_to_order(fd_size, i, fs_loc);
+	cleanup_thp_fs(fs_loc, created_tmp);
+
+	ksft_finished();
+
 	return 0;
 }
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 622987f12c89..ea7fd8fe2876 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -4,7 +4,7 @@
    Before running this huge pages for each huge page size must have been
    reserved.
    For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must
-   be used.
+   be used. 1GB wouldn't be tested if it isn't available.
    Also shmmax must be increased.
    And you need to run as root to work around some weird permissions in shm.
    And nothing using huge pages should run in parallel.
@@ -26,8 +26,7 @@
 #include <stdarg.h>
 #include <string.h>
 #include "vm_util.h"
-
-#define err(x) perror(x), exit(1)
+#include "../kselftest.h"
 
 #define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
 #define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
@@ -44,11 +43,8 @@
 #define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
 
 #define NUM_PAGESIZES   5
-
 #define NUM_PAGES 4
 
-#define Dprintf(fmt...) // printf(fmt)
-
 unsigned long page_sizes[NUM_PAGESIZES];
 int num_page_sizes;
 
@@ -60,28 +56,15 @@ int ilog2(unsigned long v)
 	return l;
 }
 
-void find_pagesizes(void)
-{
-	glob_t g;
-	int i;
-	glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
-	assert(g.gl_pathc <= NUM_PAGESIZES);
-	for (i = 0; i < g.gl_pathc; i++) {
-		sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
-				&page_sizes[i]);
-		page_sizes[i] <<= 10;
-		printf("Found %luMB\n", page_sizes[i] >> 20);
-	}
-	num_page_sizes = g.gl_pathc;
-	globfree(&g);
-}
-
 void show(unsigned long ps)
 {
 	char buf[100];
+
 	if (ps == getpagesize())
 		return;
-	printf("%luMB: ", ps >> 20);
+
+	ksft_print_msg("%luMB: ", ps >> 20);
+
 	fflush(stdout);
 	snprintf(buf, sizeof buf,
 		"cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
@@ -105,7 +88,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
 	f = fopen(buf, "r");
 	if (!f) {
 		if (warn)
-			printf("missing %s\n", buf);
+			ksft_print_msg("missing %s\n", buf);
 		return 0;
 	}
 	if (getline(&line, &linelen, f) > 0) {
@@ -119,123 +102,143 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
 unsigned long read_free(unsigned long ps)
 {
 	return read_sysfs(ps != getpagesize(),
-			"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
-			ps >> 10);
+			  "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+			  ps >> 10);
 }
 
 void test_mmap(unsigned long size, unsigned flags)
 {
 	char *map;
 	unsigned long before, after;
-	int err;
 
 	before = read_free(size);
 	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
 			MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
 
-	if (map == (char *)-1) err("mmap");
 	memset(map, 0xff, size*NUM_PAGES);
 	after = read_free(size);
-	Dprintf("before %lu after %lu diff %ld size %lu\n",
-		before, after, before - after, size);
-	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+
 	show(size);
-	err = munmap(map, size * NUM_PAGES);
-	assert(!err);
+	ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
+			 "%s mmap\n", __func__);
+
+	if (munmap(map, size * NUM_PAGES))
+		ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno));
 }
 
 void test_shmget(unsigned long size, unsigned flags)
 {
 	int id;
 	unsigned long before, after;
-	int err;
+	struct shm_info i;
+	char *map;
 
 	before = read_free(size);
 	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
-	if (id < 0) err("shmget");
-
-	struct shm_info i;
-	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
-	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+	if (id < 0) {
+		if (errno == EPERM) {
+			ksft_test_result_skip("shmget requires root privileges: %s\n",
+					      strerror(errno));
+			return;
+		}
+		ksft_exit_fail_msg("shmget: %s\n", strerror(errno));
+	}
 
+	if (shmctl(id, SHM_INFO, (void *)&i) < 0)
+		ksft_exit_fail_msg("shmctl: %s\n", strerror(errno));
 
-	Dprintf("id %d\n", id);
-	char *map = shmat(id, NULL, 0600);
-	if (map == (char*)-1) err("shmat");
+	map = shmat(id, NULL, 0600);
+	if (map == MAP_FAILED)
+		ksft_exit_fail_msg("shmat: %s\n", strerror(errno));
 
 	shmctl(id, IPC_RMID, NULL);
 
 	memset(map, 0xff, size*NUM_PAGES);
 	after = read_free(size);
 
-	Dprintf("before %lu after %lu diff %ld size %lu\n",
-		before, after, before - after, size);
-	assert(size == getpagesize() || (before - after) == NUM_PAGES);
 	show(size);
-	err = shmdt(map);
-	assert(!err);
+	ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
+			 "%s: mmap\n", __func__);
+	if (shmdt(map))
+		ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno));
 }
 
-void sanity_checks(void)
+void find_pagesizes(void)
 {
-	int i;
 	unsigned long largest = getpagesize();
+	int i;
+	glob_t g;
 
-	for (i = 0; i < num_page_sizes; i++) {
-		if (page_sizes[i] > largest)
+	glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+	assert(g.gl_pathc <= NUM_PAGESIZES);
+	for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) {
+		sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+				&page_sizes[num_page_sizes]);
+		page_sizes[num_page_sizes] <<= 10;
+		ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20);
+
+		if (page_sizes[num_page_sizes] > largest)
 			largest = page_sizes[i];
 
-		if (read_free(page_sizes[i]) < NUM_PAGES) {
-			printf("Not enough huge pages for page size %lu MB, need %u\n",
-				page_sizes[i] >> 20,
-				NUM_PAGES);
-			exit(0);
-		}
+		if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES)
+			num_page_sizes++;
+		else
+			ksft_print_msg("SKIP for size %lu MB as not enough huge pages, need %u\n",
+				       page_sizes[num_page_sizes] >> 20, NUM_PAGES);
 	}
+	globfree(&g);
 
-	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
-		printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
-		exit(0);
-	}
+	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+		ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
+				   largest * NUM_PAGES);
 
 #if defined(__x86_64__)
 	if (largest != 1U<<30) {
-		printf("No GB pages available on x86-64\n"
-		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
-		exit(0);
+		ksft_exit_fail_msg("No GB pages available on x86-64\n"
+				   "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
 	}
 #endif
 }
 
 int main(void)
 {
-	int i;
 	unsigned default_hps = default_huge_page_size();
+	int i;
+
+	ksft_print_header();
 
 	find_pagesizes();
 
-	sanity_checks();
+	if (!num_page_sizes)
+		ksft_finished();
+
+	ksft_set_plan(2 * num_page_sizes + 3);
 
 	for (i = 0; i < num_page_sizes; i++) {
 		unsigned long ps = page_sizes[i];
 		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
-		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+
+		ksft_print_msg("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
 		test_mmap(ps, MAP_HUGETLB | arg);
 	}
-	printf("Testing default huge mmap\n");
+
+	ksft_print_msg("Testing default huge mmap\n");
 	test_mmap(default_hps, MAP_HUGETLB);
 
-	puts("Testing non-huge shmget");
+	ksft_print_msg("Testing non-huge shmget\n");
 	test_shmget(getpagesize(), 0);
 
 	for (i = 0; i < num_page_sizes; i++) {
 		unsigned long ps = page_sizes[i];
 		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
-		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+		ksft_print_msg("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
 		test_shmget(ps, SHM_HUGETLB | arg);
 	}
-	puts("default huge shmget");
+
+	ksft_print_msg("default huge shmget\n");
 	test_shmget(default_hps, SHM_HUGETLB);
 
-	return 0;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index c61fb9350b8c..68201192e37c 100644
--- a/tools/testing/selftests/mm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -16,6 +16,7 @@
 #include <string.h>
 #include <sys/mman.h>
 #include "vm_util.h"
+#include "../kselftest.h"
 
 int backing_fd = -1;
 int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
@@ -34,6 +35,8 @@ int main(int argc, char **argv)
 	int pagemap_fd;
 	int duration = 0;
 
+	ksft_print_header();
+
 	ram = sysconf(_SC_PHYS_PAGES);
 	if (ram > SIZE_MAX / psize() / 4)
 		ram = SIZE_MAX / 4;
@@ -43,7 +46,8 @@ int main(int argc, char **argv)
 
 	while (++i < argc) {
 		if (!strcmp(argv[i], "-h"))
-			errx(1, "usage: %s [-f <filename>] [-d <duration>] [size in MiB]", argv[0]);
+			ksft_exit_fail_msg("usage: %s [-f <filename>] [-d <duration>] [size in MiB]\n",
+					   argv[0]);
 		else if (!strcmp(argv[i], "-f"))
 			name = argv[++i];
 		else if (!strcmp(argv[i], "-d"))
@@ -52,10 +56,12 @@ int main(int argc, char **argv)
 			len = atoll(argv[i]) << 20;
 	}
 
+	ksft_set_plan(1);
+
 	if (name) {
 		backing_fd = open(name, O_RDWR);
 		if (backing_fd == -1)
-			errx(2, "open %s", name);
+			ksft_exit_fail_msg("open %s\n", name);
 		mmap_flags = MAP_SHARED;
 	}
 
@@ -65,21 +71,21 @@ int main(int argc, char **argv)
 
 	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
 	if (pagemap_fd < 0)
-		err(2, "open pagemap");
+		ksft_exit_fail_msg("open pagemap\n");
 
 	len -= len % HPAGE_SIZE;
 	ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
 	if (ptr == MAP_FAILED)
-		err(2, "initial mmap");
+		ksft_exit_fail_msg("initial mmap");
 	ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
 
 	if (madvise(ptr, len, MADV_HUGEPAGE))
-		err(2, "MADV_HUGEPAGE");
+		ksft_exit_fail_msg("MADV_HUGEPAGE");
 
 	map_len = ram >> (HPAGE_SHIFT - 1);
 	map = malloc(map_len);
 	if (!map)
-		errx(2, "map malloc");
+		ksft_exit_fail_msg("map malloc\n");
 
 	clock_gettime(CLOCK_MONOTONIC, &start);
 
@@ -103,7 +109,7 @@ int main(int argc, char **argv)
 				if (idx >= map_len) {
 					map = realloc(map, idx + 1);
 					if (!map)
-						errx(2, "map realloc");
+						ksft_exit_fail_msg("map realloc\n");
 					memset(map + map_len, 0, idx + 1 - map_len);
 					map_len = idx + 1;
 				}
@@ -114,17 +120,19 @@ int main(int argc, char **argv)
 
 			/* split transhuge page, keep last page */
 			if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED))
-				err(2, "MADV_DONTNEED");
+				ksft_exit_fail_msg("MADV_DONTNEED");
 		}
 		clock_gettime(CLOCK_MONOTONIC, &b);
 		s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
 
-		warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
-		      "%4d succeed, %4d failed, %4d different pages",
-		      s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
-		      nr_succeed, nr_failed, nr_pages);
+		ksft_print_msg("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+			       "%4d succeed, %4d failed, %4d different pages\n",
+			       s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+			       nr_succeed, nr_failed, nr_pages);
 
-		if (duration > 0 && b.tv_sec - start.tv_sec >= duration)
-			return 0;
+		if (duration > 0 && b.tv_sec - start.tv_sec >= duration) {
+			ksft_test_result_pass("Completed\n");
+			ksft_finished();
+		}
 	}
 }
diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
index b0ac0ec2356d..7ad6ba660c7d 100644
--- a/tools/testing/selftests/mm/uffd-common.c
+++ b/tools/testing/selftests/mm/uffd-common.c
@@ -18,6 +18,7 @@ bool test_uffdio_wp = true;
 unsigned long long *count_verify;
 uffd_test_ops_t *uffd_test_ops;
 uffd_test_case_ops_t *uffd_test_case_ops;
+atomic_bool ready_for_fork;
 
 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
 {
@@ -518,6 +519,8 @@ void *uffd_poll_thread(void *arg)
 	pollfd[1].fd = pipefd[cpu*2];
 	pollfd[1].events = POLLIN;
 
+	ready_for_fork = true;
+
 	for (;;) {
 		ret = poll(pollfd, 2, -1);
 		if (ret <= 0) {
diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h
index cb055282c89c..cc5629c3d2aa 100644
--- a/tools/testing/selftests/mm/uffd-common.h
+++ b/tools/testing/selftests/mm/uffd-common.h
@@ -32,6 +32,7 @@
 #include <inttypes.h>
 #include <stdint.h>
 #include <sys/random.h>
+#include <stdatomic.h>
 
 #include "../kselftest.h"
 #include "vm_util.h"
@@ -103,6 +104,7 @@ extern bool map_shared;
 extern bool test_uffdio_wp;
 extern unsigned long long *count_verify;
 extern volatile bool test_uffdio_copy_eexist;
+extern atomic_bool ready_for_fork;
 
 extern uffd_test_ops_t anon_uffd_test_ops;
 extern uffd_test_ops_t shmem_uffd_test_ops;
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index 7e83829bbb33..f78bab0f3d45 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -441,6 +441,12 @@ int main(int argc, char **argv)
 	parse_test_type_arg(argv[1]);
 	bytes = atol(argv[2]) * 1024 * 1024;
 
+	if (test_type == TEST_HUGETLB &&
+	   get_free_hugepages() < bytes / page_size) {
+		printf("skip: Skipping userfaultfd... not enough hugepages\n");
+		return KSFT_SKIP;
+	}
+
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 
 	nr_pages_per_cpu = bytes / page_size / nr_cpus;
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 2b9f8cc52639..21ec23206ab4 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -775,6 +775,8 @@ static void uffd_sigbus_test_common(bool wp)
 	char c;
 	struct uffd_args args = { 0 };
 
+	ready_for_fork = false;
+
 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
 
 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
@@ -790,6 +792,9 @@ static void uffd_sigbus_test_common(bool wp)
 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
 		err("uffd_poll_thread create");
 
+	while (!ready_for_fork)
+		; /* Wait for the poll_thread to start executing before forking */
+
 	pid = fork();
 	if (pid < 0)
 		err("fork");
@@ -829,6 +834,8 @@ static void uffd_events_test_common(bool wp)
 	char c;
 	struct uffd_args args = { 0 };
 
+	ready_for_fork = false;
+
 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
 			  true, wp, false))
@@ -838,6 +845,9 @@ static void uffd_events_test_common(bool wp)
 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
 		err("uffd_poll_thread create");
 
+	while (!ready_for_fork)
+		; /* Wait for the poll_thread to start executing before forking */
+
 	pid = fork();
 	if (pid < 0)
 		err("fork");
@@ -1427,7 +1437,8 @@ uffd_test_case_t uffd_tests[] = {
 		.uffd_fn = uffd_sigbus_wp_test,
 		.mem_targets = MEM_ALL,
 		.uffd_feature_required = UFFD_FEATURE_SIGBUS |
-		UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP,
+		UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+		UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
 	},
 	{
 		.name = "events",
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index bae0ceaf95b1..7bcf8d48256a 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -12,6 +12,7 @@
 #include <errno.h>
 #include <sys/mman.h>
 #include <sys/time.h>
+#include "../kselftest.h"
 
 /*
  * Maximum address range mapped with a single mmap()
@@ -68,23 +69,15 @@ static char *hind_addr(void)
 	return (char *) (1UL << bits);
 }
 
-static int validate_addr(char *ptr, int high_addr)
+static void validate_addr(char *ptr, int high_addr)
 {
 	unsigned long addr = (unsigned long) ptr;
 
-	if (high_addr) {
-		if (addr < HIGH_ADDR_MARK) {
-			printf("Bad address %lx\n", addr);
-			return 1;
-		}
-		return 0;
-	}
+	if (high_addr && addr < HIGH_ADDR_MARK)
+		ksft_exit_fail_msg("Bad address %lx\n", addr);
 
-	if (addr > HIGH_ADDR_MARK) {
-		printf("Bad address %lx\n", addr);
-		return 1;
-	}
-	return 0;
+	if (addr > HIGH_ADDR_MARK)
+		ksft_exit_fail_msg("Bad address %lx\n", addr);
 }
 
 static int validate_lower_address_hint(void)
@@ -107,23 +100,29 @@ int main(int argc, char *argv[])
 	char *hint;
 	unsigned long i, lchunks, hchunks;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	for (i = 0; i < NR_CHUNKS_LOW; i++) {
 		ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
 					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 
 		if (ptr[i] == MAP_FAILED) {
-			if (validate_lower_address_hint())
-				return 1;
+			if (validate_lower_address_hint()) {
+				ksft_test_result_skip("Memory constraint not fulfilled\n");
+				ksft_finished();
+			}
 			break;
 		}
 
-		if (validate_addr(ptr[i], 0))
-			return 1;
+		validate_addr(ptr[i], 0);
 	}
 	lchunks = i;
 	hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *));
-	if (hptr == NULL)
-		return 1;
+	if (hptr == NULL) {
+		ksft_test_result_skip("Memory constraint not fulfilled\n");
+		ksft_finished();
+	}
 
 	for (i = 0; i < NR_CHUNKS_HIGH; i++) {
 		hint = hind_addr();
@@ -133,8 +132,7 @@ int main(int argc, char *argv[])
 		if (hptr[i] == MAP_FAILED)
 			break;
 
-		if (validate_addr(hptr[i], 1))
-			return 1;
+		validate_addr(hptr[i], 1);
 	}
 	hchunks = i;
 
@@ -145,5 +143,7 @@ int main(int argc, char *argv[])
 		munmap(hptr[i], MAP_CHUNK_SIZE);
 
 	free(hptr);
-	return 0;
+
+	ksft_test_result_pass("Test\n");
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 05736c615734..5a62530da3b5 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -232,17 +232,17 @@ int64_t allocate_transhuge(void *ptr, int pagemap_fd)
 	if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
 		 MAP_FIXED | MAP_ANONYMOUS |
 		 MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
-		errx(2, "mmap transhuge");
+		ksft_exit_fail_msg("mmap transhuge\n");
 
 	if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
-		err(2, "MADV_HUGEPAGE");
+		ksft_exit_fail_msg("MADV_HUGEPAGE\n");
 
 	/* allocate transparent huge page */
 	*(volatile void **)ptr = ptr;
 
 	if (pread(pagemap_fd, ent, sizeof(ent),
 		  (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent))
-		err(2, "read pagemap");
+		ksft_exit_fail_msg("read pagemap\n");
 
 	if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
 	    PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index c02990bbd56f..9007c420d52c 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -3,7 +3,7 @@
 #include <stdbool.h>
 #include <sys/mman.h>
 #include <err.h>
-#include <string.h> /* ffsl() */
+#include <strings.h> /* ffsl() */
 #include <unistd.h> /* _SC_PAGESIZE */
 
 #define BIT_ULL(nr)                   (1ULL << (nr))
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 2f9d378edec3..49a56eb5d036 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -2,9 +2,9 @@
 bind_bhash
 bind_timewait
 bind_wildcard
-csum
 cmsg_sender
 diag_uid
+epoll_busy_poll
 fin_ack_lat
 gro
 hwtstamp_config
@@ -31,6 +31,7 @@ reuseport_dualstack
 rxtimestamp
 sctp_hello
 scm_pidfd
+scm_rights
 sk_bind_sendto_listen
 sk_connect_zero_addr
 socket
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 7b6918d5f4af..bd01e4a0be2c 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -20,7 +20,6 @@ TEST_PROGS += reuseaddr_ports_exhausted.sh
 TEST_PROGS += txtimestamp.sh
 TEST_PROGS += vrf-xfrm-tests.sh
 TEST_PROGS += rxtimestamp.sh
-TEST_PROGS += devlink_port_split.py
 TEST_PROGS += drop_monitor_tests.sh
 TEST_PROGS += vrf_route_leaking.sh
 TEST_PROGS += bareudp.sh
@@ -35,6 +34,7 @@ TEST_PROGS += gre_gso.sh
 TEST_PROGS += cmsg_so_mark.sh
 TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh
 TEST_PROGS += netns-name.sh
+TEST_PROGS += nl_netdev.py
 TEST_PROGS += srv6_end_dt46_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt4_l3vpn_test.sh
 TEST_PROGS += srv6_end_dt6_l3vpn_test.sh
@@ -67,7 +67,7 @@ TEST_GEN_FILES += ipsec
 TEST_GEN_FILES += ioam6_parser
 TEST_GEN_FILES += gro
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
-TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap
+TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_poll
 TEST_GEN_FILES += toeplitz
 TEST_GEN_FILES += cmsg_sender
 TEST_GEN_FILES += stress_reuseport_listen
@@ -81,9 +81,6 @@ TEST_PROGS += test_ingress_egress_chaining.sh
 TEST_GEN_PROGS += so_incoming_cpu
 TEST_PROGS += sctp_vrf.sh
 TEST_GEN_FILES += sctp_hello
-TEST_GEN_FILES += csum
-TEST_GEN_FILES += nat6to4.o
-TEST_GEN_FILES += xdp_dummy.o
 TEST_GEN_FILES += ip_local_port_range
 TEST_GEN_FILES += bind_wildcard
 TEST_PROGS += test_vxlan_mdb.sh
@@ -93,63 +90,22 @@ TEST_PROGS += test_bridge_backup_port.sh
 TEST_PROGS += fdb_flush.sh
 TEST_PROGS += fq_band_pktlimit.sh
 TEST_PROGS += vlan_hw_filter.sh
+TEST_PROGS += bpf_offload.py
 
 TEST_FILES := settings
 TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
 
+TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c))
+
 TEST_INCLUDES := forwarding/lib.sh
 
 include ../lib.mk
 
+$(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
 $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
 $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
 $(OUTPUT)/bind_bhash: LDLIBS += -lpthread
 $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/
 
-# Rules to generate bpf objs
-CLANG ?= clang
-SCRATCH_DIR := $(OUTPUT)/tools
-BUILD_DIR := $(SCRATCH_DIR)/build
-BPFDIR := $(abspath ../../../lib/bpf)
-APIDIR := $(abspath ../../../include/uapi)
-
-CCINCLUDE += -I../bpf
-CCINCLUDE += -I../../../../usr/include/
-CCINCLUDE += -I$(SCRATCH_DIR)/include
-
-BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
-
-MAKE_DIRS := $(BUILD_DIR)/libbpf
-$(MAKE_DIRS):
-	mkdir -p $@
-
-# Get Clang's default includes on this system, as opposed to those seen by
-# '--target=bpf'. This fixes "missing" files on some architectures/distros,
-# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
-#
-# Use '-idirafter': Don't interfere with include mechanics except where the
-# build would have failed anyways.
-define get_sys_includes
-$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
-	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
-$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
-endef
-
-ifneq ($(CROSS_COMPILE),)
-CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
-endif
-
-CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
-
-$(OUTPUT)/nat6to4.o $(OUTPUT)/xdp_dummy.o: $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS)
-	$(CLANG) -O2 --target=bpf -c $< $(CCINCLUDE) $(CLANG_SYS_INCLUDES) -o $@
-
-$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
-	   $(APIDIR)/linux/bpf.h					       \
-	   | $(BUILD_DIR)/libbpf
-	$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/     \
-		    EXTRA_CFLAGS='-g -O0'				       \
-		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
-
-EXTRA_CLEAN := $(SCRATCH_DIR)
+include bpf.mk
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
index 221c387a7d7f..3b83c797650d 100644
--- a/tools/testing/selftests/net/af_unix/Makefile
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -1,4 +1,4 @@
 CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd
+TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c
new file mode 100644
index 000000000000..bab606c9f1eb
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(scm_rights)
+{
+	int fd[16];
+};
+
+FIXTURE_VARIANT(scm_rights)
+{
+	char name[16];
+	int type;
+	int flags;
+	bool test_listener;
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, dgram)
+{
+	.name = "UNIX ",
+	.type = SOCK_DGRAM,
+	.flags = 0,
+	.test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = 0,
+	.test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_oob)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = MSG_OOB,
+	.test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = 0,
+	.test_listener = true,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob)
+{
+	.name = "UNIX-STREAM ",
+	.type = SOCK_STREAM,
+	.flags = MSG_OOB,
+	.test_listener = true,
+};
+
+static int count_sockets(struct __test_metadata *_metadata,
+			 const FIXTURE_VARIANT(scm_rights) *variant)
+{
+	int sockets = -1, len, ret;
+	char *line = NULL;
+	size_t unused;
+	FILE *f;
+
+	f = fopen("/proc/net/protocols", "r");
+	ASSERT_NE(NULL, f);
+
+	len = strlen(variant->name);
+
+	while (getline(&line, &unused, f) != -1) {
+		int unused2;
+
+		if (strncmp(line, variant->name, len))
+			continue;
+
+		ret = sscanf(line + len, "%d %d", &unused2, &sockets);
+		ASSERT_EQ(2, ret);
+
+		break;
+	}
+
+	free(line);
+
+	ret = fclose(f);
+	ASSERT_EQ(0, ret);
+
+	return sockets;
+}
+
+FIXTURE_SETUP(scm_rights)
+{
+	int ret;
+
+	ret = unshare(CLONE_NEWNET);
+	ASSERT_EQ(0, ret);
+
+	ret = count_sockets(_metadata, variant);
+	ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(scm_rights)
+{
+	int ret;
+
+	sleep(1);
+
+	ret = count_sockets(_metadata, variant);
+	ASSERT_EQ(0, ret);
+}
+
+static void create_listeners(struct __test_metadata *_metadata,
+			     FIXTURE_DATA(scm_rights) *self,
+			     int n)
+{
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	socklen_t addrlen;
+	int i, ret;
+
+	for (i = 0; i < n * 2; i += 2) {
+		self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0);
+		ASSERT_LE(0, self->fd[i]);
+
+		addrlen = sizeof(addr.sun_family);
+		ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen);
+		ASSERT_EQ(0, ret);
+
+		ret = listen(self->fd[i], -1);
+		ASSERT_EQ(0, ret);
+
+		addrlen = sizeof(addr);
+		ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen);
+		ASSERT_EQ(0, ret);
+
+		self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0);
+		ASSERT_LE(0, self->fd[i + 1]);
+
+		ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen);
+		ASSERT_EQ(0, ret);
+	}
+}
+
+static void create_socketpairs(struct __test_metadata *_metadata,
+			       FIXTURE_DATA(scm_rights) *self,
+			       const FIXTURE_VARIANT(scm_rights) *variant,
+			       int n)
+{
+	int i, ret;
+
+	ASSERT_GE(sizeof(self->fd) / sizeof(int), n);
+
+	for (i = 0; i < n * 2; i += 2) {
+		ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i);
+		ASSERT_EQ(0, ret);
+	}
+}
+
+static void __create_sockets(struct __test_metadata *_metadata,
+			     FIXTURE_DATA(scm_rights) *self,
+			     const FIXTURE_VARIANT(scm_rights) *variant,
+			     int n)
+{
+	if (variant->test_listener)
+		create_listeners(_metadata, self, n);
+	else
+		create_socketpairs(_metadata, self, variant, n);
+}
+
+static void __close_sockets(struct __test_metadata *_metadata,
+			    FIXTURE_DATA(scm_rights) *self,
+			    int n)
+{
+	int i, ret;
+
+	ASSERT_GE(sizeof(self->fd) / sizeof(int), n);
+
+	for (i = 0; i < n * 2; i++) {
+		ret = close(self->fd[i]);
+		ASSERT_EQ(0, ret);
+	}
+}
+
+void __send_fd(struct __test_metadata *_metadata,
+	       const FIXTURE_DATA(scm_rights) *self,
+	       const FIXTURE_VARIANT(scm_rights) *variant,
+	       int inflight, int receiver)
+{
+#define MSG "nop"
+#define MSGLEN 3
+	struct {
+		struct cmsghdr cmsghdr;
+		int fd[2];
+	} cmsg = {
+		.cmsghdr = {
+			.cmsg_len = CMSG_LEN(sizeof(cmsg.fd)),
+			.cmsg_level = SOL_SOCKET,
+			.cmsg_type = SCM_RIGHTS,
+		},
+		.fd = {
+			self->fd[inflight * 2],
+			self->fd[inflight * 2],
+		},
+	};
+	struct iovec iov = {
+		.iov_base = MSG,
+		.iov_len = MSGLEN,
+	};
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = &cmsg,
+		.msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)),
+	};
+	int ret;
+
+	ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags);
+	ASSERT_EQ(MSGLEN, ret);
+}
+
+#define create_sockets(n)					\
+	__create_sockets(_metadata, self, variant, n)
+#define close_sockets(n)					\
+	__close_sockets(_metadata, self, n)
+#define send_fd(inflight, receiver)				\
+	__send_fd(_metadata, self, variant, inflight, receiver)
+
+TEST_F(scm_rights, self_ref)
+{
+	create_sockets(2);
+
+	send_fd(0, 0);
+
+	send_fd(1, 1);
+
+	close_sockets(2);
+}
+
+TEST_F(scm_rights, triangle)
+{
+	create_sockets(6);
+
+	send_fd(0, 1);
+	send_fd(1, 2);
+	send_fd(2, 0);
+
+	send_fd(3, 4);
+	send_fd(4, 5);
+	send_fd(5, 3);
+
+	close_sockets(6);
+}
+
+TEST_F(scm_rights, cross_edge)
+{
+	create_sockets(8);
+
+	send_fd(0, 1);
+	send_fd(1, 2);
+	send_fd(2, 0);
+	send_fd(1, 3);
+	send_fd(3, 2);
+
+	send_fd(4, 5);
+	send_fd(5, 6);
+	send_fd(6, 4);
+	send_fd(5, 7);
+	send_fd(7, 6);
+
+	close_sockets(8);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh
index 75528788cb95..5175a42cbe8a 100755
--- a/tools/testing/selftests/net/amt.sh
+++ b/tools/testing/selftests/net/amt.sh
@@ -210,8 +210,8 @@ check_features()
 
 test_ipv4_forward()
 {
-	RESULT4=$(ip netns exec "${LISTENER}" nc -w 1 -l -u 239.0.0.1 4000)
-	if [ "$RESULT4" == "172.17.0.2" ]; then
+	RESULT4=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP4-LISTEN:4000,readbytes=128 || true)
+	if echo "$RESULT4" | grep -q "172.17.0.2"; then
 		printf "TEST: %-60s  [ OK ]\n" "IPv4 amt multicast forwarding"
 		exit 0
 	else
@@ -222,8 +222,8 @@ test_ipv4_forward()
 
 test_ipv6_forward()
 {
-	RESULT6=$(ip netns exec "${LISTENER}" nc -w 1 -l -u ff0e::5:6 6000)
-	if [ "$RESULT6" == "2001:db8:3::2" ]; then
+	RESULT6=$(ip netns exec "${LISTENER}" timeout 15 socat - UDP6-LISTEN:6000,readbytes=128 || true)
+	if echo "$RESULT6" | grep -q "2001:db8:3::2"; then
 		printf "TEST: %-60s  [ OK ]\n" "IPv6 amt multicast forwarding"
 		exit 0
 	else
@@ -236,14 +236,14 @@ send_mcast4()
 {
 	sleep 2
 	ip netns exec "${SOURCE}" bash -c \
-		'echo 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' &
+		'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' &
 }
 
 send_mcast6()
 {
 	sleep 2
 	ip netns exec "${SOURCE}" bash -c \
-		'echo 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' &
+		'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' &
 }
 
 check_features
diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c
index a2662348cdb1..b7b54d646b93 100644
--- a/tools/testing/selftests/net/bind_wildcard.c
+++ b/tools/testing/selftests/net/bind_wildcard.c
@@ -6,7 +6,9 @@
 
 #include "../kselftest_harness.h"
 
-struct in6_addr in6addr_v4mapped_any = {
+static const __u32 in4addr_any = INADDR_ANY;
+static const __u32 in4addr_loopback = INADDR_LOOPBACK;
+static const struct in6_addr in6addr_v4mapped_any = {
 	.s6_addr = {
 		0, 0, 0, 0,
 		0, 0, 0, 0,
@@ -14,8 +16,7 @@ struct in6_addr in6addr_v4mapped_any = {
 		0, 0, 0, 0
 	}
 };
-
-struct in6_addr in6addr_v4mapped_loopback = {
+static const struct in6_addr in6addr_v4mapped_loopback = {
 	.s6_addr = {
 		0, 0, 0, 0,
 		0, 0, 0, 0,
@@ -24,137 +25,785 @@ struct in6_addr in6addr_v4mapped_loopback = {
 	}
 };
 
+#define NR_SOCKETS 8
+
 FIXTURE(bind_wildcard)
 {
-	struct sockaddr_in addr4;
-	struct sockaddr_in6 addr6;
+	int fd[NR_SOCKETS];
+	socklen_t addrlen[NR_SOCKETS];
+	union {
+		struct sockaddr addr;
+		struct sockaddr_in addr4;
+		struct sockaddr_in6 addr6;
+	} addr[NR_SOCKETS];
 };
 
 FIXTURE_VARIANT(bind_wildcard)
 {
-	const __u32 addr4_const;
-	const struct in6_addr *addr6_const;
-	int expected_errno;
+	sa_family_t family[2];
+	const void *addr[2];
+	bool ipv6_only[2];
+
+	/* 6 bind() calls below follow two bind() for the defined 2 addresses:
+	 *
+	 *   0.0.0.0
+	 *   127.0.0.1
+	 *   ::
+	 *   ::1
+	 *   ::ffff:0.0.0.0
+	 *   ::ffff:127.0.0.1
+	 */
+	int expected_errno[NR_SOCKETS];
+	int expected_reuse_errno[NR_SOCKETS];
+};
+
+/* (IPv4, IPv4) */
+FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local)
+{
+	.family = {AF_INET, AF_INET},
+	.addr = {&in4addr_any, &in4addr_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any)
+{
+	.family = {AF_INET, AF_INET},
+	.addr = {&in4addr_loopback, &in4addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
 };
 
+/* (IPv4, IPv6) */
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any)
 {
-	.addr4_const = INADDR_ANY,
-	.addr6_const = &in6addr_any,
-	.expected_errno = EADDRINUSE,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_any, &in6addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only)
+{
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_any, &in6addr_any},
+	.ipv6_only = {false, true},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local)
 {
-	.addr4_const = INADDR_ANY,
-	.addr6_const = &in6addr_loopback,
-	.expected_errno = 0,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_any, &in6addr_loopback},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any)
 {
-	.addr4_const = INADDR_ANY,
-	.addr6_const = &in6addr_v4mapped_any,
-	.expected_errno = EADDRINUSE,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_any, &in6addr_v4mapped_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local)
 {
-	.addr4_const = INADDR_ANY,
-	.addr6_const = &in6addr_v4mapped_loopback,
-	.expected_errno = EADDRINUSE,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_any, &in6addr_v4mapped_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any)
 {
-	.addr4_const = INADDR_LOOPBACK,
-	.addr6_const = &in6addr_any,
-	.expected_errno = EADDRINUSE,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_loopback, &in6addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only)
+{
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_loopback, &in6addr_any},
+	.ipv6_only = {false, true},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local)
 {
-	.addr4_const = INADDR_LOOPBACK,
-	.addr6_const = &in6addr_loopback,
-	.expected_errno = 0,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_loopback, &in6addr_loopback},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any)
 {
-	.addr4_const = INADDR_LOOPBACK,
-	.addr6_const = &in6addr_v4mapped_any,
-	.expected_errno = EADDRINUSE,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_loopback, &in6addr_v4mapped_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
 };
 
 FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local)
 {
-	.addr4_const = INADDR_LOOPBACK,
-	.addr6_const = &in6addr_v4mapped_loopback,
-	.expected_errno = EADDRINUSE,
+	.family = {AF_INET, AF_INET6},
+	.addr = {&in4addr_loopback, &in6addr_v4mapped_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+/* (IPv6, IPv4) */
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_any, &in4addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
 };
 
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_any, &in4addr_any},
+	.ipv6_only = {true, false},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_any, &in4addr_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_any, &in4addr_loopback},
+	.ipv6_only = {true, false},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_loopback, &in4addr_any},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_loopback, &in4addr_loopback},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_v4mapped_any, &in4addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_v4mapped_any, &in4addr_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_v4mapped_loopback, &in4addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local)
+{
+	.family = {AF_INET6, AF_INET},
+	.addr = {&in6addr_v4mapped_loopback, &in4addr_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+/* (IPv6, IPv6) */
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_any},
+	.ipv6_only = {true, false},
+	.expected_errno = {0, EADDRINUSE,
+			   0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any_only)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_any},
+	.ipv6_only = {false, true},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any_only)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_any},
+	.ipv6_only = {true, true},
+	.expected_errno = {0, EADDRINUSE,
+			   0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 0, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_loopback},
+	.ipv6_only = {true, false},
+	.expected_errno = {0, EADDRINUSE,
+			   0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 0, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_v4mapped_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_v4mapped_any},
+	.ipv6_only = {true, false},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_v4mapped_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_any, &in6addr_v4mapped_loopback},
+	.ipv6_only = {true, false},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_loopback, &in6addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_loopback, &in6addr_any},
+	.ipv6_only = {false, true},
+	.expected_errno = {0, EADDRINUSE,
+			   0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 0, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_loopback, &in6addr_v4mapped_any},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_loopback, &in6addr_v4mapped_loopback},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_any, &in6addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_any, &in6addr_any},
+	.ipv6_only = {false, true},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_any, &in6addr_loopback},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_loopback, &in6addr_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_loopback, &in6addr_any},
+	.ipv6_only = {false, true},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_loopback, &in6addr_loopback},
+	.expected_errno = {0, 0,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any)
+{
+	.family = {AF_INET6, AF_INET6},
+	.addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any},
+	.expected_errno = {0, EADDRINUSE,
+			   EADDRINUSE, EADDRINUSE,
+			   EADDRINUSE, 0,
+			   EADDRINUSE, EADDRINUSE},
+	.expected_reuse_errno = {0, 0,
+				 EADDRINUSE, EADDRINUSE,
+				 EADDRINUSE, 0,
+				 EADDRINUSE, EADDRINUSE},
+};
+
+static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i,
+		       int family, const void *addr_const)
+{
+	if (family == AF_INET) {
+		struct sockaddr_in *addr4 = &self->addr[i].addr4;
+		const __u32 *addr4_const = addr_const;
+
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(0);
+		addr4->sin_addr.s_addr = htonl(*addr4_const);
+
+		self->addrlen[i] = sizeof(struct sockaddr_in);
+	} else {
+		struct sockaddr_in6 *addr6 = &self->addr[i].addr6;
+		const struct in6_addr *addr6_const = addr_const;
+
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(0);
+		addr6->sin6_addr = *addr6_const;
+
+		self->addrlen[i] = sizeof(struct sockaddr_in6);
+	}
+}
+
 FIXTURE_SETUP(bind_wildcard)
 {
-	self->addr4.sin_family = AF_INET;
-	self->addr4.sin_port = htons(0);
-	self->addr4.sin_addr.s_addr = htonl(variant->addr4_const);
+	setup_addr(self, 0, variant->family[0], variant->addr[0]);
+	setup_addr(self, 1, variant->family[1], variant->addr[1]);
+
+	setup_addr(self, 2, AF_INET, &in4addr_any);
+	setup_addr(self, 3, AF_INET, &in4addr_loopback);
 
-	self->addr6.sin6_family = AF_INET6;
-	self->addr6.sin6_port = htons(0);
-	self->addr6.sin6_addr = *variant->addr6_const;
+	setup_addr(self, 4, AF_INET6, &in6addr_any);
+	setup_addr(self, 5, AF_INET6, &in6addr_loopback);
+	setup_addr(self, 6, AF_INET6, &in6addr_v4mapped_any);
+	setup_addr(self, 7, AF_INET6, &in6addr_v4mapped_loopback);
 }
 
 FIXTURE_TEARDOWN(bind_wildcard)
 {
+	int i;
+
+	for (i = 0; i < NR_SOCKETS; i++)
+		close(self->fd[i]);
 }
 
-void bind_sockets(struct __test_metadata *_metadata,
-		  FIXTURE_DATA(bind_wildcard) *self,
-		  int expected_errno,
-		  struct sockaddr *addr1, socklen_t addrlen1,
-		  struct sockaddr *addr2, socklen_t addrlen2)
+void bind_socket(struct __test_metadata *_metadata,
+		 FIXTURE_DATA(bind_wildcard) *self,
+		 const FIXTURE_VARIANT(bind_wildcard) *variant,
+		 int i, int reuse)
 {
-	int fd[2];
 	int ret;
 
-	fd[0] = socket(addr1->sa_family, SOCK_STREAM, 0);
-	ASSERT_GT(fd[0], 0);
+	self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0);
+	ASSERT_GT(self->fd[i], 0);
 
-	ret = bind(fd[0], addr1, addrlen1);
-	ASSERT_EQ(ret, 0);
+	if (i < 2 && variant->ipv6_only[i]) {
+		ret = setsockopt(self->fd[i], SOL_IPV6, IPV6_V6ONLY, &(int){1}, sizeof(int));
+		ASSERT_EQ(ret, 0);
+	}
 
-	ret = getsockname(fd[0], addr1, &addrlen1);
-	ASSERT_EQ(ret, 0);
+	if (i < 2 && reuse) {
+		ret = setsockopt(self->fd[i], SOL_SOCKET, reuse, &(int){1}, sizeof(int));
+		ASSERT_EQ(ret, 0);
+	}
 
-	((struct sockaddr_in *)addr2)->sin_port = ((struct sockaddr_in *)addr1)->sin_port;
+	self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port;
 
-	fd[1] = socket(addr2->sa_family, SOCK_STREAM, 0);
-	ASSERT_GT(fd[1], 0);
+	ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]);
 
-	ret = bind(fd[1], addr2, addrlen2);
-	if (expected_errno) {
-		ASSERT_EQ(ret, -1);
-		ASSERT_EQ(errno, expected_errno);
+	if (reuse) {
+		if (variant->expected_reuse_errno[i]) {
+			ASSERT_EQ(ret, -1);
+			ASSERT_EQ(errno, variant->expected_reuse_errno[i]);
+		} else {
+			ASSERT_EQ(ret, 0);
+		}
 	} else {
+		if (variant->expected_errno[i]) {
+			ASSERT_EQ(ret, -1);
+			ASSERT_EQ(errno, variant->expected_errno[i]);
+		} else {
+			ASSERT_EQ(ret, 0);
+		}
+	}
+
+	if (i == 0) {
+		ret = getsockname(self->fd[0], &self->addr[0].addr, &self->addrlen[0]);
 		ASSERT_EQ(ret, 0);
 	}
+}
 
-	close(fd[1]);
-	close(fd[0]);
+TEST_F(bind_wildcard, plain)
+{
+	int i;
+
+	for (i = 0; i < NR_SOCKETS; i++)
+		bind_socket(_metadata, self, variant, i, 0);
 }
 
-TEST_F(bind_wildcard, v4_v6)
+TEST_F(bind_wildcard, reuseaddr)
 {
-	bind_sockets(_metadata, self, variant->expected_errno,
-		     (struct sockaddr *)&self->addr4, sizeof(self->addr4),
-		     (struct sockaddr *)&self->addr6, sizeof(self->addr6));
+	int i;
+
+	for (i = 0; i < NR_SOCKETS; i++)
+		bind_socket(_metadata, self, variant, i, SO_REUSEADDR);
 }
 
-TEST_F(bind_wildcard, v6_v4)
+TEST_F(bind_wildcard, reuseport)
 {
-	bind_sockets(_metadata, self, variant->expected_errno,
-		     (struct sockaddr *)&self->addr6, sizeof(self->addr6),
-		     (struct sockaddr *)&self->addr4, sizeof(self->addr4));
+	int i;
+
+	for (i = 0; i < NR_SOCKETS; i++)
+		bind_socket(_metadata, self, variant, i, SO_REUSEPORT);
 }
 
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/bpf.mk b/tools/testing/selftests/net/bpf.mk
new file mode 100644
index 000000000000..a4f6755dd894
--- /dev/null
+++ b/tools/testing/selftests/net/bpf.mk
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: GPL-2.0
+# Rules to generate bpf objs
+CLANG ?= clang
+SCRATCH_DIR := $(OUTPUT)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+BPFDIR := $(top_srcdir)/tools/lib/bpf
+APIDIR := $(top_srcdir)/tools/include/uapi
+
+CCINCLUDE += -I$(selfdir)/bpf
+CCINCLUDE += -I$(top_srcdir)/usr/include/
+CCINCLUDE += -I$(SCRATCH_DIR)/include
+
+BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
+
+MAKE_DIRS := $(BUILD_DIR)/libbpf
+$(MAKE_DIRS):
+	$(call msg,MKDIR,,$@)
+	$(Q)mkdir -p $@
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '--target=bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
+
+BPF_PROG_OBJS := $(patsubst %.c,$(OUTPUT)/%.o,$(wildcard *.bpf.c))
+
+$(BPF_PROG_OBJS): $(OUTPUT)/%.o : %.c $(BPFOBJ) | $(MAKE_DIRS)
+	$(call msg,BPF_PROG,,$@)
+	$(Q)$(CLANG) -O2 -g --target=bpf $(CCINCLUDE) $(CLANG_SYS_INCLUDES) \
+	-c $< -o $@
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)		       \
+	   $(APIDIR)/linux/bpf.h					       \
+	   | $(BUILD_DIR)/libbpf
+	$(call msg,MAKE,,$@)
+	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
+		    EXTRA_CFLAGS='-g -O0'				       \
+		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
+
+EXTRA_CLEAN += $(SCRATCH_DIR)
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/net/bpf_offload.py
index 6157f884d091..3efe44f6e92a 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/net/bpf_offload.py
@@ -29,6 +29,9 @@ import subprocess
 import time
 import traceback
 
+from lib.py import NetdevSim, NetdevSimDev
+
+
 logfile = None
 log_level = 1
 skip_extack = False
@@ -145,8 +148,10 @@ def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False):
     if JSON:
         params += "%s " % (flags["json"])
 
-    if ns != "":
+    if ns:
         ns = "ip netns exec %s " % (ns)
+    elif ns is None:
+        ns = ""
 
     if include_stderr:
         ret, stdout, stderr = cmd(ns + name + " " + params + args,
@@ -201,11 +206,11 @@ def bpftool_prog_list_wait(expected=0, n_retry=20):
         time.sleep(0.05)
     raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs))
 
-def bpftool_map_list_wait(expected=0, n_retry=20):
+def bpftool_map_list_wait(expected=0, n_retry=20, ns=""):
     for i in range(n_retry):
-        nmaps = len(bpftool_map_list())
-        if nmaps == expected:
-            return
+        maps = bpftool_map_list(ns=ns)
+        if len(maps) == expected:
+            return maps
         time.sleep(0.05)
     raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps))
 
@@ -237,7 +242,7 @@ def tc(args, JSON=True, ns="", fail=True, include_stderr=False):
 def ethtool(dev, opt, args, fail=True):
     return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail)
 
-def bpf_obj(name, sec=".text", path=bpf_test_dir,):
+def bpf_obj(name, sec="xdp", path=bpf_test_dir,):
     return "obj %s sec %s" % (os.path.join(path, name), sec)
 
 def bpf_pinned(name):
@@ -334,72 +339,16 @@ class DebugfsDir:
 
         return dfs
 
-class NetdevSimDev:
+class BpfNetdevSimDev(NetdevSimDev):
     """
     Class for netdevsim bus device and its attributes.
     """
-    @staticmethod
-    def ctrl_write(path, val):
-        fullpath = os.path.join("/sys/bus/netdevsim/", path)
-        try:
-            with open(fullpath, "w") as f:
-                f.write(val)
-        except OSError as e:
-            log("WRITE %s: %r" % (fullpath, val), -e.errno)
-            raise e
-        log("WRITE %s: %r" % (fullpath, val), 0)
-
-    def __init__(self, port_count=1):
-        addr = 0
-        while True:
-            try:
-                self.ctrl_write("new_device", "%u %u" % (addr, port_count))
-            except OSError as e:
-                if e.errno == errno.ENOSPC:
-                    addr += 1
-                    continue
-                raise e
-            break
-        self.addr = addr
-
-        # As probe of netdevsim device might happen from a workqueue,
-        # so wait here until all netdevs appear.
-        self.wait_for_netdevs(port_count)
-
-        ret, out = cmd("udevadm settle", fail=False)
-        if ret:
-            raise Exception("udevadm settle failed")
-        ifnames = self.get_ifnames()
-
+    def __init__(self, port_count=1, ns=None):
+        super().__init__(port_count, ns=ns)
         devs.append(self)
-        self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr
-
-        self.nsims = []
-        for port_index in range(port_count):
-            self.nsims.append(NetdevSim(self, port_index, ifnames[port_index]))
-
-    def get_ifnames(self):
-        ifnames = []
-        listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr)
-        for ifname in listdir:
-            ifnames.append(ifname)
-        ifnames.sort()
-        return ifnames
-
-    def wait_for_netdevs(self, port_count):
-        timeout = 5
-        timeout_start = time.time()
-
-        while True:
-            try:
-                ifnames = self.get_ifnames()
-            except FileNotFoundError as e:
-                ifnames = []
-            if len(ifnames) == port_count:
-                break
-            if time.time() < timeout_start + timeout:
-                continue
-            raise Exception("netdevices did not appear within timeout")
+
+    def _make_port(self, port_index, ifname):
+        return BpfNetdevSim(self, port_index, ifname, self.ns)
 
     def dfs_num_bound_progs(self):
         path = os.path.join(self.dfs_dir, "bpf_bound_progs")
@@ -415,33 +364,20 @@ class NetdevSimDev:
         return progs
 
     def remove(self):
-        self.ctrl_write("del_device", "%u" % (self.addr, ))
+        super().remove()
         devs.remove(self)
 
-    def remove_nsim(self, nsim):
-        self.nsims.remove(nsim)
-        self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ),
-                        "%u" % (nsim.port_index, ))
 
-class NetdevSim:
+class BpfNetdevSim(NetdevSim):
     """
     Class for netdevsim netdevice and its attributes.
     """
 
-    def __init__(self, nsimdev, port_index, ifname):
-        # In case udev renamed the netdev to according to new schema,
-        # check if the name matches the port_index.
-        nsimnamere = re.compile("eni\d+np(\d+)")
-        match = nsimnamere.match(ifname)
-        if match and int(match.groups()[0]) != port_index + 1:
-            raise Exception("netdevice name mismatches the expected one")
-
-        self.nsimdev = nsimdev
-        self.port_index = port_index
-        self.ns = ""
+    def __init__(self, nsimdev, port_index, ifname, ns=None):
+        super().__init__(nsimdev, port_index, ifname, ns=ns)
+
         self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index)
         self.dfs_refresh()
-        _, [self.dev] = ip("link show dev %s" % ifname)
 
     def __getitem__(self, key):
         return self.dev[key]
@@ -468,7 +404,7 @@ class NetdevSim:
         raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs))
 
     def set_ns(self, ns):
-        name = "1" if ns == "" else ns
+        name = ns if ns else "1"
         ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns)
         self.ns = ns
 
@@ -605,7 +541,7 @@ def pin_prog(file_name, idx=0):
     return file_name, bpf_pinned(file_name)
 
 def pin_map(file_name, idx=0, expected=1):
-    maps = bpftool_map_list(expected=expected)
+    maps = bpftool_map_list_wait(expected=expected)
     m = maps[idx]
     bpftool("map pin id %d %s" % (m["id"], file_name))
     files.append(file_name)
@@ -618,7 +554,7 @@ def check_dev_info_removed(prog_file=None, map_file=None):
     ret, err = bpftool("prog show pin %s" % (prog_file), fail=False)
     fail(ret != 0, "failed to show prog with removed device")
 
-    bpftool_map_list(expected=0)
+    bpftool_map_list_wait(expected=0)
     ret, err = bpftool("map show pin %s" % (map_file), fail=False)
     fail(ret == 0, "Showing map with removed device did not fail")
     fail(err["error"].find("No such device") == -1,
@@ -642,7 +578,7 @@ def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False):
     else:
         fail("ifname" in dev.keys(), "Ifname is reported for other ns")
 
-    maps = bpftool_map_list(expected=2, ns=ns)
+    maps = bpftool_map_list_wait(expected=2, ns=ns)
     for m in maps:
         fail("dev" not in m.keys(), "Device parameters not reported")
         fail(dev != m["dev"], "Map's device different than program's")
@@ -744,7 +680,7 @@ def test_multi_prog(simdev, sim, obj, modename, modeid):
     start_test("Test multi-attachment XDP - device remove...")
     simdev.remove()
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.set_ethtool_tc_offloads(True)
     return [simdev, sim]
@@ -809,13 +745,13 @@ try:
     bytecode = bpf_bytecode("1,6 0 0 4294967295,")
 
     start_test("Test destruction of generic XDP...")
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.set_xdp(obj, "generic")
     simdev.remove()
     bpftool_prog_list_wait(expected=0)
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.tc_add_ingress()
 
@@ -967,7 +903,7 @@ try:
     simdev.remove()
     bpftool_prog_list_wait(expected=0)
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.set_ethtool_tc_offloads(True)
 
@@ -976,7 +912,7 @@ try:
     simdev.remove()
     bpftool_prog_list_wait(expected=0)
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.set_ethtool_tc_offloads(True)
 
@@ -1080,7 +1016,7 @@ try:
     bpftool_prog_list_wait(expected=0)
 
     start_test("Test attempt to use a program for a wrong device...")
-    simdev2 = NetdevSimDev()
+    simdev2 = BpfNetdevSimDev()
     sim2, = simdev2.nsims
     sim2.set_xdp(obj, "offload")
     pin_file, pinned = pin_prog("/sys/fs/bpf/tmp")
@@ -1169,7 +1105,7 @@ try:
     clean_up()
     bpftool_prog_list_wait(expected=0)
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     map_obj = bpf_obj("sample_map_ret0.bpf.o")
     start_test("Test loading program with maps...")
@@ -1201,12 +1137,12 @@ try:
     clean_up()
     bpftool_prog_list_wait(expected=0)
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
 
     start_test("Test map update (no flags)...")
     sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
-    maps = bpftool_map_list(expected=2)
+    maps = bpftool_map_list_wait(expected=2)
     array = maps[0] if maps[0]["type"] == "array" else maps[1]
     htab = maps[0] if maps[0]["type"] == "hash" else maps[1]
     for m in maps:
@@ -1285,14 +1221,14 @@ try:
     bpftool_map_list_wait(expected=0)
     simdev.remove()
 
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
     simdev.remove()
     bpftool_map_list_wait(expected=0)
 
     start_test("Test map creation fail path...")
-    simdev = NetdevSimDev()
+    simdev = BpfNetdevSimDev()
     sim, = simdev.nsims
     sim.dfs["bpf_map_accept"] = "N"
     ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False)
@@ -1302,9 +1238,9 @@ try:
     simdev.remove()
 
     start_test("Test multi-dev ASIC program reuse...")
-    simdevA = NetdevSimDev()
+    simdevA = BpfNetdevSimDev()
     simA, = simdevA.nsims
-    simdevB = NetdevSimDev(3)
+    simdevB = BpfNetdevSimDev(3)
     simB1, simB2, simB3 = simdevB.nsims
     sims = (simA, simB1, simB2, simB3)
     simB = (simB1, simB2, simB3)
diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c
index c79e65581dc3..876c2db02a63 100644
--- a/tools/testing/selftests/net/cmsg_sender.c
+++ b/tools/testing/selftests/net/cmsg_sender.c
@@ -260,15 +260,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 			  SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit);
 
 	if (opt.txtime.ena) {
-		struct sock_txtime so_txtime = {
-			.clockid = CLOCK_MONOTONIC,
-		};
 		__u64 txtime;
 
-		if (setsockopt(fd, SOL_SOCKET, SO_TXTIME,
-			       &so_txtime, sizeof(so_txtime)))
-			error(ERN_SOCKOPT, errno, "setsockopt TXTIME");
-
 		txtime = time_start_mono.tv_sec * (1000ULL * 1000 * 1000) +
 			 time_start_mono.tv_nsec +
 			 opt.txtime.delay * 1000;
@@ -284,13 +277,6 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 		memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime));
 	}
 	if (opt.ts.ena) {
-		__u32 val = SOF_TIMESTAMPING_SOFTWARE |
-			    SOF_TIMESTAMPING_OPT_TSONLY;
-
-		if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
-			       &val, sizeof(val)))
-			error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING");
-
 		cmsg = (struct cmsghdr *)(cbuf + cmsg_len);
 		cmsg_len += CMSG_SPACE(sizeof(__u32));
 		if (cbuf_sz < cmsg_len)
@@ -333,16 +319,17 @@ static const char *cs_ts_info2str(unsigned int info)
 	return "unknown";
 }
 
-static void
+static unsigned long
 cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 {
 	struct sock_extended_err *see;
 	struct scm_timestamping *ts;
+	unsigned long ts_seen = 0;
 	struct cmsghdr *cmsg;
 	int i, err;
 
 	if (!opt.ts.ena)
-		return;
+		return 0;
 	msg->msg_control = cbuf;
 	msg->msg_controllen = cbuf_sz;
 
@@ -396,8 +383,11 @@ cs_read_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz)
 			printf(" %5s ts%d %lluus\n",
 			       cs_ts_info2str(see->ee_info),
 			       i, rel_time);
+			ts_seen |= 1 << see->ee_info;
 		}
 	}
+
+	return ts_seen;
 }
 
 static void ca_set_sockopts(int fd)
@@ -422,6 +412,24 @@ static void ca_set_sockopts(int fd)
 	    setsockopt(fd, SOL_SOCKET, SO_PRIORITY,
 		       &opt.sockopt.priority, sizeof(opt.sockopt.priority)))
 		error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY");
+
+	if (opt.txtime.ena) {
+		struct sock_txtime so_txtime = {
+			.clockid = CLOCK_MONOTONIC,
+		};
+
+		if (setsockopt(fd, SOL_SOCKET, SO_TXTIME,
+			       &so_txtime, sizeof(so_txtime)))
+			error(ERN_SOCKOPT, errno, "setsockopt TXTIME");
+	}
+	if (opt.ts.ena) {
+		__u32 val = SOF_TIMESTAMPING_SOFTWARE |
+			SOF_TIMESTAMPING_OPT_TSONLY;
+
+		if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
+			       &val, sizeof(val)))
+			error(ERN_SOCKOPT, errno, "setsockopt TIMESTAMPING");
+	}
 }
 
 int main(int argc, char *argv[])
@@ -509,10 +517,16 @@ int main(int argc, char *argv[])
 	err = ERN_SUCCESS;
 
 	if (opt.ts.ena) {
-		/* Make sure all timestamps have time to loop back */
-		usleep(opt.txtime.delay);
+		unsigned long seen;
+		int i;
 
-		cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf));
+		/* Make sure all timestamps have time to loop back */
+		for (i = 0; i < 40; i++) {
+			seen = cs_read_cmsg(fd, &msg, cbuf, sizeof(cbuf));
+			if (seen & (1 << SCM_TSTAMP_SND))
+				break;
+			usleep(opt.txtime.delay / 20);
+		}
 	}
 
 err_out:
diff --git a/tools/testing/selftests/net/cmsg_time.sh b/tools/testing/selftests/net/cmsg_time.sh
index af85267ad1e3..1d7e756644bc 100755
--- a/tools/testing/selftests/net/cmsg_time.sh
+++ b/tools/testing/selftests/net/cmsg_time.sh
@@ -66,10 +66,13 @@ for i in "-4 $TGT4" "-6 $TGT6"; do
 		 awk '/SND/ { if ($3 > 1000) print "OK"; }')
 	check_result $? "$ts" "OK" "$prot - TXTIME abs"
 
-	ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d 1000 |
+	[ "$KSFT_MACHINE_SLOW" = yes ] && delay=8000 || delay=1000
+
+	ts=$(ip netns exec $NS ./cmsg_sender -p $p $i 1234 -t -d $delay |
 		 awk '/SND/ {snd=$3}
 		      /SCHED/ {sch=$3}
-		      END { if (snd - sch > 500) print "OK"; }')
+		      END { if (snd - sch > '$((delay/2))') print "OK";
+			    else print snd, "-", sch, "<", '$((delay/2))'; }')
 	check_result $? "$ts" "OK" "$prot - TXTIME rel"
     done
 done
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 5e4390cac17e..04de7a6ba6f3 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -30,6 +30,7 @@ CONFIG_IP_GRE=m
 CONFIG_NETFILTER=y
 CONFIG_NETFILTER_ADVANCED=y
 CONFIG_NF_CONNTRACK=m
+CONFIG_IPV6_MROUTE=y
 CONFIG_IPV6_SIT=y
 CONFIG_IP_DCCP=m
 CONFIG_NF_NAT=m
diff --git a/tools/testing/selftests/net/epoll_busy_poll.c b/tools/testing/selftests/net/epoll_busy_poll.c
new file mode 100644
index 000000000000..16e457c2f877
--- /dev/null
+++ b/tools/testing/selftests/net/epoll_busy_poll.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* Basic per-epoll context busy poll test.
+ *
+ * Only tests the ioctls, but should be expanded to test two connected hosts in
+ * the future
+ */
+
+#define _GNU_SOURCE
+
+#include <error.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/capability.h>
+
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "../kselftest_harness.h"
+
+/* if the headers haven't been updated, we need to define some things */
+#if !defined(EPOLL_IOC_TYPE)
+struct epoll_params {
+	uint32_t busy_poll_usecs;
+	uint16_t busy_poll_budget;
+	uint8_t prefer_busy_poll;
+
+	/* pad the struct to a multiple of 64bits */
+	uint8_t __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+#endif
+
+FIXTURE(invalid_fd)
+{
+	int invalid_fd;
+	struct epoll_params params;
+};
+
+FIXTURE_SETUP(invalid_fd)
+{
+	int ret;
+
+	ret = socket(AF_UNIX, SOCK_DGRAM, 0);
+	EXPECT_NE(-1, ret)
+		TH_LOG("error creating unix socket");
+
+	self->invalid_fd = ret;
+}
+
+FIXTURE_TEARDOWN(invalid_fd)
+{
+	int ret;
+
+	ret = close(self->invalid_fd);
+	EXPECT_EQ(0, ret);
+}
+
+TEST_F(invalid_fd, test_invalid_fd)
+{
+	int ret;
+
+	ret = ioctl(self->invalid_fd, EPIOCGPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCGPARAMS on invalid epoll FD should error");
+
+	EXPECT_EQ(ENOTTY, errno)
+		TH_LOG("EPIOCGPARAMS on invalid epoll FD should set errno to ENOTTY");
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	ret = ioctl(self->invalid_fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS on invalid epoll FD should error");
+
+	EXPECT_EQ(ENOTTY, errno)
+		TH_LOG("EPIOCSPARAMS on invalid epoll FD should set errno to ENOTTY");
+}
+
+FIXTURE(epoll_busy_poll)
+{
+	int fd;
+	struct epoll_params params;
+	struct epoll_params *invalid_params;
+	cap_t caps;
+};
+
+FIXTURE_SETUP(epoll_busy_poll)
+{
+	int ret;
+
+	ret = epoll_create1(0);
+	EXPECT_NE(-1, ret)
+		TH_LOG("epoll_create1 failed?");
+
+	self->fd = ret;
+
+	self->caps = cap_get_proc();
+	EXPECT_NE(NULL, self->caps);
+}
+
+FIXTURE_TEARDOWN(epoll_busy_poll)
+{
+	int ret;
+
+	ret = close(self->fd);
+	EXPECT_EQ(0, ret);
+
+	ret = cap_free(self->caps);
+	EXPECT_NE(-1, ret)
+		TH_LOG("unable to free capabilities");
+}
+
+TEST_F(epoll_busy_poll, test_get_params)
+{
+	/* begin by getting the epoll params from the kernel
+	 *
+	 * the default should be default and all fields should be zero'd by the
+	 * kernel, so set params fields to garbage to test this.
+	 */
+	int ret = 0;
+
+	self->params.busy_poll_usecs = 0xff;
+	self->params.busy_poll_budget = 0xff;
+	self->params.prefer_busy_poll = 1;
+	self->params.__pad = 0xf;
+
+	ret = ioctl(self->fd, EPIOCGPARAMS, &self->params);
+	EXPECT_EQ(0, ret)
+		TH_LOG("ioctl EPIOCGPARAMS should succeed");
+
+	EXPECT_EQ(0, self->params.busy_poll_usecs)
+		TH_LOG("EPIOCGPARAMS busy_poll_usecs should have been 0");
+
+	EXPECT_EQ(0, self->params.busy_poll_budget)
+		TH_LOG("EPIOCGPARAMS busy_poll_budget should have been 0");
+
+	EXPECT_EQ(0, self->params.prefer_busy_poll)
+		TH_LOG("EPIOCGPARAMS prefer_busy_poll should have been 0");
+
+	EXPECT_EQ(0, self->params.__pad)
+		TH_LOG("EPIOCGPARAMS __pad should have been 0");
+
+	self->invalid_params = (struct epoll_params *)0xdeadbeef;
+	ret = ioctl(self->fd, EPIOCGPARAMS, self->invalid_params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCGPARAMS should error with invalid params");
+
+	EXPECT_EQ(EFAULT, errno)
+		TH_LOG("EPIOCGPARAMS with invalid params should set errno to EFAULT");
+}
+
+TEST_F(epoll_busy_poll, test_set_invalid)
+{
+	int ret;
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	self->params.__pad = 1;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS non-zero __pad should error");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("EPIOCSPARAMS non-zero __pad errno should be EINVAL");
+
+	self->params.__pad = 0;
+	self->params.busy_poll_usecs = (uint32_t)INT_MAX + 1;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error busy_poll_usecs > S32_MAX");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("EPIOCSPARAMS busy_poll_usecs > S32_MAX errno should be EINVAL");
+
+	self->params.__pad = 0;
+	self->params.busy_poll_usecs = 32;
+	self->params.prefer_busy_poll = 2;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error prefer_busy_poll > 1");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("EPIOCSPARAMS prefer_busy_poll > 1 errno should be EINVAL");
+
+	self->params.__pad = 0;
+	self->params.busy_poll_usecs = 32;
+	self->params.prefer_busy_poll = 1;
+
+	/* set budget well above kernel's NAPI_POLL_WEIGHT of 64 */
+	self->params.busy_poll_budget = UINT16_MAX;
+
+	/* test harness should run with CAP_NET_ADMIN, but let's make sure */
+	cap_flag_value_t tmp;
+
+	ret = cap_get_flag(self->caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &tmp);
+	EXPECT_EQ(0, ret)
+		TH_LOG("unable to get CAP_NET_ADMIN cap flag");
+
+	EXPECT_EQ(CAP_SET, tmp)
+		TH_LOG("expecting CAP_NET_ADMIN to be set for the test harness");
+
+	/* at this point we know CAP_NET_ADMIN is available, so setting the
+	 * params with a busy_poll_budget > NAPI_POLL_WEIGHT should succeed
+	 */
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(0, ret)
+		TH_LOG("EPIOCSPARAMS should allow busy_poll_budget > NAPI_POLL_WEIGHT");
+
+	/* remove CAP_NET_ADMIN from our effective set */
+	cap_value_t net_admin[] = { CAP_NET_ADMIN };
+
+	ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_CLEAR);
+	EXPECT_EQ(0, ret)
+		TH_LOG("couldn't clear CAP_NET_ADMIN");
+
+	ret = cap_set_proc(self->caps);
+	EXPECT_EQ(0, ret)
+		TH_LOG("cap_set_proc should drop CAP_NET_ADMIN");
+
+	/* this is now expected to fail */
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error busy_poll_budget > NAPI_POLL_WEIGHT");
+
+	EXPECT_EQ(EPERM, errno)
+		TH_LOG("EPIOCSPARAMS errno should be EPERM busy_poll_budget > NAPI_POLL_WEIGHT");
+
+	/* restore CAP_NET_ADMIN to our effective set */
+	ret = cap_set_flag(self->caps, CAP_EFFECTIVE, 1, net_admin, CAP_SET);
+	EXPECT_EQ(0, ret)
+		TH_LOG("couldn't restore CAP_NET_ADMIN");
+
+	ret = cap_set_proc(self->caps);
+	EXPECT_EQ(0, ret)
+		TH_LOG("cap_set_proc should set  CAP_NET_ADMIN");
+
+	self->invalid_params = (struct epoll_params *)0xdeadbeef;
+	ret = ioctl(self->fd, EPIOCSPARAMS, self->invalid_params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("EPIOCSPARAMS should error when epoll_params is invalid");
+
+	EXPECT_EQ(EFAULT, errno)
+		TH_LOG("EPIOCSPARAMS should set errno to EFAULT when epoll_params is invalid");
+}
+
+TEST_F(epoll_busy_poll, test_set_and_get_valid)
+{
+	int ret;
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	self->params.busy_poll_usecs = 25;
+	self->params.busy_poll_budget = 16;
+	self->params.prefer_busy_poll = 1;
+
+	ret = ioctl(self->fd, EPIOCSPARAMS, &self->params);
+
+	EXPECT_EQ(0, ret)
+		TH_LOG("EPIOCSPARAMS with valid params should not error");
+
+	/* check that the kernel returns the same values back */
+
+	memset(&self->params, 0, sizeof(struct epoll_params));
+
+	ret = ioctl(self->fd, EPIOCGPARAMS, &self->params);
+
+	EXPECT_EQ(0, ret)
+		TH_LOG("EPIOCGPARAMS should not error");
+
+	EXPECT_EQ(25, self->params.busy_poll_usecs)
+		TH_LOG("params.busy_poll_usecs incorrect");
+
+	EXPECT_EQ(16, self->params.busy_poll_budget)
+		TH_LOG("params.busy_poll_budget incorrect");
+
+	EXPECT_EQ(1, self->params.prefer_busy_poll)
+		TH_LOG("params.prefer_busy_poll incorrect");
+
+	EXPECT_EQ(0, self->params.__pad)
+		TH_LOG("params.__pad was not 0");
+}
+
+TEST_F(epoll_busy_poll, test_invalid_ioctl)
+{
+	int invalid_ioctl = EPIOCGPARAMS + 10;
+	int ret;
+
+	ret = ioctl(self->fd, invalid_ioctl, &self->params);
+
+	EXPECT_EQ(-1, ret)
+		TH_LOG("invalid ioctl should return error");
+
+	EXPECT_EQ(EINVAL, errno)
+		TH_LOG("invalid ioctl should set errno to EINVAL");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
index 51157a5559b7..7c01f58a20de 100755
--- a/tools/testing/selftests/net/fib_rule_tests.sh
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -9,6 +9,7 @@ PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
 
 RTABLE=100
 RTABLE_PEER=101
+RTABLE_VRF=102
 GW_IP4=192.51.100.2
 SRC_IP=192.51.100.3
 GW_IP6=2001:db8:1::2
@@ -17,7 +18,14 @@ SRC_IP6=2001:db8:1::3
 DEV_ADDR=192.51.100.1
 DEV_ADDR6=2001:db8:1::1
 DEV=dummy0
-TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect"
+TESTS="
+	fib_rule6
+	fib_rule4
+	fib_rule6_connect
+	fib_rule4_connect
+	fib_rule6_vrf
+	fib_rule4_vrf
+"
 
 SELFTEST_PATH=""
 
@@ -27,13 +35,18 @@ log_test()
 	local expected=$2
 	local msg="$3"
 
+	$IP rule show | grep -q l3mdev
+	if [ $? -eq 0 ]; then
+		msg="$msg (VRF)"
+	fi
+
 	if [ ${rc} -eq ${expected} ]; then
 		nsuccess=$((nsuccess+1))
-		printf "\n    TEST: %-50s  [ OK ]\n" "${msg}"
+		printf "\n    TEST: %-60s  [ OK ]\n" "${msg}"
 	else
 		ret=1
 		nfail=$((nfail+1))
-		printf "\n    TEST: %-50s  [FAIL]\n" "${msg}"
+		printf "\n    TEST: %-60s  [FAIL]\n" "${msg}"
 		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
 			echo
 			echo "hit enter to continue, 'q' to quit"
@@ -130,6 +143,17 @@ cleanup_peer()
 	ip netns del $peerns
 }
 
+setup_vrf()
+{
+	$IP link add name vrf0 up type vrf table $RTABLE_VRF
+	$IP link set dev $DEV master vrf0
+}
+
+cleanup_vrf()
+{
+	$IP link del dev vrf0
+}
+
 fib_check_iproute_support()
 {
 	ip rule help 2>&1 | grep -q $1
@@ -248,6 +272,13 @@ fib_rule6_test()
 	fi
 }
 
+fib_rule6_vrf_test()
+{
+	setup_vrf
+	fib_rule6_test
+	cleanup_vrf
+}
+
 # Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly
 # taken into account when connecting the socket and when sending packets.
 fib_rule6_connect_test()
@@ -385,6 +416,13 @@ fib_rule4_test()
 	fi
 }
 
+fib_rule4_vrf_test()
+{
+	setup_vrf
+	fib_rule4_test
+	cleanup_vrf
+}
+
 # Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken
 # into account when connecting the socket and when sending packets.
 fib_rule4_connect_test()
@@ -467,6 +505,8 @@ do
 	fib_rule4_test|fib_rule4)		fib_rule4_test;;
 	fib_rule6_connect_test|fib_rule6_connect)	fib_rule6_connect_test;;
 	fib_rule4_connect_test|fib_rule4_connect)	fib_rule4_connect_test;;
+	fib_rule6_vrf_test|fib_rule6_vrf)	fib_rule6_vrf_test;;
+	fib_rule4_vrf_test|fib_rule4_vrf)	fib_rule4_vrf_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 
diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
index 535865b3d1d6..fa7b59ff4029 100644
--- a/tools/testing/selftests/net/forwarding/Makefile
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -15,18 +15,12 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	bridge_vlan_unaware.sh \
 	custom_multipath_hash.sh \
 	dual_vxlan_bridge.sh \
-	ethtool_extended_state.sh \
-	ethtool_mm.sh \
-	ethtool_rmon.sh \
-	ethtool.sh \
 	gre_custom_multipath_hash.sh \
 	gre_inner_v4_multipath.sh \
 	gre_inner_v6_multipath.sh \
 	gre_multipath_nh_res.sh \
 	gre_multipath_nh.sh \
 	gre_multipath.sh \
-	hw_stats_l3.sh \
-	hw_stats_l3_gre.sh \
 	ip6_forward_instats_vrf.sh \
 	ip6gre_custom_multipath_hash.sh \
 	ip6gre_flat_key.sh \
@@ -43,8 +37,8 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	ipip_hier_gre_key.sh \
 	ipip_hier_gre_keys.sh \
 	ipip_hier_gre.sh \
+	lib_sh_test.sh \
 	local_termination.sh \
-	loopback.sh \
 	mirror_gre_bound.sh \
 	mirror_gre_bridge_1d.sh \
 	mirror_gre_bridge_1d_vlan.sh \
@@ -113,7 +107,6 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
 	vxlan_symmetric.sh
 
 TEST_FILES := devlink_lib.sh \
-	ethtool_lib.sh \
 	fib_offload_lib.sh \
 	forwarding.config.sample \
 	ip6gre_lib.sh \
diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README
index b8a2af8fcfb7..7fdb6a9ca543 100644
--- a/tools/testing/selftests/net/forwarding/README
+++ b/tools/testing/selftests/net/forwarding/README
@@ -56,3 +56,36 @@ o Checks shall be added to lib.sh for any external dependencies.
 o Code shall be checked using ShellCheck [1] prior to submission.
 
 1. https://www.shellcheck.net/
+
+Customization
+=============
+
+The forwarding selftests framework uses a number of variables that
+influence its behavior and tools it invokes, and how it invokes them, in
+various ways. A number of these variables can be overridden. The way these
+overridable variables are specified is typically one of the following two
+syntaxes:
+
+	: "${VARIABLE:=default_value}"
+	VARIABLE=${VARIABLE:=default_value}
+
+Any of these variables can be overridden. Notably net/forwarding/lib.sh and
+net/lib.sh contain a number of overridable variables.
+
+One way of overriding these variables is through the environment:
+
+	PAUSE_ON_FAIL=yes ./some_test.sh
+
+The variable NETIFS is special. Since it is an array variable, there is no
+way to pass it through the environment. Its value can instead be given as
+consecutive arguments to the selftest:
+
+	./some_test.sh swp{1..8}
+
+A way to customize variables in a persistent fashion is to create a file
+named forwarding.config in this directory. lib.sh sources the file if
+present, so it can contain any shell code. Typically it will contain
+assignments of variables whose value should be overridden.
+
+forwarding.config.sample is available in the directory as an example of
+how forwarding.config might look.
diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample
index 1fc4f0242fc5..f1ca95e79a65 100644
--- a/tools/testing/selftests/net/forwarding/forwarding.config.sample
+++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample
@@ -3,51 +3,28 @@
 
 ##############################################################################
 # Topology description. p1 looped back to p2, p3 to p4 and so on.
-declare -A NETIFS
 
-NETIFS[p1]=veth0
-NETIFS[p2]=veth1
-NETIFS[p3]=veth2
-NETIFS[p4]=veth3
-NETIFS[p5]=veth4
-NETIFS[p6]=veth5
-NETIFS[p7]=veth6
-NETIFS[p8]=veth7
-NETIFS[p9]=veth8
-NETIFS[p10]=veth9
+NETIFS=(
+	[p1]=veth0
+	[p2]=veth1
+	[p3]=veth2
+	[p4]=veth3
+	[p5]=veth4
+	[p6]=veth5
+	[p7]=veth6
+	[p8]=veth7
+	[p9]=veth8
+	[p10]=veth9
+)
 
 # Port that does not have a cable connected.
 NETIF_NO_CABLE=eth8
 
 ##############################################################################
-# Defines
+# In addition to the topology-related variables, it is also possible to override
+# in this file other variables that net/lib.sh, net/forwarding/lib.sh or other
+# libraries or selftests use. E.g.:
 
-# IPv4 ping utility name
-PING=ping
-# IPv6 ping utility name. Some distributions use 'ping' for IPv6.
 PING6=ping6
-# Packet generator. Some distributions use 'mz'.
 MZ=mausezahn
-# mausezahn delay between transmissions in microseconds.
-MZ_DELAY=0
-# Time to wait after interfaces participating in the test are all UP
 WAIT_TIME=5
-# Whether to pause on failure or not.
-PAUSE_ON_FAIL=no
-# Whether to pause on cleanup or not.
-PAUSE_ON_CLEANUP=no
-# Type of network interface to create
-NETIF_TYPE=veth
-# Whether to create virtual interfaces (veth) or not
-NETIF_CREATE=yes
-# Timeout (in seconds) before ping exits regardless of how many packets have
-# been sent or received
-PING_TIMEOUT=5
-# Minimum ageing_time (in centiseconds) supported by hardware
-LOW_AGEING_TIME=1000
-# Flag for tc match, supposed to be skip_sw/skip_hw which means do not process
-# filter by software/hardware
-TC_FLAG=skip_hw
-# IPv6 traceroute utility name.
-TROUTE6=traceroute6
-
diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh
index 30f36a57bae6..01e62c4ac94d 100644
--- a/tools/testing/selftests/net/forwarding/ipip_lib.sh
+++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh
@@ -141,7 +141,6 @@
 # |               $h2 +       |
 # |     192.0.2.18/28         |
 # +---------------------------+
-source lib.sh
 
 h1_create()
 {
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index e579c2e0c462..112c85c35092 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -2,33 +2,124 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ##############################################################################
+# Topology description. p1 looped back to p2, p3 to p4 and so on.
+
+declare -A NETIFS=(
+    [p1]=veth0
+    [p2]=veth1
+    [p3]=veth2
+    [p4]=veth3
+    [p5]=veth4
+    [p6]=veth5
+    [p7]=veth6
+    [p8]=veth7
+    [p9]=veth8
+    [p10]=veth9
+)
+
+# Port that does not have a cable connected.
+: "${NETIF_NO_CABLE:=eth8}"
+
+##############################################################################
 # Defines
 
-# Can be overridden by the configuration file.
-PING=${PING:=ping}
-PING6=${PING6:=ping6}
-MZ=${MZ:=mausezahn}
-MZ_DELAY=${MZ_DELAY:=0}
-ARPING=${ARPING:=arping}
-TEAMD=${TEAMD:=teamd}
-WAIT_TIME=${WAIT_TIME:=5}
-PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
-PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no}
-NETIF_TYPE=${NETIF_TYPE:=veth}
-NETIF_CREATE=${NETIF_CREATE:=yes}
-MCD=${MCD:=smcrouted}
-MC_CLI=${MC_CLI:=smcroutectl}
-PING_COUNT=${PING_COUNT:=10}
-PING_TIMEOUT=${PING_TIMEOUT:=5}
-WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
-INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
-LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000}
-REQUIRE_JQ=${REQUIRE_JQ:=yes}
-REQUIRE_MZ=${REQUIRE_MZ:=yes}
-REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no}
-STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
-TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=}
-TROUTE6=${TROUTE6:=traceroute6}
+# Networking utilities.
+: "${PING:=ping}"
+: "${PING6:=ping6}"	# Some distros just use ping.
+: "${ARPING:=arping}"
+: "${TROUTE6:=traceroute6}"
+
+# Packet generator.
+: "${MZ:=mausezahn}"	# Some distributions use 'mz'.
+: "${MZ_DELAY:=0}"
+
+# Host configuration tools.
+: "${TEAMD:=teamd}"
+: "${MCD:=smcrouted}"
+: "${MC_CLI:=smcroutectl}"
+
+# Constants for netdevice bring-up:
+# Default time in seconds to wait for an interface to come up before giving up
+# and bailing out. Used during initial setup.
+: "${INTERFACE_TIMEOUT:=600}"
+# Like INTERFACE_TIMEOUT, but default for ad-hoc waiting in testing scripts.
+: "${WAIT_TIMEOUT:=20}"
+# Time to wait after interfaces participating in the test are all UP.
+: "${WAIT_TIME:=5}"
+
+# Whether to pause on, respectively, after a failure and before cleanup.
+: "${PAUSE_ON_FAIL:=no}"
+: "${PAUSE_ON_CLEANUP:=no}"
+
+# Whether to create virtual interfaces, and what netdevice type they should be.
+: "${NETIF_CREATE:=yes}"
+: "${NETIF_TYPE:=veth}"
+
+# Constants for ping tests:
+# How many packets should be sent.
+: "${PING_COUNT:=10}"
+# Timeout (in seconds) before ping exits regardless of how many packets have
+# been sent or received
+: "${PING_TIMEOUT:=5}"
+
+# Minimum ageing_time (in centiseconds) supported by hardware
+: "${LOW_AGEING_TIME:=1000}"
+
+# Whether to check for availability of certain tools.
+: "${REQUIRE_JQ:=yes}"
+: "${REQUIRE_MZ:=yes}"
+: "${REQUIRE_MTOOLS:=no}"
+
+# Whether to override MAC addresses on interfaces participating in the test.
+: "${STABLE_MAC_ADDRS:=no}"
+
+# Flags for tcpdump
+: "${TCPDUMP_EXTRA_FLAGS:=}"
+
+# Flags for TC filters.
+: "${TC_FLAG:=skip_hw}"
+
+# Whether the machine is "slow" -- i.e. might be incapable of running tests
+# involving heavy traffic. This might be the case on a debug kernel, a VM, or
+# e.g. a low-power board.
+: "${KSFT_MACHINE_SLOW:=no}"
+
+##############################################################################
+# Find netifs by test-specified driver name
+
+driver_name_get()
+{
+	local dev=$1; shift
+	local driver_path="/sys/class/net/$dev/device/driver"
+
+	if [[ -L $driver_path ]]; then
+		basename `realpath $driver_path`
+	fi
+}
+
+netif_find_driver()
+{
+	local ifnames=`ip -j link show | jq -r ".[].ifname"`
+	local count=0
+
+	for ifname in $ifnames
+	do
+		local driver_name=`driver_name_get $ifname`
+		if [[ ! -z $driver_name && $driver_name == $NETIF_FIND_DRIVER ]]; then
+			count=$((count + 1))
+			NETIFS[p$count]="$ifname"
+		fi
+	done
+}
+
+# Whether to find netdevice according to the driver speficied by the importer
+: "${NETIF_FIND_DRIVER:=}"
+
+if [[ $NETIF_FIND_DRIVER ]]; then
+	unset NETIFS
+	declare -A NETIFS
+	netif_find_driver
+fi
 
 net_forwarding_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
 
@@ -41,27 +132,9 @@ source "$net_forwarding_dir/../lib.sh"
 # timeout in seconds
 slowwait()
 {
-	local timeout=$1; shift
-
-	local start_time="$(date -u +%s)"
-	while true
-	do
-		local out
-		out=$("$@")
-		local ret=$?
-		if ((!ret)); then
-			echo -n "$out"
-			return 0
-		fi
-
-		local current_time="$(date -u +%s)"
-		if ((current_time - start_time > timeout)); then
-			echo -n "$out"
-			return 1
-		fi
+	local timeout_sec=$1; shift
 
-		sleep 0.1
-	done
+	loopy_wait "sleep 0.1" "$((timeout_sec * 1000))" "$@"
 }
 
 ##############################################################################
@@ -205,22 +278,23 @@ check_port_mab_support()
 	fi
 }
 
-skip_on_veth()
+if [[ "$(id -u)" -ne 0 ]]; then
+	echo "SKIP: need root privileges"
+	exit $ksft_skip
+fi
+
+check_driver()
 {
-	local kind=$(ip -j -d link show dev ${NETIFS[p1]} |
-		jq -r '.[].linkinfo.info_kind')
+	local dev=$1; shift
+	local expected=$1; shift
+	local driver_name=`driver_name_get $dev`
 
-	if [[ $kind == veth ]]; then
-		echo "SKIP: Test cannot be run with veth pairs"
+	if [[ $driver_name != $expected ]]; then
+		echo "SKIP: expected driver $expected for $dev, got $driver_name instead"
 		exit $ksft_skip
 	fi
 }
 
-if [[ "$(id -u)" -ne 0 ]]; then
-	echo "SKIP: need root privileges"
-	exit $ksft_skip
-fi
-
 if [[ "$CHECK_TC" = "yes" ]]; then
 	check_tc_version
 fi
@@ -235,6 +309,21 @@ require_command()
 	fi
 }
 
+# IPv6 support was added in v3.0
+check_mtools_version()
+{
+	local version="$(msend -v)"
+	local major
+
+	version=${version##msend version }
+	major=$(echo $version | cut -d. -f1)
+
+	if [ $major -lt 3 ]; then
+		echo "SKIP: expected mtools version 3.0, got $version"
+		exit $ksft_skip
+	fi
+}
+
 if [[ "$REQUIRE_JQ" = "yes" ]]; then
 	require_command jq
 fi
@@ -242,15 +331,10 @@ if [[ "$REQUIRE_MZ" = "yes" ]]; then
 	require_command $MZ
 fi
 if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then
-	# https://github.com/vladimiroltean/mtools/
-	# patched for IPv6 support
+	# https://github.com/troglobit/mtools
 	require_command msend
 	require_command mreceive
-fi
-
-if [[ ! -v NUM_NETIFS ]]; then
-	echo "SKIP: importer does not define \"NUM_NETIFS\""
-	exit $ksft_skip
+	check_mtools_version
 fi
 
 ##############################################################################
@@ -271,6 +355,23 @@ done
 ##############################################################################
 # Network interfaces configuration
 
+if [[ ! -v NUM_NETIFS ]]; then
+	echo "SKIP: importer does not define \"NUM_NETIFS\""
+	exit $ksft_skip
+fi
+
+if (( NUM_NETIFS > ${#NETIFS[@]} )); then
+	echo "SKIP: Importer requires $NUM_NETIFS NETIFS, but only ${#NETIFS[@]} are defined (${NETIFS[@]})"
+	exit $ksft_skip
+fi
+
+for i in $(seq ${#NETIFS[@]}); do
+	if [[ ! ${NETIFS[p$i]} ]]; then
+		echo "SKIP: NETIFS[p$i] not given"
+		exit $ksft_skip
+	fi
+done
+
 create_netif_veth()
 {
 	local i
@@ -358,14 +459,31 @@ EXIT_STATUS=0
 # Per-test return value. Clear at the beginning of each test.
 RET=0
 
+ret_set_ksft_status()
+{
+	local ksft_status=$1; shift
+	local msg=$1; shift
+
+	RET=$(ksft_status_merge $RET $ksft_status)
+	if (( $? )); then
+		retmsg=$msg
+	fi
+}
+
+# Whether FAILs should be interpreted as XFAILs. Internal.
+FAIL_TO_XFAIL=
+
 check_err()
 {
 	local err=$1
 	local msg=$2
 
-	if [[ $RET -eq 0 && $err -ne 0 ]]; then
-		RET=$err
-		retmsg=$msg
+	if ((err)); then
+		if [[ $FAIL_TO_XFAIL = yes ]]; then
+			ret_set_ksft_status $ksft_xfail "$msg"
+		else
+			ret_set_ksft_status $ksft_fail "$msg"
+		fi
 	fi
 }
 
@@ -374,10 +492,7 @@ check_fail()
 	local err=$1
 	local msg=$2
 
-	if [[ $RET -eq 0 && $err -eq 0 ]]; then
-		RET=1
-		retmsg=$msg
-	fi
+	check_err $((!err)) "$msg"
 }
 
 check_err_fail()
@@ -393,6 +508,85 @@ check_err_fail()
 	fi
 }
 
+xfail_on_slow()
+{
+	if [[ $KSFT_MACHINE_SLOW = yes ]]; then
+		FAIL_TO_XFAIL=yes "$@"
+	else
+		"$@"
+	fi
+}
+
+xfail_on_veth()
+{
+	local dev=$1; shift
+	local kind
+
+	kind=$(ip -j -d link show dev $dev |
+			jq -r '.[].linkinfo.info_kind')
+	if [[ $kind = veth ]]; then
+		FAIL_TO_XFAIL=yes "$@"
+	else
+		"$@"
+	fi
+}
+
+log_test_result()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+	local result=$1; shift
+	local retmsg=$1; shift
+
+	printf "TEST: %-60s  [%s]\n" "$test_name $opt_str" "$result"
+	if [[ $retmsg ]]; then
+		printf "\t%s\n" "$retmsg"
+	fi
+}
+
+pause_on_fail()
+{
+	if [[ $PAUSE_ON_FAIL == yes ]]; then
+		echo "Hit enter to continue, 'q' to quit"
+		read a
+		[[ $a == q ]] && exit 1
+	fi
+}
+
+handle_test_result_pass()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" " OK "
+}
+
+handle_test_result_fail()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" FAIL "$retmsg"
+	pause_on_fail
+}
+
+handle_test_result_xfail()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" XFAIL "$retmsg"
+	pause_on_fail
+}
+
+handle_test_result_skip()
+{
+	local test_name=$1; shift
+	local opt_str=$1; shift
+
+	log_test_result "$test_name" "$opt_str" SKIP "$retmsg"
+}
+
 log_test()
 {
 	local test_name=$1
@@ -402,31 +596,28 @@ log_test()
 		opt_str="($opt_str)"
 	fi
 
-	if [[ $RET -ne 0 ]]; then
-		EXIT_STATUS=1
-		printf "TEST: %-60s  [FAIL]\n" "$test_name $opt_str"
-		if [[ ! -z "$retmsg" ]]; then
-			printf "\t%s\n" "$retmsg"
-		fi
-		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
-			echo "Hit enter to continue, 'q' to quit"
-			read a
-			[ "$a" = "q" ] && exit 1
-		fi
-		return 1
+	if ((RET == ksft_pass)); then
+		handle_test_result_pass "$test_name" "$opt_str"
+	elif ((RET == ksft_xfail)); then
+		handle_test_result_xfail "$test_name" "$opt_str"
+	elif ((RET == ksft_skip)); then
+		handle_test_result_skip "$test_name" "$opt_str"
+	else
+		handle_test_result_fail "$test_name" "$opt_str"
 	fi
 
-	printf "TEST: %-60s  [ OK ]\n" "$test_name $opt_str"
-	return 0
+	EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET)
+	return $RET
 }
 
 log_test_skip()
 {
-	local test_name=$1
-	local opt_str=$2
+	RET=$ksft_skip retmsg= log_test "$@"
+}
 
-	printf "TEST: %-60s  [SKIP]\n" "$test_name $opt_str"
-	return 0
+log_test_xfail()
+{
+	RET=$ksft_xfail retmsg= log_test "$@"
 }
 
 log_info()
@@ -562,6 +753,19 @@ setup_wait()
 	sleep $WAIT_TIME
 }
 
+wait_for_dev()
+{
+        local dev=$1; shift
+        local timeout=${1:-$WAIT_TIMEOUT}; shift
+
+        slowwait $timeout ip link show dev $dev &> /dev/null
+        if (( $? )); then
+                check_err 1
+                log_test wait_for_dev "Interface $dev did not appear."
+                exit $EXIT_STATUS
+        fi
+}
+
 cmd_jq()
 {
 	local cmd=$1
@@ -2011,6 +2215,8 @@ bail_on_lldpad()
 {
 	local reason1="$1"; shift
 	local reason2="$1"; shift
+	local caller=${FUNCNAME[1]}
+	local src=${BASH_SOURCE[1]}
 
 	if systemctl is-active --quiet lldpad; then
 
@@ -2031,7 +2237,8 @@ bail_on_lldpad()
 				an environment variable ALLOW_LLDPAD to a
 				non-empty string.
 			EOF
-			exit 1
+			log_test_skip $src:$caller
+			exit $EXIT_STATUS
 		else
 			return
 		fi
diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
new file mode 100755
index 000000000000..ff2accccaf4d
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
@@ -0,0 +1,208 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This tests the operation of lib.sh itself.
+
+ALL_TESTS="
+	test_ret
+	test_exit_status
+"
+NUM_NETIFS=0
+source lib.sh
+
+# Simulated checks.
+
+do_test()
+{
+	local msg=$1; shift
+
+	"$@"
+	check_err $? "$msg"
+}
+
+tpass()
+{
+	do_test "tpass" true
+}
+
+tfail()
+{
+	do_test "tfail" false
+}
+
+txfail()
+{
+	FAIL_TO_XFAIL=yes do_test "txfail" false
+}
+
+# Simulated tests.
+
+pass()
+{
+	RET=0
+	do_test "true" true
+	log_test "true"
+}
+
+fail()
+{
+	RET=0
+	do_test "false" false
+	log_test "false"
+}
+
+xfail()
+{
+	RET=0
+	FAIL_TO_XFAIL=yes do_test "xfalse" false
+	log_test "xfalse"
+}
+
+skip()
+{
+	RET=0
+	log_test_skip "skip"
+}
+
+slow_xfail()
+{
+	RET=0
+	xfail_on_slow do_test "slow_false" false
+	log_test "slow_false"
+}
+
+# lib.sh tests.
+
+ret_tests_run()
+{
+	local t
+
+	RET=0
+	retmsg=
+	for t in "$@"; do
+		$t
+	done
+	echo "$retmsg"
+	return $RET
+}
+
+ret_subtest()
+{
+	local expect_ret=$1; shift
+	local expect_retmsg=$1; shift
+	local -a tests=( "$@" )
+
+	local status_names=(pass fail xfail xpass skip)
+	local ret
+	local out
+
+	RET=0
+
+	# Run this in a subshell, so that our environment is intact.
+	out=$(ret_tests_run "${tests[@]}")
+	ret=$?
+
+	(( ret == expect_ret ))
+	check_err $? "RET=$ret expected $expect_ret"
+
+	[[ $out == $expect_retmsg ]]
+	check_err $? "retmsg=$out expected $expect_retmsg"
+
+	log_test "RET $(echo ${tests[@]}) -> ${status_names[$ret]}"
+}
+
+test_ret()
+{
+	ret_subtest $ksft_pass ""
+
+	ret_subtest $ksft_pass "" tpass
+	ret_subtest $ksft_fail "tfail" tfail
+	ret_subtest $ksft_xfail "txfail" txfail
+
+	ret_subtest $ksft_pass "" tpass tpass
+	ret_subtest $ksft_fail "tfail" tpass tfail
+	ret_subtest $ksft_xfail "txfail" tpass txfail
+
+	ret_subtest $ksft_fail "tfail" tfail tpass
+	ret_subtest $ksft_xfail "txfail" txfail tpass
+
+	ret_subtest $ksft_fail "tfail" tfail tfail
+	ret_subtest $ksft_fail "tfail" tfail txfail
+
+	ret_subtest $ksft_fail "tfail" txfail tfail
+
+	ret_subtest $ksft_xfail "txfail" txfail txfail
+}
+
+exit_status_tests_run()
+{
+	EXIT_STATUS=0
+	tests_run > /dev/null
+	return $EXIT_STATUS
+}
+
+exit_status_subtest()
+{
+	local expect_exit_status=$1; shift
+	local tests=$1; shift
+	local what=$1; shift
+
+	local status_names=(pass fail xfail xpass skip)
+	local exit_status
+	local out
+
+	RET=0
+
+	# Run this in a subshell, so that our environment is intact.
+	out=$(TESTS="$tests" exit_status_tests_run)
+	exit_status=$?
+
+	(( exit_status == expect_exit_status ))
+	check_err $? "EXIT_STATUS=$exit_status, expected $expect_exit_status"
+
+	log_test "EXIT_STATUS $tests$what -> ${status_names[$exit_status]}"
+}
+
+test_exit_status()
+{
+	exit_status_subtest $ksft_pass ":"
+
+	exit_status_subtest $ksft_pass "pass"
+	exit_status_subtest $ksft_fail "fail"
+	exit_status_subtest $ksft_pass "xfail"
+	exit_status_subtest $ksft_skip "skip"
+
+	exit_status_subtest $ksft_pass "pass pass"
+	exit_status_subtest $ksft_fail "pass fail"
+	exit_status_subtest $ksft_pass "pass xfail"
+	exit_status_subtest $ksft_skip "pass skip"
+
+	exit_status_subtest $ksft_fail "fail pass"
+	exit_status_subtest $ksft_pass "xfail pass"
+	exit_status_subtest $ksft_skip "skip pass"
+
+	exit_status_subtest $ksft_fail "fail fail"
+	exit_status_subtest $ksft_fail "fail xfail"
+	exit_status_subtest $ksft_fail "fail skip"
+
+	exit_status_subtest $ksft_fail "xfail fail"
+	exit_status_subtest $ksft_fail "skip fail"
+
+	exit_status_subtest $ksft_pass "xfail xfail"
+	exit_status_subtest $ksft_skip "xfail skip"
+	exit_status_subtest $ksft_skip "skip xfail"
+
+	exit_status_subtest $ksft_skip "skip skip"
+
+	KSFT_MACHINE_SLOW=yes \
+		exit_status_subtest $ksft_pass "slow_xfail" ": slow"
+
+	KSFT_MACHINE_SLOW=no \
+		exit_status_subtest $ksft_fail "slow_xfail" ": fast"
+}
+
+trap pre_cleanup EXIT
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
index 3f0f5dc95542..2ba44247c60a 100755
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
@@ -1,6 +1,41 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# +-------------------------+
+# |  H1                     |
+# |               $h1 +     |
+# |      192.0.2.2/24 |     |
+# |  2001:db8:1::2/64 |     |
+# +-------------------|-----+
+#                     |
+# +-------------------|----------------------+
+# |                   |                   R1 |
+# |             $rp11 +                      |
+# |      192.0.2.1/24                        |
+# |  2001:db8:1::1/64                        |
+# |                                          |
+# |  + $rp12              + $rp13            |
+# |  | 169.254.2.12/24    | 169.254.3.13/24  |
+# |  | fe80:2::12/64      | fe80:3::13/64    |
+# +--|--------------------|------------------+
+#    |                    |
+# +--|--------------------|------------------+
+# |  + $rp22              + $rp23            |
+# |    169.254.2.22/24      169.254.3.23/24  |
+# |    fe80:2::22/64        fe80:3::23/64    |
+# |                                          |
+# |             $rp21 +                      |
+# |   198.51.100.1/24 |                      |
+# |  2001:db8:2::1/64 |                   R2 |
+# +-------------------|----------------------+
+#                     |
+# +-------------------|-----+
+# |                   |     |
+# |               $h2 +     |
+# |   198.51.100.2/24       |
+# |  2001:db8:2::2/64    H2 |
+# +-------------------------+
+
 ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
index 7e7d62161c34..2903294d8bca 100644
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_lib.sh
@@ -56,21 +56,12 @@ nh_stats_test_dispatch_swhw()
 	local group_id=$1; shift
 	local mz="$@"
 
-	local used
-
 	nh_stats_do_test "$what" "$nh1_id" "$nh2_id" "$group_id" \
 			 nh_stats_get "${mz[@]}"
 
-	used=$(ip -s -j -d nexthop show id $group_id |
-		   jq '.[].hw_stats.used')
-	kind=$(ip -j -d link show dev $rp11 |
-		   jq -r '.[].linkinfo.info_kind')
-	if [[ $used == true ]]; then
+	xfail_on_veth $rp11 \
 		nh_stats_do_test "HW $what" "$nh1_id" "$nh2_id" "$group_id" \
 				 nh_stats_get_hw "${mz[@]}"
-	elif [[ $kind == veth ]]; then
-		log_test_skip "HW stats not offloaded on veth topology"
-	fi
 }
 
 nh_stats_test_dispatch()
@@ -83,7 +74,6 @@ nh_stats_test_dispatch()
 	local mz="$@"
 
 	local enabled
-	local kind
 
 	if ! ip nexthop help 2>&1 | grep -q hw_stats; then
 		log_test_skip "NH stats test: ip doesn't support HW stats"
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
index 4b483d24ad00..cd9e346436fc 100755
--- a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh
@@ -1,6 +1,41 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# +-------------------------+
+# |  H1                     |
+# |               $h1 +     |
+# |      192.0.2.2/24 |     |
+# |  2001:db8:1::2/64 |     |
+# +-------------------|-----+
+#                     |
+# +-------------------|----------------------+
+# |                   |                   R1 |
+# |             $rp11 +                      |
+# |      192.0.2.1/24                        |
+# |  2001:db8:1::1/64                        |
+# |                                          |
+# |  + $rp12              + $rp13            |
+# |  | 169.254.2.12/24    | 169.254.3.13/24  |
+# |  | fe80:2::12/64      | fe80:3::13/64    |
+# +--|--------------------|------------------+
+#    |                    |
+# +--|--------------------|------------------+
+# |  + $rp22              + $rp23            |
+# |    169.254.2.22/24      169.254.3.23/24  |
+# |    fe80:2::22/64        fe80:3::23/64    |
+# |                                          |
+# |             $rp21 +                      |
+# |   198.51.100.1/24 |                      |
+# |  2001:db8:2::1/64 |                   R2 |
+# +-------------------|----------------------+
+#                     |
+# +-------------------|-----+
+# |                   |     |
+# |               $h2 +     |
+# |   198.51.100.2/24       |
+# |  2001:db8:2::2/64    H2 |
+# +-------------------------+
+
 ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
diff --git a/tools/testing/selftests/net/forwarding/router_nh.sh b/tools/testing/selftests/net/forwarding/router_nh.sh
index f3a53738bdcc..92904b01eae9 100755
--- a/tools/testing/selftests/net/forwarding/router_nh.sh
+++ b/tools/testing/selftests/net/forwarding/router_nh.sh
@@ -1,6 +1,20 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# +-------------------------+  +-------------------------+
+# | H1                      |  |                      H2 |
+# |               $h1 +     |  |               $h2 +     |
+# |      192.0.2.2/24 |     |  |   198.51.100.2/24 |     |
+# |  2001:db8:1::2/64 |     |  |  2001:db8:2::2/64 |     |
+# +-------------------|-----+  +-------------------|-----+
+#                     |                            |
+# +-------------------|----------------------------|-----+
+# | R1                |                            |     |
+# |              $rp1 +                       $rp2 +     |
+# |      192.0.2.1/24              198.51.100.1/24       |
+# |  2001:db8:1::1/64             2001:db8:2::1/64       |
+# +------------------------------------------------------+
+
 ALL_TESTS="
 	ping_ipv4
 	ping_ipv6
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
index cdf689e99458..f9d26a7911bb 100644
--- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
@@ -199,25 +199,28 @@ ets_set_dwrr_two_bands()
 ets_test_strict()
 {
 	ets_set_strict
-	ets_dwrr_test_01
-	ets_dwrr_test_12
+	xfail_on_slow ets_dwrr_test_01
+	xfail_on_slow ets_dwrr_test_12
 }
 
 ets_test_mixed()
 {
 	ets_set_mixed
-	ets_dwrr_test_01
-	ets_dwrr_test_12
+	xfail_on_slow ets_dwrr_test_01
+	xfail_on_slow ets_dwrr_test_12
 }
 
 ets_test_dwrr()
 {
 	ets_set_dwrr_uniform
-	ets_dwrr_test_012
+	xfail_on_slow ets_dwrr_test_012
+
 	ets_set_dwrr_varying
-	ets_dwrr_test_012
+	xfail_on_slow ets_dwrr_test_012
+
 	ets_change_quantum
-	ets_dwrr_test_012
+	xfail_on_slow ets_dwrr_test_012
+
 	ets_set_dwrr_two_bands
-	ets_dwrr_test_01
+	xfail_on_slow ets_dwrr_test_01
 }
diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
index 81f31179ac88..17f28644568e 100755
--- a/tools/testing/selftests/net/forwarding/sch_red.sh
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -451,35 +451,35 @@ uninstall_qdisc()
 ecn_test()
 {
 	install_qdisc ecn
-	do_ecn_test $BACKLOG
+	xfail_on_slow do_ecn_test $BACKLOG
 	uninstall_qdisc
 }
 
 ecn_nodrop_test()
 {
 	install_qdisc ecn nodrop
-	do_ecn_nodrop_test $BACKLOG
+	xfail_on_slow do_ecn_nodrop_test $BACKLOG
 	uninstall_qdisc
 }
 
 red_test()
 {
 	install_qdisc
-	do_red_test $BACKLOG
+	xfail_on_slow do_red_test $BACKLOG
 	uninstall_qdisc
 }
 
 red_qevent_test()
 {
 	install_qdisc qevent early_drop block 10
-	do_red_qevent_test $BACKLOG
+	xfail_on_slow do_red_qevent_test $BACKLOG
 	uninstall_qdisc
 }
 
 ecn_qevent_test()
 {
 	install_qdisc ecn qevent mark block 10
-	do_ecn_qevent_test $BACKLOG
+	xfail_on_slow do_ecn_qevent_test $BACKLOG
 	uninstall_qdisc
 }
 
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
index d1f26cb7cd73..9cd884d4a5de 100644
--- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
@@ -227,7 +227,7 @@ do_tbf_test()
 	local nr=$(rate $t2 $t3 10)
 	local nr_pct=$((100 * (nr - er) / er))
 	((-5 <= nr_pct && nr_pct <= 5))
-	check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%."
+	xfail_on_slow check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%."
 
 	log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit"
 }
diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh
index bce8bb8d2b6f..2e3326edfa9a 100644
--- a/tools/testing/selftests/net/forwarding/tc_common.sh
+++ b/tools/testing/selftests/net/forwarding/tc_common.sh
@@ -4,7 +4,7 @@
 CHECK_TC="yes"
 
 # Can be overridden by the configuration file. See lib.sh
-TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms
+: "${TC_HIT_TIMEOUT:=1000}" # ms
 
 tc_check_packets()
 {
diff --git a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
index 5a5dd9034819..79775b10b99f 100755
--- a/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
+++ b/tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
 
 ALL_TESTS="tunnel_key_nofrag_test"
 
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
index a0bb4524e1e9..a603f7b0a08f 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
@@ -354,7 +354,7 @@ __ping_ipv4()
 
 	# Send 100 packets and verify that at least 100 packets hit the rule,
 	# to overcome ARP noise.
-	PING_COUNT=100 PING_TIMEOUT=11 ping_do $dev $dst_ip
+	PING_COUNT=100 PING_TIMEOUT=20 ping_do $dev $dst_ip
 	check_err $? "Ping failed"
 
 	tc_check_at_least_x_packets "dev $rp1 egress" 101 10 100
@@ -410,7 +410,7 @@ __ping_ipv6()
 
 	# Send 100 packets and verify that at least 100 packets hit the rule,
 	# to overcome neighbor discovery noise.
-	PING_COUNT=100 PING_TIMEOUT=11 ping6_do $dev $dst_ip
+	PING_COUNT=100 PING_TIMEOUT=20 ping6_do $dev $dst_ip
 	check_err $? "Ping failed"
 
 	tc_check_at_least_x_packets "dev $rp1 egress" 101 100
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh
index d880df89bc8b..e83fde79f40d 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_ipv6.sh
@@ -457,7 +457,7 @@ __ping_ipv4()
 
 	# Send 100 packets and verify that at least 100 packets hit the rule,
 	# to overcome ARP noise.
-	PING_COUNT=100 PING_TIMEOUT=11 ping_do $dev $dst_ip
+	PING_COUNT=100 PING_TIMEOUT=20 ping_do $dev $dst_ip
 	check_err $? "Ping failed"
 
 	tc_check_at_least_x_packets "dev $rp1 egress" 101 10 100
@@ -522,7 +522,7 @@ __ping_ipv6()
 
 	# Send 100 packets and verify that at least 100 packets hit the rule,
 	# to overcome neighbor discovery noise.
-	PING_COUNT=100 PING_TIMEOUT=11 ping6_do $dev $dst_ip
+	PING_COUNT=100 PING_TIMEOUT=20 ping6_do $dev $dst_ip
 	check_err $? "Ping failed"
 
 	tc_check_at_least_x_packets "dev $rp1 egress" 101 100
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 353e1e867fbb..b2184847e388 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -93,6 +93,7 @@ static bool tx_socket = true;
 static int tcp_offset = -1;
 static int total_hdr_len = -1;
 static int ethhdr_proto = -1;
+static const int num_flush_id_cases = 6;
 
 static void vlog(const char *fmt, ...)
 {
@@ -119,6 +120,9 @@ static void setup_sock_filter(int fd)
 		next_off = offsetof(struct ipv6hdr, nexthdr);
 	ipproto_off = ETH_HLEN + next_off;
 
+	/* Overridden later if exthdrs are used: */
+	opt_ipproto_off = ipproto_off;
+
 	if (strcmp(testname, "ip") == 0) {
 		if (proto == PF_INET)
 			optlen = sizeof(struct ip_timestamp);
@@ -617,6 +621,113 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext
 	iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
 }
 
+static void fix_ip4_checksum(struct iphdr *iph)
+{
+	iph->check = 0;
+	iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
+}
+
+static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
+{
+	static char buf1[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char buf2[MAX_HDR_LEN + PAYLOAD_LEN];
+	static char buf3[MAX_HDR_LEN + PAYLOAD_LEN];
+	bool send_three = false;
+	struct iphdr *iph1;
+	struct iphdr *iph2;
+	struct iphdr *iph3;
+
+	iph1 = (struct iphdr *)(buf1 + ETH_HLEN);
+	iph2 = (struct iphdr *)(buf2 + ETH_HLEN);
+	iph3 = (struct iphdr *)(buf3 + ETH_HLEN);
+
+	create_packet(buf1, 0, 0, PAYLOAD_LEN, 0);
+	create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+	create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
+
+	switch (tcase) {
+	case 0: /* DF=1, Incrementing - should coalesce */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(9);
+		break;
+
+	case 1: /* DF=1, Fixed - should coalesce */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(8);
+		break;
+
+	case 2: /* DF=0, Incrementing - should coalesce */
+		iph1->frag_off &= ~htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off &= ~htons(IP_DF);
+		iph2->id = htons(9);
+		break;
+
+	case 3: /* DF=0, Fixed - should not coalesce */
+		iph1->frag_off &= ~htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off &= ~htons(IP_DF);
+		iph2->id = htons(8);
+		break;
+
+	case 4: /* DF=1, two packets incrementing, and one fixed - should
+		 * coalesce only the first two packets
+		 */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(9);
+
+		iph3->frag_off |= htons(IP_DF);
+		iph3->id = htons(9);
+		send_three = true;
+		break;
+
+	case 5: /* DF=1, two packets fixed, and one incrementing - should
+		 * coalesce only the first two packets
+		 */
+		iph1->frag_off |= htons(IP_DF);
+		iph1->id = htons(8);
+
+		iph2->frag_off |= htons(IP_DF);
+		iph2->id = htons(8);
+
+		iph3->frag_off |= htons(IP_DF);
+		iph3->id = htons(9);
+		send_three = true;
+		break;
+	}
+
+	fix_ip4_checksum(iph1);
+	fix_ip4_checksum(iph2);
+	write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr);
+	write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr);
+
+	if (send_three) {
+		fix_ip4_checksum(iph3);
+		write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr);
+	}
+}
+
+static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt)
+{
+	for (int i = 0; i < num_flush_id_cases; i++) {
+		sleep(1);
+		send_flush_id_case(fd, daddr, i);
+		sleep(1);
+		write_packet(fd, fin_pkt, total_hdr_len, daddr);
+	}
+}
+
 static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2)
 {
 	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
@@ -935,6 +1046,8 @@ static void gro_sender(void)
 			send_fragment4(txfd, &daddr);
 			sleep(1);
 			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+			test_flush_id(txfd, &daddr, fin_pkt);
 		} else if (proto == PF_INET6) {
 			sleep(1);
 			send_fragment6(txfd, &daddr);
@@ -1061,6 +1174,34 @@ static void gro_receiver(void)
 
 			printf("fragmented ip4 doesn't coalesce: ");
 			check_recv_pkts(rxfd, correct_payload, 2);
+
+			/* is_atomic checks */
+			printf("DF=1, Incrementing - should coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 1);
+
+			printf("DF=1, Fixed - should coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 1);
+
+			printf("DF=0, Incrementing - should coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			check_recv_pkts(rxfd, correct_payload, 1);
+
+			printf("DF=0, Fixed - should not coalesce: ");
+			correct_payload[0] = PAYLOAD_LEN;
+			correct_payload[1] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 2);
+
+			printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			correct_payload[1] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 2);
+
+			printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: ");
+			correct_payload[0] = PAYLOAD_LEN * 2;
+			correct_payload[1] = PAYLOAD_LEN;
+			check_recv_pkts(rxfd, correct_payload, 2);
 		} else if (proto == PF_INET6) {
 			/* GRO doesn't check for ipv6 hop limit when flushing.
 			 * Hence no corresponding test to the ipv4 case.
diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile
index 92c1d9d080cd..884cd2cc0681 100644
--- a/tools/testing/selftests/net/hsr/Makefile
+++ b/tools/testing/selftests/net/hsr/Makefile
@@ -2,6 +2,7 @@
 
 top_srcdir = ../../../../..
 
-TEST_PROGS := hsr_ping.sh
+TEST_PROGS := hsr_ping.sh hsr_redbox.sh
+TEST_FILES += hsr_common.sh
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/net/hsr/hsr_common.sh b/tools/testing/selftests/net/hsr/hsr_common.sh
new file mode 100644
index 000000000000..8e97b1f2e7e5
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/hsr_common.sh
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-2.0
+# Common code for HSR testing scripts
+
+source ../lib.sh
+ret=0
+ksft_skip=4
+
+# $1: IP address
+is_v6()
+{
+	[ -z "${1##*:*}" ]
+}
+
+do_ping()
+{
+	local netns="$1"
+	local connect_addr="$2"
+	local ping_args="-q -c 2"
+
+	if is_v6 "${connect_addr}"; then
+		$ipv6 || return 0
+		ping_args="${ping_args} -6"
+	fi
+
+	ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null
+	if [ $? -ne 0 ] ; then
+		echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+do_ping_long()
+{
+	local netns="$1"
+	local connect_addr="$2"
+	local ping_args="-q -c 10"
+
+	if is_v6 "${connect_addr}"; then
+		$ipv6 || return 0
+		ping_args="${ping_args} -6"
+	fi
+
+	OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)"
+	if [ $? -ne 0 ] ; then
+		echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2
+		ret=1
+		return 1
+	fi
+
+	VAL="$(echo $OUT | cut -d' ' -f1-8)"
+	SED_VAL="$(echo ${VAL} | sed -r -e 's/([0-9]{2}).*([0-9]{2}).*[[:space:]]([0-9]+%).*/\1 transmitted \2 received \3 loss/')"
+	if [ "${SED_VAL}" != "10 transmitted 10 received 0% loss" ]
+	then
+		echo "$netns -> $connect_addr ping TEST [ FAIL ]"
+		echo "Expect to send and receive 10 packets and no duplicates."
+		echo "Full message: ${OUT}."
+		ret=1
+		return 1
+	fi
+
+	return 0
+}
+
+stop_if_error()
+{
+	local msg="$1"
+
+	if [ ${ret} -ne 0 ]; then
+		echo "FAIL: ${msg}" 1>&2
+		exit ${ret}
+	fi
+}
+
+check_prerequisites()
+{
+	ip -Version > /dev/null 2>&1
+	if [ $? -ne 0 ];then
+		echo "SKIP: Could not run test without ip tool"
+		exit $ksft_skip
+	fi
+}
diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh
index 1c6457e54625..790294c8af83 100755
--- a/tools/testing/selftests/net/hsr/hsr_ping.sh
+++ b/tools/testing/selftests/net/hsr/hsr_ping.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ret=0
-ksft_skip=4
 ipv6=true
 
+source ./hsr_common.sh
+
 optstring="h4"
 usage() {
 	echo "Usage: $0 [OPTION]"
@@ -27,88 +27,6 @@ while getopts "$optstring" option;do
 esac
 done
 
-sec=$(date +%s)
-rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
-ns1="ns1-$rndh"
-ns2="ns2-$rndh"
-ns3="ns3-$rndh"
-
-cleanup()
-{
-	local netns
-	for netns in "$ns1" "$ns2" "$ns3" ;do
-		ip netns del $netns
-	done
-}
-
-# $1: IP address
-is_v6()
-{
-	[ -z "${1##*:*}" ]
-}
-
-do_ping()
-{
-	local netns="$1"
-	local connect_addr="$2"
-	local ping_args="-q -c 2"
-
-	if is_v6 "${connect_addr}"; then
-		$ipv6 || return 0
-		ping_args="${ping_args} -6"
-	fi
-
-	ip netns exec ${netns} ping ${ping_args} $connect_addr >/dev/null
-	if [ $? -ne 0 ] ; then
-		echo "$netns -> $connect_addr connectivity [ FAIL ]" 1>&2
-		ret=1
-		return 1
-	fi
-
-	return 0
-}
-
-do_ping_long()
-{
-	local netns="$1"
-	local connect_addr="$2"
-	local ping_args="-q -c 10"
-
-	if is_v6 "${connect_addr}"; then
-		$ipv6 || return 0
-		ping_args="${ping_args} -6"
-	fi
-
-	OUT="$(LANG=C ip netns exec ${netns} ping ${ping_args} $connect_addr | grep received)"
-	if [ $? -ne 0 ] ; then
-		echo "$netns -> $connect_addr ping [ FAIL ]" 1>&2
-		ret=1
-		return 1
-	fi
-
-	VAL="$(echo $OUT | cut -d' ' -f1-8)"
-	if [ "$VAL" != "10 packets transmitted, 10 received, 0% packet loss," ]
-	then
-		echo "$netns -> $connect_addr ping TEST [ FAIL ]"
-		echo "Expect to send and receive 10 packets and no duplicates."
-		echo "Full message: ${OUT}."
-		ret=1
-		return 1
-	fi
-
-	return 0
-}
-
-stop_if_error()
-{
-	local msg="$1"
-
-	if [ ${ret} -ne 0 ]; then
-		echo "FAIL: ${msg}" 1>&2
-		exit ${ret}
-	fi
-}
-
 do_complete_ping_test()
 {
 	echo "INFO: Initial validation ping."
@@ -248,27 +166,13 @@ setup_hsr_interfaces()
 	ip -net "$ns3" link set hsr3 up
 }
 
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-trap cleanup EXIT
+check_prerequisites
+setup_ns ns1 ns2 ns3
 
-for i in "$ns1" "$ns2" "$ns3" ;do
-	ip netns add $i || exit $ksft_skip
-	ip -net $i link set lo up
-done
+trap cleanup_all_ns EXIT
 
 setup_hsr_interfaces 0
 do_complete_ping_test
-cleanup
-
-for i in "$ns1" "$ns2" "$ns3" ;do
-	ip netns add $i || exit $ksft_skip
-	ip -net $i link set lo up
-done
 
 setup_hsr_interfaces 1
 do_complete_ping_test
diff --git a/tools/testing/selftests/net/hsr/hsr_redbox.sh b/tools/testing/selftests/net/hsr/hsr_redbox.sh
new file mode 100755
index 000000000000..1f36785347c0
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/hsr_redbox.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ipv6=false
+
+source ./hsr_common.sh
+
+do_complete_ping_test()
+{
+	echo "INFO: Initial validation ping (HSR-SAN/RedBox)."
+	# Each node has to be able to reach each one.
+	do_ping "${ns1}" 100.64.0.2
+	do_ping "${ns2}" 100.64.0.1
+	# Ping between SANs (test bridge)
+	do_ping "${ns4}" 100.64.0.51
+	do_ping "${ns5}" 100.64.0.41
+	# Ping from SANs to hsr1 (via hsr2) (and opposite)
+	do_ping "${ns3}" 100.64.0.1
+	do_ping "${ns1}" 100.64.0.3
+	do_ping "${ns1}" 100.64.0.41
+	do_ping "${ns4}" 100.64.0.1
+	do_ping "${ns1}" 100.64.0.51
+	do_ping "${ns5}" 100.64.0.1
+	stop_if_error "Initial validation failed."
+
+	# Wait for MGNT HSR frames being received and nodes being
+	# merged.
+	sleep 5
+
+	echo "INFO: Longer ping test (HSR-SAN/RedBox)."
+	# Ping from SAN to hsr1 (via hsr2)
+	do_ping_long "${ns3}" 100.64.0.1
+	# Ping from hsr1 (via hsr2) to SANs (and opposite)
+	do_ping_long "${ns1}" 100.64.0.3
+	do_ping_long "${ns1}" 100.64.0.41
+	do_ping_long "${ns4}" 100.64.0.1
+	do_ping_long "${ns1}" 100.64.0.51
+	do_ping_long "${ns5}" 100.64.0.1
+	stop_if_error "Longer ping test failed."
+
+	echo "INFO: All good."
+}
+
+setup_hsr_interfaces()
+{
+	local HSRv="$1"
+
+	echo "INFO: preparing interfaces for HSRv${HSRv} (HSR-SAN/RedBox)."
+#
+# IPv4 addresses (100.64.X.Y/24), and [X.Y] is presented on below diagram:
+#
+#
+# |NS1                     |               |NS4                |
+# |       [0.1]            |               |                   |
+# |    /-- hsr1 --\        |               |    [0.41]         |
+# | ns1eth1     ns1eth2    |               |    ns4eth1 (SAN)  |
+# |------------------------|               |-------------------|
+#      |            |                                |
+#      |            |                                |
+#      |            |                                |
+# |------------------------|   |-------------------------------|
+# | ns2eth1     ns2eth2    |   |                  ns3eth2      |
+# |    \-- hsr2 --/        |   |                 /             |
+# |      [0.2] \           |   |                /              |  |------------|
+# |             ns2eth3    |---| ns3eth1 -- ns3br1 -- ns3eth3--|--| ns5eth1    |
+# |             (interlink)|   | [0.3]      [0.11]             |  | [0.51]     |
+# |NS2 (RedBOX)            |   |NS3 (BR)                       |  | NS5 (SAN)  |
+#
+#
+	# Check if iproute2 supports adding interlink port to hsrX device
+	ip link help hsr | grep -q INTERLINK
+	[ $? -ne 0 ] && { echo "iproute2: HSR interlink interface not supported!"; exit 0; }
+
+	# Create interfaces for name spaces
+	ip link add ns1eth1 netns "${ns1}" type veth peer name ns2eth1 netns "${ns2}"
+	ip link add ns1eth2 netns "${ns1}" type veth peer name ns2eth2 netns "${ns2}"
+	ip link add ns2eth3 netns "${ns2}" type veth peer name ns3eth1 netns "${ns3}"
+	ip link add ns3eth2 netns "${ns3}" type veth peer name ns4eth1 netns "${ns4}"
+	ip link add ns3eth3 netns "${ns3}" type veth peer name ns5eth1 netns "${ns5}"
+
+	sleep 1
+
+	ip -n "${ns1}" link set ns1eth1 up
+	ip -n "${ns1}" link set ns1eth2 up
+
+	ip -n "${ns2}" link set ns2eth1 up
+	ip -n "${ns2}" link set ns2eth2 up
+	ip -n "${ns2}" link set ns2eth3 up
+
+	ip -n "${ns3}" link add name ns3br1 type bridge
+	ip -n "${ns3}" link set ns3br1 up
+	ip -n "${ns3}" link set ns3eth1 master ns3br1 up
+	ip -n "${ns3}" link set ns3eth2 master ns3br1 up
+	ip -n "${ns3}" link set ns3eth3 master ns3br1 up
+
+	ip -n "${ns4}" link set ns4eth1 up
+	ip -n "${ns5}" link set ns5eth1 up
+
+	ip -net "${ns1}" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version ${HSRv} proto 0
+	ip -net "${ns2}" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 interlink ns2eth3 supervision 45 version ${HSRv} proto 0
+
+	ip -n "${ns1}" addr add 100.64.0.1/24 dev hsr1
+	ip -n "${ns2}" addr add 100.64.0.2/24 dev hsr2
+	ip -n "${ns3}" addr add 100.64.0.11/24 dev ns3br1
+	ip -n "${ns3}" addr add 100.64.0.3/24 dev ns3eth1
+	ip -n "${ns4}" addr add 100.64.0.41/24 dev ns4eth1
+	ip -n "${ns5}" addr add 100.64.0.51/24 dev ns5eth1
+
+	ip -n "${ns1}" link set hsr1 up
+	ip -n "${ns2}" link set hsr2 up
+}
+
+check_prerequisites
+setup_ns ns1 ns2 ns3 ns4 ns5
+
+trap cleanup_all_ns EXIT
+
+setup_hsr_interfaces 1
+do_complete_ping_test
+
+exit $ret
diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c
index 193b82745fd8..29451d2244b7 100644
--- a/tools/testing/selftests/net/ip_local_port_range.c
+++ b/tools/testing/selftests/net/ip_local_port_range.c
@@ -359,7 +359,7 @@ TEST_F(ip_local_port_range, late_bind)
 		struct sockaddr_in v4;
 		struct sockaddr_in6 v6;
 	} addr;
-	socklen_t addr_len;
+	socklen_t addr_len = 0;
 	const int one = 1;
 	int fd, err;
 	__u32 range;
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index f9fe182dfbd4..c868c0aec121 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -4,19 +4,64 @@
 ##############################################################################
 # Defines
 
-WAIT_TIMEOUT=${WAIT_TIMEOUT:=20}
+: "${WAIT_TIMEOUT:=20}"
+
 BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms
 
-# Kselftest framework requirement - SKIP code is 4.
+# Kselftest framework constants.
+ksft_pass=0
+ksft_fail=1
+ksft_xfail=2
 ksft_skip=4
+
 # namespace list created by setup_ns
 NS_LIST=""
 
 ##############################################################################
 # Helpers
-busywait()
+
+__ksft_status_merge()
 {
-	local timeout=$1; shift
+	local a=$1; shift
+	local b=$1; shift
+	local -A weights
+	local weight=0
+
+	for i in "$@"; do
+		weights[$i]=$((weight++))
+	done
+
+	if [[ ${weights[$a]} > ${weights[$b]} ]]; then
+		echo "$a"
+		return 0
+	else
+		echo "$b"
+		return 1
+	fi
+}
+
+ksft_status_merge()
+{
+	local a=$1; shift
+	local b=$1; shift
+
+	__ksft_status_merge "$a" "$b" \
+		$ksft_pass $ksft_xfail $ksft_skip $ksft_fail
+}
+
+ksft_exit_status_merge()
+{
+	local a=$1; shift
+	local b=$1; shift
+
+	__ksft_status_merge "$a" "$b" \
+		$ksft_xfail $ksft_pass $ksft_skip $ksft_fail
+}
+
+loopy_wait()
+{
+	local sleep_cmd=$1; shift
+	local timeout_ms=$1; shift
 
 	local start_time="$(date -u +%s%3N)"
 	while true
@@ -30,13 +75,22 @@ busywait()
 		fi
 
 		local current_time="$(date -u +%s%3N)"
-		if ((current_time - start_time > timeout)); then
+		if ((current_time - start_time > timeout_ms)); then
 			echo -n "$out"
 			return 1
 		fi
+
+		$sleep_cmd
 	done
 }
 
+busywait()
+{
+	local timeout_ms=$1; shift
+
+	loopy_wait : "$timeout_ms" "$@"
+}
+
 cleanup_ns()
 {
 	local ns=""
diff --git a/tools/testing/selftests/net/lib/.gitignore b/tools/testing/selftests/net/lib/.gitignore
new file mode 100644
index 000000000000..1ebc6187f421
--- /dev/null
+++ b/tools/testing/selftests/net/lib/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+csum
diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile
new file mode 100644
index 000000000000..82c3264b115e
--- /dev/null
+++ b/tools/testing/selftests/net/lib/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
+CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES)
+# Additional include paths needed by kselftest.h
+CFLAGS += -I../../
+
+TEST_FILES := ../../../../../Documentation/netlink/specs
+TEST_FILES += ../../../../net/ynl
+
+TEST_GEN_FILES += csum
+
+TEST_INCLUDES := $(wildcard py/*.py)
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/csum.c b/tools/testing/selftests/net/lib/csum.c
index 90eb06fefa59..b9f3fc3c3426 100644
--- a/tools/testing/selftests/net/csum.c
+++ b/tools/testing/selftests/net/lib/csum.c
@@ -682,7 +682,7 @@ static int recv_verify_packet_ipv6(void *nh, int len)
 }
 
 /* return whether auxdata includes TP_STATUS_CSUM_VALID */
-static bool recv_verify_packet_csum(struct msghdr *msg)
+static uint32_t recv_get_packet_csum_status(struct msghdr *msg)
 {
 	struct tpacket_auxdata *aux = NULL;
 	struct cmsghdr *cm;
@@ -706,7 +706,7 @@ static bool recv_verify_packet_csum(struct msghdr *msg)
 	if (!aux)
 		error(1, 0, "cmsg: no auxdata");
 
-	return aux->tp_status & TP_STATUS_CSUM_VALID;
+	return aux->tp_status;
 }
 
 static int recv_packet(int fd)
@@ -716,6 +716,7 @@ static int recv_packet(int fd)
 	char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 	struct pkt *buf = (void *)_buf;
 	struct msghdr msg = {0};
+	uint32_t tp_status;
 	struct iovec iov;
 	int len, ret;
 
@@ -737,6 +738,17 @@ static int recv_packet(int fd)
 		if (len == -1)
 			error(1, errno, "recv p");
 
+		tp_status = recv_get_packet_csum_status(&msg);
+
+		/* GRO might coalesce randomized packets. Such GSO packets are
+		 * then reinitialized for csum offload (CHECKSUM_PARTIAL), with
+		 * a pseudo csum. Do not try to validate these checksums.
+		 */
+		if (tp_status & TP_STATUS_CSUMNOTREADY) {
+			fprintf(stderr, "cmsg: GSO packet has partial csum: skip\n");
+			continue;
+		}
+
 		if (cfg_family == PF_INET6)
 			ret = recv_verify_packet_ipv6(buf, len);
 		else
@@ -753,7 +765,7 @@ static int recv_packet(int fd)
 		 * Do not fail if kernel does not validate a good csum:
 		 * Absence of validation does not imply invalid.
 		 */
-		if (recv_verify_packet_csum(&msg) && cfg_bad_csum) {
+		if (tp_status & TP_STATUS_CSUM_VALID && cfg_bad_csum) {
 			fprintf(stderr, "cmsg: expected bad csum, pf_packet returns valid\n");
 			bad_validations++;
 		}
diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py
new file mode 100644
index 000000000000..b6d498d125fe
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from .consts import KSRC
+from .ksft import *
+from .netns import NetNS
+from .nsim import *
+from .utils import *
+from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily
diff --git a/tools/testing/selftests/net/lib/py/consts.py b/tools/testing/selftests/net/lib/py/consts.py
new file mode 100644
index 000000000000..f518ce79d82c
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/consts.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+
+KSFT_DIR = (Path(__file__).parent / "../../..").resolve()
+KSRC = (Path(__file__).parent / "../../../../../..").resolve()
+
+KSFT_MAIN_NAME = Path(sys.argv[0]).with_suffix("").name
diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
new file mode 100644
index 000000000000..4769b4eb1ea1
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import builtins
+import inspect
+import sys
+import time
+import traceback
+from .consts import KSFT_MAIN_NAME
+
+KSFT_RESULT = None
+KSFT_RESULT_ALL = True
+
+
+class KsftFailEx(Exception):
+    pass
+
+
+class KsftSkipEx(Exception):
+    pass
+
+
+class KsftXfailEx(Exception):
+    pass
+
+
+def ksft_pr(*objs, **kwargs):
+    print("#", *objs, **kwargs)
+
+
+def _fail(*args):
+    global KSFT_RESULT
+    KSFT_RESULT = False
+
+    frame = inspect.stack()[2]
+    ksft_pr("At " + frame.filename + " line " + str(frame.lineno) + ":")
+    ksft_pr(*args)
+
+
+def ksft_eq(a, b, comment=""):
+    global KSFT_RESULT
+    if a != b:
+        _fail("Check failed", a, "!=", b, comment)
+
+
+def ksft_true(a, comment=""):
+    if not a:
+        _fail("Check failed", a, "does not eval to True", comment)
+
+
+def ksft_in(a, b, comment=""):
+    if a not in b:
+        _fail("Check failed", a, "not in", b, comment)
+
+
+def ksft_ge(a, b, comment=""):
+    if a < b:
+        _fail("Check failed", a, "<", b, comment)
+
+
+class ksft_raises:
+    def __init__(self, expected_type):
+        self.exception = None
+        self.expected_type = expected_type
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is None:
+            _fail(f"Expected exception {str(self.expected_type.__name__)}, none raised")
+        elif self.expected_type != exc_type:
+            _fail(f"Expected exception {str(self.expected_type.__name__)}, raised {str(exc_type.__name__)}")
+        self.exception = exc_val
+        # Suppress the exception if its the expected one
+        return self.expected_type == exc_type
+
+
+def ksft_busy_wait(cond, sleep=0.005, deadline=1, comment=""):
+    end = time.monotonic() + deadline
+    while True:
+        if cond():
+            return
+        if time.monotonic() > end:
+            _fail("Waiting for condition timed out", comment)
+            return
+        time.sleep(sleep)
+
+
+def ktap_result(ok, cnt=1, case="", comment=""):
+    global KSFT_RESULT_ALL
+    KSFT_RESULT_ALL = KSFT_RESULT_ALL and ok
+
+    res = ""
+    if not ok:
+        res += "not "
+    res += "ok "
+    res += str(cnt) + " "
+    res += KSFT_MAIN_NAME
+    if case:
+        res += "." + str(case.__name__)
+    if comment:
+        res += " # " + comment
+    print(res)
+
+
+def ksft_run(cases=None, globs=None, case_pfx=None, args=()):
+    cases = cases or []
+
+    if globs and case_pfx:
+        for key, value in globs.items():
+            if not callable(value):
+                continue
+            for prefix in case_pfx:
+                if key.startswith(prefix):
+                    cases.append(value)
+                    break
+
+    totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0}
+
+    print("KTAP version 1")
+    print("1.." + str(len(cases)))
+
+    global KSFT_RESULT
+    cnt = 0
+    for case in cases:
+        KSFT_RESULT = True
+        cnt += 1
+        try:
+            case(*args)
+        except KsftSkipEx as e:
+            ktap_result(True, cnt, case, comment="SKIP " + str(e))
+            totals['skip'] += 1
+            continue
+        except KsftXfailEx as e:
+            ktap_result(True, cnt, case, comment="XFAIL " + str(e))
+            totals['xfail'] += 1
+            continue
+        except Exception as e:
+            tb = traceback.format_exc()
+            for line in tb.strip().split('\n'):
+                ksft_pr("Exception|", line)
+            ktap_result(False, cnt, case)
+            totals['fail'] += 1
+            continue
+
+        ktap_result(KSFT_RESULT, cnt, case)
+        if KSFT_RESULT:
+            totals['pass'] += 1
+        else:
+            totals['fail'] += 1
+
+    print(
+        f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0"
+    )
+
+
+def ksft_exit():
+    global KSFT_RESULT_ALL
+    sys.exit(0 if KSFT_RESULT_ALL else 1)
diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py
new file mode 100644
index 000000000000..ecff85f9074f
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/netns.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from .utils import ip
+import random
+import string
+
+
+class NetNS:
+    def __init__(self, name=None):
+        if name:
+            self.name = name
+        else:
+            self.name = ''.join(random.choice(string.ascii_lowercase) for _ in range(8))
+        ip('netns add ' + self.name)
+
+    def __del__(self):
+        if self.name:
+            ip('netns del ' + self.name)
+            self.name = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        self.__del__()
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return f"NetNS({self.name})"
diff --git a/tools/testing/selftests/net/lib/py/nsim.py b/tools/testing/selftests/net/lib/py/nsim.py
new file mode 100644
index 000000000000..f571a8b3139b
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/nsim.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import json
+import os
+import random
+import re
+import time
+from .utils import cmd, ip
+
+
+class NetdevSim:
+    """
+    Class for netdevsim netdevice and its attributes.
+    """
+
+    def __init__(self, nsimdev, port_index, ifname, ns=None):
+        # In case udev renamed the netdev to according to new schema,
+        # check if the name matches the port_index.
+        nsimnamere = re.compile(r"eni\d+np(\d+)")
+        match = nsimnamere.match(ifname)
+        if match and int(match.groups()[0]) != port_index + 1:
+            raise Exception("netdevice name mismatches the expected one")
+
+        self.ifname = ifname
+        self.nsimdev = nsimdev
+        self.port_index = port_index
+        self.ns = ns
+        self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index)
+        ret = ip("-j link show dev %s" % ifname, ns=ns)
+        self.dev = json.loads(ret.stdout)[0]
+        self.ifindex = self.dev["ifindex"]
+
+    def dfs_write(self, path, val):
+        self.nsimdev.dfs_write(f'ports/{self.port_index}/' + path, val)
+
+
+class NetdevSimDev:
+    """
+    Class for netdevsim bus device and its attributes.
+    """
+    @staticmethod
+    def ctrl_write(path, val):
+        fullpath = os.path.join("/sys/bus/netdevsim/", path)
+        with open(fullpath, "w") as f:
+            f.write(val)
+
+    def dfs_write(self, path, val):
+        fullpath = os.path.join(f"/sys/kernel/debug/netdevsim/netdevsim{self.addr}/", path)
+        with open(fullpath, "w") as f:
+            f.write(val)
+
+    def __init__(self, port_count=1, queue_count=1, ns=None):
+        # nsim will spawn in init_net, we'll set to actual ns once we switch it there
+        self.ns = None
+
+        if not os.path.exists("/sys/bus/netdevsim"):
+            cmd("modprobe netdevsim")
+
+        addr = random.randrange(1 << 15)
+        while True:
+            try:
+                self.ctrl_write("new_device", "%u %u %u" % (addr, port_count, queue_count))
+            except OSError as e:
+                if e.errno == errno.ENOSPC:
+                    addr = random.randrange(1 << 15)
+                    continue
+                raise e
+            break
+        self.addr = addr
+
+        # As probe of netdevsim device might happen from a workqueue,
+        # so wait here until all netdevs appear.
+        self.wait_for_netdevs(port_count)
+
+        if ns:
+            cmd(f"devlink dev reload netdevsim/netdevsim{addr} netns {ns.name}")
+            self.ns = ns
+
+        cmd("udevadm settle", ns=self.ns)
+        ifnames = self.get_ifnames()
+
+        self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr
+
+        self.nsims = []
+        for port_index in range(port_count):
+            self.nsims.append(self._make_port(port_index, ifnames[port_index]))
+
+        self.removed = False
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        """
+        __exit__ gets called at the end of a "with" block.
+        """
+        self.remove()
+
+    def _make_port(self, port_index, ifname):
+        return NetdevSim(self, port_index, ifname, self.ns)
+
+    def get_ifnames(self):
+        ifnames = []
+        listdir = cmd(f"ls /sys/bus/netdevsim/devices/netdevsim{self.addr}/net/",
+                      ns=self.ns).stdout.split()
+        for ifname in listdir:
+            ifnames.append(ifname)
+        ifnames.sort()
+        return ifnames
+
+    def wait_for_netdevs(self, port_count):
+        timeout = 5
+        timeout_start = time.time()
+
+        while True:
+            try:
+                ifnames = self.get_ifnames()
+            except FileNotFoundError as e:
+                ifnames = []
+            if len(ifnames) == port_count:
+                break
+            if time.time() < timeout_start + timeout:
+                continue
+            raise Exception("netdevices did not appear within timeout")
+
+    def remove(self):
+        if not self.removed:
+            self.ctrl_write("del_device", "%u" % (self.addr, ))
+            self.removed = True
+
+    def remove_nsim(self, nsim):
+        self.nsims.remove(nsim)
+        self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ),
+                        "%u" % (nsim.port_index, ))
diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
new file mode 100644
index 000000000000..0540ea24921d
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/utils.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import json as _json
+import random
+import re
+import subprocess
+import time
+
+
+class cmd:
+    def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None, timeout=5):
+        if ns:
+            comm = f'ip netns exec {ns} ' + comm
+
+        self.stdout = None
+        self.stderr = None
+        self.ret = None
+
+        self.comm = comm
+        if host:
+            self.proc = host.cmd(comm)
+        else:
+            self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE,
+                                         stderr=subprocess.PIPE)
+        if not background:
+            self.process(terminate=False, fail=fail, timeout=timeout)
+
+    def process(self, terminate=True, fail=None, timeout=5):
+        if fail is None:
+            fail = not terminate
+
+        if terminate:
+            self.proc.terminate()
+        stdout, stderr = self.proc.communicate(timeout)
+        self.stdout = stdout.decode("utf-8")
+        self.stderr = stderr.decode("utf-8")
+        self.proc.stdout.close()
+        self.proc.stderr.close()
+        self.ret = self.proc.returncode
+
+        if self.proc.returncode != 0 and fail:
+            if len(stderr) > 0 and stderr[-1] == "\n":
+                stderr = stderr[:-1]
+            raise Exception("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" %
+                            (self.proc.args, stdout, stderr))
+
+
+class bkg(cmd):
+    def __init__(self, comm, shell=True, fail=None, ns=None, host=None,
+                 exit_wait=False):
+        super().__init__(comm, background=True,
+                         shell=shell, fail=fail, ns=ns, host=host)
+        self.terminate = not exit_wait
+        self.check_fail = fail
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_tb):
+        return self.process(terminate=self.terminate, fail=self.check_fail)
+
+
+def tool(name, args, json=None, ns=None, host=None):
+    cmd_str = name + ' '
+    if json:
+        cmd_str += '--json '
+    cmd_str += args
+    cmd_obj = cmd(cmd_str, ns=ns, host=host)
+    if json:
+        return _json.loads(cmd_obj.stdout)
+    return cmd_obj
+
+
+def ip(args, json=None, ns=None, host=None):
+    if ns:
+        args = f'-netns {ns} ' + args
+    return tool('ip', args, json=json, host=host)
+
+
+def rand_port():
+    """
+    Get unprivileged port, for now just random, one day we may decide to check if used.
+    """
+    return random.randint(10000, 65535)
+
+
+def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5):
+    end = time.monotonic() + deadline
+
+    pattern = f":{port:04X} .* "
+    if proto == "tcp": # for tcp protocol additionally check the socket state
+        pattern += "0A"
+    pattern = re.compile(pattern)
+
+    while True:
+        data = cmd(f'cat /proc/net/{proto}*', ns=ns, host=host, shell=True).stdout
+        for row in data.split("\n"):
+            if pattern.search(row):
+                return
+        if time.monotonic() > end:
+            raise Exception("Waiting for port listen timed out")
+        time.sleep(sleep)
diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py
new file mode 100644
index 000000000000..1ace58370c06
--- /dev/null
+++ b/tools/testing/selftests/net/lib/py/ynl.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+from .consts import KSRC, KSFT_DIR
+from .ksft import ksft_pr, ktap_result
+
+# Resolve paths
+try:
+    if (KSFT_DIR / "kselftest-list.txt").exists():
+        # Running in "installed" selftests
+        tools_full_path = KSFT_DIR
+        SPEC_PATH = KSFT_DIR / "net/lib/specs"
+
+        sys.path.append(tools_full_path.as_posix())
+        from net.lib.ynl.lib import YnlFamily, NlError
+    else:
+        # Running in tree
+        tools_full_path = KSRC / "tools"
+        SPEC_PATH = KSRC / "Documentation/netlink/specs"
+
+        sys.path.append(tools_full_path.as_posix())
+        from net.ynl.lib import YnlFamily, NlError
+except ModuleNotFoundError as e:
+    ksft_pr("Failed importing `ynl` library from kernel sources")
+    ksft_pr(str(e))
+    ktap_result(True, comment="SKIP")
+    sys.exit(4)
+
+#
+# Wrapper classes, loading the right specs
+# Set schema='' to avoid jsonschema validation, it's slow
+#
+class EthtoolFamily(YnlFamily):
+    def __init__(self):
+        super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(),
+                         schema='')
+
+
+class RtnlFamily(YnlFamily):
+    def __init__(self):
+        super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(),
+                         schema='')
+
+
+class NetdevFamily(YnlFamily):
+    def __init__(self):
+        super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(),
+                         schema='')
diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
index bc97ab33a00e..776d43a6922d 100755
--- a/tools/testing/selftests/net/mptcp/diag.sh
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -200,6 +200,58 @@ chk_msk_cestab()
 		 "${expected}" "${msg}" ""
 }
 
+msk_info_get_value()
+{
+	local port="${1}"
+	local info="${2}"
+
+	ss -N "${ns}" -inHM dport "${port}" | \
+		mptcp_lib_get_info_value "${info}" "${info}"
+}
+
+chk_msk_info()
+{
+	local port="${1}"
+	local info="${2}"
+	local cnt="${3}"
+	local msg="....chk ${info}"
+	local delta_ms=250  # half what we waited before, just to be sure
+	local now
+
+	now=$(msk_info_get_value "${port}" "${info}")
+
+	mptcp_lib_print_title "${msg}"
+	if { [ -z "${cnt}" ] || [ -z "${now}" ]; } &&
+	   ! mptcp_lib_expect_all_features; then
+		mptcp_lib_pr_skip "Feature probably not supported"
+		mptcp_lib_result_skip "${msg}"
+	elif [ "$((cnt + delta_ms))" -lt "${now}" ]; then
+		mptcp_lib_pr_ok
+		mptcp_lib_result_pass "${msg}"
+	else
+		mptcp_lib_pr_fail "value of ${info} changed by $((now - cnt))ms," \
+				  "expected at least ${delta_ms}ms"
+		mptcp_lib_result_fail "${msg}"
+		ret=${KSFT_FAIL}
+	fi
+}
+
+chk_last_time_info()
+{
+	local port="${1}"
+	local data_sent data_recv ack_recv
+
+	data_sent=$(msk_info_get_value "${port}" "last_data_sent")
+	data_recv=$(msk_info_get_value "${port}" "last_data_recv")
+	ack_recv=$(msk_info_get_value "${port}" "last_ack_recv")
+
+	sleep 0.5  # wait to check after if the timestamps difference
+
+	chk_msk_info "${port}" "last_data_sent" "${data_sent}"
+	chk_msk_info "${port}" "last_data_recv" "${data_recv}"
+	chk_msk_info "${port}" "last_ack_recv" "${ack_recv}"
+}
+
 wait_connected()
 {
 	local listener_ns="${1}"
@@ -233,6 +285,7 @@ echo "b" | \
 				127.0.0.1 >/dev/null &
 wait_connected $ns 10000
 chk_msk_nr 2 "after MPC handshake "
+chk_last_time_info 10000
 chk_msk_remote_key_nr 2 "....chk remote_key"
 chk_msk_fallback_nr 0 "....chk no fallback"
 chk_msk_inuse 2
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 4c4248554826..b77fb7065bfb 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -147,7 +147,7 @@ cleanup()
 
 mptcp_lib_check_mptcp
 mptcp_lib_check_kallsyms
-mptcp_lib_check_tools ip
+mptcp_lib_check_tools ip tc
 
 sin=$(mktemp)
 sout=$(mktemp)
@@ -383,12 +383,14 @@ do_transfer()
 	local stat_cookierx_last
 	local stat_csum_err_s
 	local stat_csum_err_c
+	local stat_tcpfb_last_l
 	stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX")
 	stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX")
 	stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent")
 	stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv")
 	stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr")
 	stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr")
+	stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK")
 
 	timeout ${timeout_test} \
 		ip netns exec ${listener_ns} \
@@ -457,11 +459,13 @@ do_transfer()
 	local stat_cookietx_now
 	local stat_cookierx_now
 	local stat_ooo_now
+	local stat_tcpfb_now_l
 	stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX")
 	stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX")
 	stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent")
 	stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv")
 	stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue")
+	stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK")
 
 	expect_synrx=$((stat_synrx_last_l))
 	expect_ackrx=$((stat_ackrx_last_l))
@@ -508,6 +512,11 @@ do_transfer()
 		fi
 	fi
 
+	if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then
+		mptcp_lib_pr_fail "unexpected fallback to TCP"
+		rets=1
+	fi
+
 	if [ $cookies -eq 2 ];then
 		if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then
 			extra+=" WARN: CookieSent: did not advance"
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 5e9211e89825..fefa9173bdaa 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -31,7 +31,6 @@ timeout_poll=30
 timeout_test=$((timeout_poll * 2 + 1))
 capture=false
 checksum=false
-ip_mptcp=0
 check_invert=0
 validate_checksum=false
 init=0
@@ -125,8 +124,8 @@ init_shapers()
 {
 	local i
 	for i in $(seq 1 4); do
-		tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1
-		tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1
+		tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1ms
+		tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1ms
 	done
 }
 
@@ -142,7 +141,7 @@ init() {
 
 	mptcp_lib_check_mptcp
 	mptcp_lib_check_kallsyms
-	mptcp_lib_check_tools ip ss "${iptables}" "${ip6tables}"
+	mptcp_lib_check_tools ip tc ss "${iptables}" "${ip6tables}"
 
 	sin=$(mktemp)
 	sout=$(mktemp)
@@ -606,173 +605,65 @@ kill_events_pids()
 
 pm_nl_set_limits()
 {
-	local ns=$1
-	local addrs=$2
-	local subflows=$3
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows
-	else
-		ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows
-	fi
+	mptcp_lib_pm_nl_set_limits "${@}"
 }
 
 pm_nl_add_endpoint()
 {
-	local ns=$1
-	local addr=$2
-	local flags _flags
-	local port _port
-	local dev _dev
-	local id _id
-	local nr=2
-
-	local p
-	for p in "${@}"
-	do
-		if [ $p = "flags" ]; then
-			eval _flags=\$"$nr"
-			[ -n "$_flags" ]; flags="flags $_flags"
-		fi
-		if [ $p = "dev" ]; then
-			eval _dev=\$"$nr"
-			[ -n "$_dev" ]; dev="dev $_dev"
-		fi
-		if [ $p = "id" ]; then
-			eval _id=\$"$nr"
-			[ -n "$_id" ]; id="id $_id"
-		fi
-		if [ $p = "port" ]; then
-			eval _port=\$"$nr"
-			[ -n "$_port" ]; port="port $_port"
-		fi
-
-		nr=$((nr + 1))
-	done
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port
-	else
-		ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port
-	fi
+	mptcp_lib_pm_nl_add_endpoint "${@}"
 }
 
 pm_nl_del_endpoint()
 {
-	local ns=$1
-	local id=$2
-	local addr=$3
-
-	if [ $ip_mptcp -eq 1 ]; then
-		[ $id -ne 0 ] && addr=''
-		ip -n $ns mptcp endpoint delete id $id $addr
-	else
-		ip netns exec $ns ./pm_nl_ctl del $id $addr
-	fi
+	mptcp_lib_pm_nl_del_endpoint "${@}"
 }
 
 pm_nl_flush_endpoint()
 {
-	local ns=$1
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint flush
-	else
-		ip netns exec $ns ./pm_nl_ctl flush
-	fi
+	mptcp_lib_pm_nl_flush_endpoint "${@}"
 }
 
 pm_nl_show_endpoints()
 {
-	local ns=$1
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint show
-	else
-		ip netns exec $ns ./pm_nl_ctl dump
-	fi
+	mptcp_lib_pm_nl_show_endpoints "${@}"
 }
 
 pm_nl_change_endpoint()
 {
-	local ns=$1
-	local id=$2
-	local flags=$3
-
-	if [ $ip_mptcp -eq 1 ]; then
-		ip -n $ns mptcp endpoint change id $id ${flags//","/" "}
-	else
-		ip netns exec $ns ./pm_nl_ctl set id $id flags $flags
-	fi
+	mptcp_lib_pm_nl_change_endpoint "${@}"
 }
 
 pm_nl_check_endpoint()
 {
-	local line expected_line
 	local msg="$1"
 	local ns=$2
 	local addr=$3
-	local _flags=""
-	local flags
-	local _port
-	local port
-	local dev
-	local _id
-	local id
+	local flags dev id port
 
 	print_check "${msg}"
 
 	shift 3
 	while [ -n "$1" ]; do
-		if [ $1 = "flags" ]; then
-			_flags=$2
-			[ -n "$_flags" ]; flags="flags $_flags"
-			shift
-		elif [ $1 = "dev" ]; then
-			[ -n "$2" ]; dev="dev $1"
-			shift
-		elif [ $1 = "id" ]; then
-			_id=$2
-			[ -n "$_id" ]; id="id $_id"
+		case "${1}" in
+		"flags" | "dev" | "id" | "port")
+			eval "${1}"="${2}"
 			shift
-		elif [ $1 = "port" ]; then
-			_port=$2
-			[ -n "$_port" ]; port=" port $_port"
-			shift
-		fi
+			;;
+		*)
+			;;
+		esac
 
 		shift
 	done
 
-	if [ -z "$id" ]; then
+	if [ -z "${id}" ]; then
 		test_fail "bad test - missing endpoint id"
 		return
 	fi
 
-	if [ $ip_mptcp -eq 1 ]; then
-		# get line and trim trailing whitespace
-		line=$(ip -n $ns mptcp endpoint show $id)
-		line="${line% }"
-		# the dump order is: address id flags port dev
-		[ -n "$addr" ] && expected_line="$addr"
-		expected_line+=" $id"
-		[ -n "$_flags" ] && expected_line+=" ${_flags//","/" "}"
-		[ -n "$dev" ] && expected_line+=" $dev"
-		[ -n "$port" ] && expected_line+=" $port"
-	else
-		line=$(ip netns exec $ns ./pm_nl_ctl get $_id)
-		# the dump order is: id flags dev address port
-		expected_line="$id"
-		[ -n "$flags" ] && expected_line+=" $flags"
-		[ -n "$dev" ] && expected_line+=" $dev"
-		[ -n "$addr" ] && expected_line+=" $addr"
-		[ -n "$_port" ] && expected_line+=" $_port"
-	fi
-	if [ "$line" = "$expected_line" ]; then
-		print_ok
-	else
-		fail_test "expected '$expected_line' found '$line'"
-	fi
+	check_output "mptcp_lib_pm_nl_get_endpoint ${ns} ${id}" \
+		"$(mptcp_lib_pm_nl_format_endpoints \
+			"${id},${addr},${flags//","/" "},${dev},${port}")"
 }
 
 pm_nl_set_endpoint()
@@ -3212,7 +3103,7 @@ fail_tests()
 
 	# multiple subflows
 	if reset_with_fail "MP_FAIL MP_RST" 2; then
-		tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5
+		tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms
 		pm_nl_set_limits $ns1 0 1
 		pm_nl_set_limits $ns2 0 1
 		pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow
@@ -3610,6 +3501,8 @@ endpoint_tests()
 		local tests_pid=$!
 
 		wait_mpj $ns2
+		pm_nl_check_endpoint "creation" \
+			$ns2 10.0.2.2 id 2 flags subflow dev ns2eth2
 		chk_subflow_nr "before delete" 2
 		chk_mptcp_info subflows 1 subflows 1
 
@@ -3700,7 +3593,7 @@ while getopts "${all_tests_args}cCih" opt; do
 			checksum=true
 			;;
 		i)
-			ip_mptcp=1
+			mptcp_lib_set_ip_mptcp
 			;;
 		h)
 			usage
diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
index d529b4b37af8..ad2ebda5cb64 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
@@ -23,6 +23,7 @@ MPTCP_LIB_SUBTESTS=()
 MPTCP_LIB_SUBTESTS_DUPLICATED=0
 MPTCP_LIB_TEST_COUNTER=0
 MPTCP_LIB_TEST_FORMAT="%02u %-50s"
+MPTCP_LIB_IP_MPTCP=0
 
 # only if supported (or forced) and not disabled, see no-color.org
 if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } &&
@@ -384,6 +385,12 @@ mptcp_lib_check_tools() {
 				exit ${KSFT_SKIP}
 			fi
 			;;
+		"tc")
+			if ! tc -help &> /dev/null; then
+				mptcp_lib_pr_skip "Could not run test without tc tool"
+				exit ${KSFT_SKIP}
+			fi
+			;;
 		"ss")
 			if ! ss -h | grep -q MPTCP; then
 				mptcp_lib_pr_skip "ss tool does not support MPTCP"
@@ -505,3 +512,131 @@ mptcp_lib_verify_listener_events() {
 	mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}"
 	return "${rc}"
 }
+
+mptcp_lib_set_ip_mptcp() {
+	MPTCP_LIB_IP_MPTCP=1
+}
+
+mptcp_lib_is_ip_mptcp() {
+	[ "${MPTCP_LIB_IP_MPTCP}" = "1" ]
+}
+
+# format: <id>,<ip>,<flags>,<dev>
+mptcp_lib_pm_nl_format_endpoints() {
+	local entry id ip flags dev port
+
+	for entry in "${@}"; do
+		IFS=, read -r id ip flags dev port <<< "${entry}"
+		if mptcp_lib_is_ip_mptcp; then
+			echo -n "${ip}"
+			[ -n "${port}" ] && echo -n " port ${port}"
+			echo -n " id ${id}"
+			[ -n "${flags}" ] && echo -n " ${flags}"
+			[ -n "${dev}" ] && echo -n " dev ${dev}"
+			echo " " # always a space at the end
+		else
+			echo -n "id ${id}"
+			echo -n " flags ${flags//" "/","}"
+			[ -n "${dev}" ] && echo -n " dev ${dev}"
+			echo -n " ${ip}"
+			[ -n "${port}" ] && echo -n " ${port}"
+			echo ""
+		fi
+	done
+}
+
+mptcp_lib_pm_nl_get_endpoint() {
+	local ns=${1}
+	local id=${2}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp endpoint show id "${id}"
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl get "${id}"
+	fi
+}
+
+mptcp_lib_pm_nl_set_limits() {
+	local ns=${1}
+	local addrs=${2}
+	local subflows=${3}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp limits set add_addr_accepted "${addrs}" subflows "${subflows}"
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl limits "${addrs}" "${subflows}"
+	fi
+}
+
+mptcp_lib_pm_nl_add_endpoint() {
+	local ns=${1}
+	local addr=${2}
+	local flags dev id port
+	local nr=2
+
+	local p
+	for p in "${@}"; do
+		case "${p}" in
+		"flags" | "dev" | "id" | "port")
+			eval "${p}"=\$"${nr}"
+			;;
+		esac
+
+		nr=$((nr + 1))
+	done
+
+	if mptcp_lib_is_ip_mptcp; then
+		# shellcheck disable=SC2086 # blanks in flags, no double quote
+		ip -n "${ns}" mptcp endpoint add "${addr}" ${flags//","/" "} \
+			${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"}
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl add "${addr}" ${flags:+flags "${flags}"} \
+			${dev:+dev "${dev}"} ${id:+id "${id}"} ${port:+port "${port}"}
+	fi
+}
+
+mptcp_lib_pm_nl_del_endpoint() {
+	local ns=${1}
+	local id=${2}
+	local addr=${3}
+
+	if mptcp_lib_is_ip_mptcp; then
+		[ "${id}" -ne 0 ] && addr=''
+		ip -n "${ns}" mptcp endpoint delete id "${id}" ${addr:+"${addr}"}
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl del "${id}" "${addr}"
+	fi
+}
+
+mptcp_lib_pm_nl_flush_endpoint() {
+	local ns=${1}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp endpoint flush
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl flush
+	fi
+}
+
+mptcp_lib_pm_nl_show_endpoints() {
+	local ns=${1}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns}" mptcp endpoint show
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl dump
+	fi
+}
+
+mptcp_lib_pm_nl_change_endpoint() {
+	local ns=${1}
+	local id=${2}
+	local flags=${3}
+
+	if mptcp_lib_is_ip_mptcp; then
+		# shellcheck disable=SC2086 # blanks in flags, no double quote
+		ip -n "${ns}" mptcp endpoint change id "${id}" ${flags//","/" "}
+	else
+		ip netns exec "${ns}" ./pm_nl_ctl set id "${id}" flags "${flags}"
+	fi
+}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
index e2d70c18786e..68899a303a1a 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
@@ -22,6 +22,28 @@ ns1=""
 ns2=""
 ns_sbox=""
 
+usage() {
+	echo "Usage: $0 [ -i ] [ -h ]"
+	echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'"
+	echo -e "\t-h: help"
+}
+
+while getopts "hi" option;do
+	case "$option" in
+	"h")
+		usage "$0"
+		exit ${KSFT_PASS}
+		;;
+	"i")
+		mptcp_lib_set_ip_mptcp
+		;;
+	"?")
+		usage "$0"
+		exit ${KSFT_FAIL}
+		;;
+	esac
+done
+
 add_mark_rules()
 {
 	local ns=$1
@@ -58,15 +80,15 @@ init()
 		# let $ns2 reach any $ns1 address from any interface
 		ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i
 
-		ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal
-		ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns1}" "10.0.${i}.1" flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns1}" "dead:beef:${i}::1" flags signal
 
-		ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal
-		ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns2}" "10.0.${i}.2" flags signal
+		mptcp_lib_pm_nl_add_endpoint "${ns2}" "dead:beef:${i}::2" flags signal
 	done
 
-	ip netns exec $ns1 ./pm_nl_ctl limits 8 8
-	ip netns exec $ns2 ./pm_nl_ctl limits 8 8
+	mptcp_lib_pm_nl_set_limits "${ns1}" 8 8
+	mptcp_lib_pm_nl_set_limits "${ns2}" 8 8
 
 	add_mark_rules $ns1 1
 	add_mark_rules $ns2 2
diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh
index 6ab8c5d36340..2757378b1b13 100755
--- a/tools/testing/selftests/net/mptcp/pm_netlink.sh
+++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
@@ -1,28 +1,28 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-# Double quotes to prevent globbing and word splitting is recommended in new
-# code but we accept it, especially because there were too many before having
-# address all other issues detected by shellcheck.
-#shellcheck disable=SC2086
-
 . "$(dirname "${0}")/mptcp_lib.sh"
 
 ret=0
 
 usage() {
-	echo "Usage: $0 [ -h ]"
+	echo "Usage: $0 [ -i ] [ -h ]"
+	echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'"
+	echo -e "\t-h: help"
 }
 
-optstring=h
+optstring=hi
 while getopts "$optstring" option;do
 	case "$option" in
 	"h")
-		usage $0
+		usage "$0"
 		exit ${KSFT_PASS}
 		;;
+	"i")
+		mptcp_lib_set_ip_mptcp
+		;;
 	"?")
-		usage $0
+		usage "$0"
 		exit ${KSFT_FAIL}
 		;;
 	esac
@@ -35,7 +35,7 @@ err=$(mktemp)
 #shellcheck disable=SC2317
 cleanup()
 {
-	rm -f $err
+	rm -f "${err}"
 	mptcp_lib_ns_exit "${ns1}"
 }
 
@@ -46,6 +46,76 @@ trap cleanup EXIT
 
 mptcp_lib_ns_init ns1
 
+format_limits() {
+	local accept="${1}"
+	local subflows="${2}"
+
+	if mptcp_lib_is_ip_mptcp; then
+		# with a space at the end
+		printf "add_addr_accepted %d subflows %d \n" "${accept}" "${subflows}"
+	else
+		printf "accept %d\nsubflows %d\n" "${accept}" "${subflows}"
+	fi
+}
+
+get_limits() {
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns1}" mptcp limits
+	else
+		ip netns exec "${ns1}" ./pm_nl_ctl limits
+	fi
+}
+
+format_endpoints() {
+	mptcp_lib_pm_nl_format_endpoints "${@}"
+}
+
+get_endpoint() {
+	# shellcheck disable=SC2317 # invoked indirectly
+	mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}"
+}
+
+change_address() {
+	local addr=${1}
+	local flags=${2}
+
+	if mptcp_lib_is_ip_mptcp; then
+		ip -n "${ns1}" mptcp endpoint change "${addr}" "${flags}"
+	else
+		ip netns exec "${ns1}" ./pm_nl_ctl set "${addr}" flags "${flags}"
+	fi
+}
+
+set_limits()
+{
+	mptcp_lib_pm_nl_set_limits "${ns1}" "${@}"
+}
+
+add_endpoint()
+{
+	mptcp_lib_pm_nl_add_endpoint "${ns1}" "${@}"
+}
+
+del_endpoint()
+{
+	mptcp_lib_pm_nl_del_endpoint "${ns1}" "${@}"
+}
+
+flush_endpoint()
+{
+	mptcp_lib_pm_nl_flush_endpoint "${ns1}"
+}
+
+show_endpoints()
+{
+	mptcp_lib_pm_nl_show_endpoints "${ns1}"
+}
+
+change_endpoint()
+{
+	mptcp_lib_pm_nl_change_endpoint "${ns1}" "${@}"
+}
+
 check()
 {
 	local cmd="$1"
@@ -67,125 +137,126 @@ check()
 	fi
 }
 
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list"
+check "show_endpoints" "" "defaults addr list"
 
-default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)"
+default_limits="$(get_limits)"
 if mptcp_lib_expect_all_features; then
-	check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
-subflows 2" "defaults limits"
+	check "get_limits" "$(format_limits 0 2)" "defaults limits"
 fi
 
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup
-check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags  10.0.1.1" "simple add/get addr"
+add_endpoint 10.0.1.1
+add_endpoint 10.0.1.2 flags subflow dev lo
+add_endpoint 10.0.1.3 flags signal,backup
+check "get_endpoint 1" "$(format_endpoints "1,10.0.1.1")" "simple add/get addr"
 
-check "ip netns exec $ns1 ./pm_nl_ctl dump" \
-"id 1 flags  10.0.1.1
-id 2 flags subflow dev lo 10.0.1.2
-id 3 flags signal,backup 10.0.1.3" "dump addrs"
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "2,10.0.1.2,subflow,lo" \
+			    "3,10.0.1.3,signal backup")" "dump addrs"
 
-ip netns exec $ns1 ./pm_nl_ctl del 2
-check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr"
-check "ip netns exec $ns1 ./pm_nl_ctl dump" \
-"id 1 flags  10.0.1.1
-id 3 flags signal,backup 10.0.1.3" "dump addrs after del"
+del_endpoint 2
+check "get_endpoint 2" "" "simple del addr"
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "3,10.0.1.3,signal backup")" "dump addrs after del"
 
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 2>/dev/null
-check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr"
+add_endpoint 10.0.1.3 2>/dev/null
+check "get_endpoint 4" "" "duplicate addr"
 
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 flags signal
-check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment"
+add_endpoint 10.0.1.4 flags signal
+check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment"
 
 for i in $(seq 5 9); do
-	ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1
+	add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1
 done
-check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit"
-check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit"
+check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit"
+check "get_endpoint 10" "" "above hard addr limit"
 
-ip netns exec $ns1 ./pm_nl_ctl del 9
+del_endpoint 9
 for i in $(seq 10 255); do
-	ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i
-	ip netns exec $ns1 ./pm_nl_ctl del $i
+	add_endpoint 10.0.0.9 id "${i}"
+	del_endpoint "${i}"
 done
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags  10.0.1.1
-id 3 flags signal,backup 10.0.1.3
-id 4 flags signal 10.0.1.4
-id 5 flags signal 10.0.1.5
-id 6 flags signal 10.0.1.6
-id 7 flags signal 10.0.1.7
-id 8 flags signal 10.0.1.8" "id limit"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs"
-
-ip netns exec $ns1 ./pm_nl_ctl limits 9 1 2>/dev/null
-check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit"
-
-ip netns exec $ns1 ./pm_nl_ctl limits 1 9 2>/dev/null
-check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit"
-
-ip netns exec $ns1 ./pm_nl_ctl limits 8 8
-check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8
-subflows 8" "set limits"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 id 100
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.5 id 254
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.6
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.7
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.8
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags  10.0.1.1
-id 2 flags  10.0.1.2
-id 3 flags  10.0.1.7
-id 4 flags  10.0.1.8
-id 100 flags  10.0.1.3
-id 101 flags  10.0.1.4
-id 254 flags  10.0.1.5
-id 255 flags  10.0.1.6" "set ids"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.1
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.2 id 254
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.3
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.4
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.5 id 253
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.6
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.7
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.8
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags  10.0.0.1
-id 2 flags  10.0.0.4
-id 3 flags  10.0.0.6
-id 4 flags  10.0.0.7
-id 5 flags  10.0.0.8
-id 253 flags  10.0.0.5
-id 254 flags  10.0.0.2
-id 255 flags  10.0.0.3" "wrap-around ids"
-
-ip netns exec $ns1 ./pm_nl_ctl flush
-ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 flags subflow
-ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags backup
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow,backup 10.0.1.1" "set flags (backup)"
-ip netns exec $ns1 ./pm_nl_ctl set 10.0.1.1 flags nobackup
-check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow 10.0.1.1" "          (nobackup)"
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "3,10.0.1.3,signal backup" \
+			    "4,10.0.1.4,signal" \
+			    "5,10.0.1.5,signal" \
+			    "6,10.0.1.6,signal" \
+			    "7,10.0.1.7,signal" \
+			    "8,10.0.1.8,signal")" "id limit"
+
+flush_endpoint
+check "show_endpoints" "" "flush addrs"
+
+set_limits 9 1 2>/dev/null
+check "get_limits" "${default_limits}" "rcv addrs above hard limit"
+
+set_limits 1 9 2>/dev/null
+check "get_limits" "${default_limits}" "subflows above hard limit"
+
+set_limits 8 8
+check "get_limits" "$(format_limits 8 8)" "set limits"
+
+flush_endpoint
+add_endpoint 10.0.1.1
+add_endpoint 10.0.1.2
+add_endpoint 10.0.1.3 id 100
+add_endpoint 10.0.1.4
+add_endpoint 10.0.1.5 id 254
+add_endpoint 10.0.1.6
+add_endpoint 10.0.1.7
+add_endpoint 10.0.1.8
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.1.1" \
+			    "2,10.0.1.2" \
+			    "3,10.0.1.7" \
+			    "4,10.0.1.8" \
+			    "100,10.0.1.3" \
+			    "101,10.0.1.4" \
+			    "254,10.0.1.5" \
+			    "255,10.0.1.6")" "set ids"
+
+flush_endpoint
+add_endpoint 10.0.0.1
+add_endpoint 10.0.0.2 id 254
+add_endpoint 10.0.0.3
+add_endpoint 10.0.0.4
+add_endpoint 10.0.0.5 id 253
+add_endpoint 10.0.0.6
+add_endpoint 10.0.0.7
+add_endpoint 10.0.0.8
+check "show_endpoints" \
+	"$(format_endpoints "1,10.0.0.1" \
+			    "2,10.0.0.4" \
+			    "3,10.0.0.6" \
+			    "4,10.0.0.7" \
+			    "5,10.0.0.8" \
+			    "253,10.0.0.5" \
+			    "254,10.0.0.2" \
+			    "255,10.0.0.3")" "wrap-around ids"
+
+flush_endpoint
+add_endpoint 10.0.1.1 flags subflow
+change_address 10.0.1.1 backup
+check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup")" \
+	"set flags (backup)"
+change_address 10.0.1.1 nobackup
+check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \
+	"          (nobackup)"
 
 # fullmesh support has been added later
-ip netns exec $ns1 ./pm_nl_ctl set id 1 flags fullmesh 2>/dev/null
-if ip netns exec $ns1 ./pm_nl_ctl dump | grep -q "fullmesh" ||
+change_endpoint 1 fullmesh 2>/dev/null
+if show_endpoints | grep -q "fullmesh" ||
    mptcp_lib_expect_all_features; then
-	check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow,fullmesh 10.0.1.1" "          (fullmesh)"
-	ip netns exec $ns1 ./pm_nl_ctl set id 1 flags nofullmesh
-	check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow 10.0.1.1" "          (nofullmesh)"
-	ip netns exec $ns1 ./pm_nl_ctl set id 1 flags backup,fullmesh
-	check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags \
-subflow,backup,fullmesh 10.0.1.1" "          (backup,fullmesh)"
+	check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow fullmesh")" \
+		"          (fullmesh)"
+	change_endpoint 1 nofullmesh
+	check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow")" \
+		"          (nofullmesh)"
+	change_endpoint 1 backup,fullmesh
+	check "show_endpoints" "$(format_endpoints "1,10.0.1.1,subflow backup fullmesh")" \
+		"          (backup,fullmesh)"
 else
 	for st in fullmesh nofullmesh backup,fullmesh; do
 		st="          (${st})"
diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
index 7426a2cbd4a0..7ad5a59adff2 100644
--- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
+++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
@@ -1276,7 +1276,7 @@ int add_listener(int argc, char *argv[])
 	struct sockaddr_storage addr;
 	struct sockaddr_in6 *a6;
 	struct sockaddr_in *a4;
-	u_int16_t family;
+	u_int16_t family = AF_UNSPEC;
 	int enable = 1;
 	int sock;
 	int err;
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 1b2366220388..4b14b4412166 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -27,10 +27,11 @@ capout=""
 size=0
 
 usage() {
-	echo "Usage: $0 [ -b ] [ -c ] [ -d ]"
+	echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]"
 	echo -e "\t-b: bail out after first error, otherwise runs al testcases"
 	echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
 	echo -e "\t-d: debug this script"
+	echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'"
 }
 
 # This function is used in the cleanup trap
@@ -45,7 +46,7 @@ cleanup()
 }
 
 mptcp_lib_check_mptcp
-mptcp_lib_check_tools ip
+mptcp_lib_check_tools ip tc
 
 #  "$ns1"              ns2                    ns3
 #     ns1eth1    ns2eth1   ns2eth3      ns3eth1
@@ -85,8 +86,8 @@ setup()
 	ip -net "$ns1" route add default via 10.0.2.2 metric 101
 	ip -net "$ns1" route add default via dead:beef:2::2 metric 101
 
-	ip netns exec "$ns1" ./pm_nl_ctl limits 1 1
-	ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow
+	mptcp_lib_pm_nl_set_limits "${ns1}" 1 1
+	mptcp_lib_pm_nl_add_endpoint "${ns1}" 10.0.2.1 dev ns1eth2 flags subflow
 
 	ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
 	ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
@@ -108,7 +109,7 @@ setup()
 	ip -net "$ns3" route add default via 10.0.3.2
 	ip -net "$ns3" route add default via dead:beef:3::2
 
-	ip netns exec "$ns3" ./pm_nl_ctl limits 1 1
+	mptcp_lib_pm_nl_set_limits "${ns3}" 1 1
 
 	# debug build can slow down measurably the test program
 	# we use quite tight time limit on the run-time, to ensure
@@ -216,8 +217,8 @@ run_test()
 	shift 4
 	local msg=$*
 
-	[ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1=""
-	[ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2=""
+	[ $delay1 -gt 0 ] && delay1="delay ${delay1}ms" || delay1=""
+	[ $delay2 -gt 0 ] && delay2="delay ${delay2}ms" || delay2=""
 
 	for dev in ns1eth1 ns1eth2; do
 		tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1
@@ -259,7 +260,7 @@ run_test()
 	fi
 }
 
-while getopts "bcdh" option;do
+while getopts "bcdhi" option;do
 	case "$option" in
 	"h")
 		usage $0
@@ -274,6 +275,9 @@ while getopts "bcdh" option;do
 	"d")
 		set -x
 		;;
+	"i")
+		mptcp_lib_set_ip_mptcp
+		;;
 	"?")
 		usage $0
 		exit ${KSFT_FAIL}
diff --git a/tools/testing/selftests/net/nat6to4.c b/tools/testing/selftests/net/nat6to4.bpf.c
index ac54c36b25fc..ac54c36b25fc 100644
--- a/tools/testing/selftests/net/nat6to4.c
+++ b/tools/testing/selftests/net/nat6to4.bpf.c
diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore
index c2229b3e40d4..0a64d6d0e29a 100644
--- a/tools/testing/selftests/netfilter/.gitignore
+++ b/tools/testing/selftests/net/netfilter/.gitignore
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
-nf-queue
-connect_close
 audit_logread
+connect_close
 conntrack_dump_flush
 sctp_collision
+nf_queue
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
new file mode 100644
index 000000000000..47945b2b3f92
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+
+top_srcdir = ../../../../..
+
+HOSTPKG_CONFIG := pkg-config
+MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null)
+MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
+
+TEST_PROGS := br_netfilter.sh bridge_brouter.sh
+TEST_PROGS += conntrack_icmp_related.sh
+TEST_PROGS += conntrack_ipip_mtu.sh
+TEST_PROGS += conntrack_tcp_unreplied.sh
+TEST_PROGS += conntrack_sctp_collision.sh
+TEST_PROGS += conntrack_vrf.sh
+TEST_PROGS += ipvs.sh
+TEST_PROGS += nf_conntrack_packetdrill.sh
+TEST_PROGS += nf_nat_edemux.sh
+TEST_PROGS += nft_audit.sh
+TEST_PROGS += nft_concat_range.sh
+TEST_PROGS += nft_conntrack_helper.sh
+TEST_PROGS += nft_fib.sh
+TEST_PROGS += nft_flowtable.sh
+TEST_PROGS += nft_meta.sh
+TEST_PROGS += nft_nat.sh
+TEST_PROGS += nft_nat_zones.sh
+TEST_PROGS += nft_queue.sh
+TEST_PROGS += nft_synproxy.sh
+TEST_PROGS += nft_zones_many.sh
+TEST_PROGS += rpath.sh
+TEST_PROGS += xt_string.sh
+
+TEST_PROGS_EXTENDED = nft_concat_range_perf.sh
+
+TEST_GEN_PROGS = conntrack_dump_flush
+
+TEST_GEN_FILES = audit_logread
+TEST_GEN_FILES += connect_close nf_queue
+TEST_GEN_FILES += sctp_collision
+
+include ../../lib.mk
+
+$(OUTPUT)/nf_queue: CFLAGS += $(MNL_CFLAGS)
+$(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS)
+
+$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS)
+$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS)
+
+TEST_FILES := lib.sh
+TEST_FILES += packetdrill
+
+TEST_INCLUDES := \
+	../lib.sh
diff --git a/tools/testing/selftests/netfilter/audit_logread.c b/tools/testing/selftests/net/netfilter/audit_logread.c
index a0a880fc2d9d..a0a880fc2d9d 100644
--- a/tools/testing/selftests/netfilter/audit_logread.c
+++ b/tools/testing/selftests/net/netfilter/audit_logread.c
diff --git a/tools/testing/selftests/net/netfilter/br_netfilter.sh b/tools/testing/selftests/net/netfilter/br_netfilter.sh
new file mode 100755
index 000000000000..c28379a965d8
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/br_netfilter.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test for legacy br_netfilter module combined with connection tracking,
+# a combination that doesn't really work.
+# Multicast/broadcast packets race for hash table insertion.
+
+#           eth0    br0     eth0
+# setup is: ns1 <->,ns0 <-> ns3
+#           ns2 <-'    `'-> ns4
+
+source lib.sh
+
+checktool "nft --version" "run test without nft tool"
+
+cleanup() {
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+setup_ns ns0 ns1 ns2 ns3 ns4
+
+ret=0
+
+do_ping()
+{
+	fromns="$1"
+	dstip="$2"
+
+	if ! ip netns exec "$fromns" ping -c 1 -q "$dstip" > /dev/null; then
+		echo "ERROR: ping from $fromns to $dstip"
+		ip netns exec "$ns0" nft list ruleset
+		ret=1
+	fi
+}
+
+bcast_ping()
+{
+	fromns="$1"
+	dstip="$2"
+
+	local packets=500
+
+	[ "$KSFT_MACHINE_SLOW" = yes ] && packets=100
+
+	for i in $(seq 1 $packets); do
+		if ! ip netns exec "$fromns" ping -q -f -b -c 1 -q "$dstip" > /dev/null 2>&1; then
+			echo "ERROR: ping -b from $fromns to $dstip"
+			ip netns exec "$ns0" nft list ruleset
+			ret=1
+			break
+		fi
+	done
+}
+
+ip netns exec "$ns0" sysctl -q net.ipv4.conf.all.rp_filter=0
+ip netns exec "$ns0" sysctl -q net.ipv4.conf.default.rp_filter=0
+
+if ! ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns1"; then
+	echo "SKIP: Can't create veth device"
+	exit $ksft_skip
+fi
+
+ip link add veth2 netns "$ns0" type veth peer name eth0 netns "$ns2"
+ip link add veth3 netns "$ns0" type veth peer name eth0 netns "$ns3"
+ip link add veth4 netns "$ns0" type veth peer name eth0 netns "$ns4"
+
+for i in $(seq 1 4); do
+  ip -net "$ns0" link set "veth$i" up
+done
+
+if ! ip -net "$ns0" link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1; then
+	echo "SKIP: Can't create bridge br0"
+	exit $ksft_skip
+fi
+
+# make veth0,1,2 part of bridge.
+for i in $(seq 1 3); do
+  ip -net "$ns0" link set "veth$i" master br0
+done
+
+# add a macvlan on top of the bridge.
+MACVLAN_ADDR=ba:f3:13:37:42:23
+ip -net "$ns0" link add link br0 name macvlan0 type macvlan mode private
+ip -net "$ns0" link set macvlan0 address ${MACVLAN_ADDR}
+ip -net "$ns0" link set macvlan0 up
+ip -net "$ns0" addr add 10.23.0.1/24 dev macvlan0
+
+# add a macvlan on top of veth4.
+MACVLAN_ADDR=ba:f3:13:37:42:24
+ip -net "$ns0" link add link veth4 name macvlan4 type macvlan mode passthru
+ip -net "$ns0" link set macvlan4 address ${MACVLAN_ADDR}
+ip -net "$ns0" link set macvlan4 up
+
+# make the macvlan part of the bridge.
+# veth4 is not a bridge port, only the macvlan on top of it.
+ip -net "$ns0" link set macvlan4 master br0
+
+ip -net "$ns0" link set br0 up
+ip -net "$ns0" addr add 10.0.0.1/24 dev br0
+
+modprobe -q br_netfilter
+if ! ip netns exec "$ns0" sysctl -q net.bridge.bridge-nf-call-iptables=1; then
+	echo "SKIP: bridge netfilter not available"
+	ret=$ksft_skip
+fi
+
+# for testing, so namespaces will reply to ping -b probes.
+ip netns exec "$ns0" sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0
+
+# enable conntrack in ns0 and drop broadcast packets in forward to
+# avoid them from getting confirmed in the postrouting hook before
+# the cloned skb is passed up the stack.
+ip netns exec "$ns0" nft -f - <<EOF
+table ip filter {
+	chain input {
+		type filter hook input priority 1; policy accept
+		iifname br0 counter
+		ct state new accept
+	}
+}
+
+table bridge filter {
+	chain forward {
+		type filter hook forward priority 0; policy accept
+		meta pkttype broadcast ip protocol icmp counter drop
+	}
+}
+EOF
+if [ "$?" -ne 0 ];then
+	echo "SKIP: could not add nftables ruleset"
+	exit $ksft_skip
+fi
+
+# place 1, 2 & 3 in same subnet, connected via ns0:br0.
+# ns4 is placed in same subnet as well, but its not
+# part of the bridge: the corresponding veth4 is not
+# part of the bridge, only its macvlan interface.
+for i in $(seq 1 4); do
+  eval ip -net \$ns"$i" link set eth0 up
+done
+for i in $(seq 1 2); do
+  eval ip -net \$ns"$i" addr add "10.0.0.1$i/24" dev eth0
+done
+
+ip -net "$ns3" addr add 10.23.0.13/24 dev eth0
+ip -net "$ns4" addr add 10.23.0.14/24 dev eth0
+
+# test basic connectivity
+do_ping "$ns1" 10.0.0.12
+do_ping "$ns3" 10.23.0.1
+do_ping "$ns4" 10.23.0.1
+
+bcast_ping "$ns1" 10.0.0.255
+
+# This should deliver broadcast to macvlan0, which is on top of ns0:br0.
+bcast_ping "$ns3" 10.23.0.255
+
+# same, this time via veth4:macvlan4.
+bcast_ping "$ns4" 10.23.0.255
+
+read t < /proc/sys/kernel/tainted
+if [ "$t" -eq 0 ];then
+	echo PASS: kernel not tainted
+else
+	echo ERROR: kernel is tainted
+	ret=1
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/bridge_brouter.sh b/tools/testing/selftests/net/netfilter/bridge_brouter.sh
new file mode 100755
index 000000000000..2549b6590693
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/bridge_brouter.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+#
+# This test is for bridge 'brouting', i.e. make some packets being routed
+# rather than getting bridged even though they arrive on interface that is
+# part of a bridge.
+
+#           eth0    br0     eth0
+# setup is: ns1 <-> nsbr <-> ns2
+
+source lib.sh
+
+if ! ebtables -V > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without ebtables"
+	exit $ksft_skip
+fi
+
+cleanup() {
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+setup_ns nsbr ns1 ns2
+
+ip netns exec "$nsbr" sysctl -q net.ipv4.conf.default.rp_filter=0
+ip netns exec "$nsbr" sysctl -q net.ipv4.conf.all.rp_filter=0
+if ! ip link add veth0 netns "$nsbr" type veth peer name eth0 netns "$ns1"; then
+	echo "SKIP: Can't create veth device"
+	exit $ksft_skip
+fi
+ip link add veth1 netns "$nsbr" type veth peer name eth0 netns "$ns2"
+
+if ! ip -net "$nsbr" link add br0 type bridge; then
+	echo "SKIP: Can't create bridge br0"
+	exit $ksft_skip
+fi
+
+ip -net "$nsbr" link set veth0 up
+ip -net "$nsbr" link set veth1 up
+
+ip -net "$nsbr" link set veth0 master br0
+ip -net "$nsbr" link set veth1 master br0
+ip -net "$nsbr" link set br0 up
+ip -net "$nsbr" addr add 10.0.0.1/24 dev br0
+
+# place both in same subnet, ${ns1} and ${ns2} connected via ${nsbr}:br0
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns1" addr add 10.0.0.11/24 dev eth0
+ip -net "$ns2" addr add 10.0.0.12/24 dev eth0
+
+test_ebtables_broute()
+{
+	# redirect is needed so the dstmac is rewritten to the bridge itself,
+	# ip stack won't process OTHERHOST (foreign unicast mac) packets.
+	if ! ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP; then
+		echo "SKIP: Could not add ebtables broute redirect rule"
+		return $ksft_skip
+	fi
+
+	ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=0
+
+	# ping net${ns1}, expected to not work (ip forwarding is off)
+	if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then
+		echo "ERROR: ping works, should have failed" 1>&2
+		return 1
+	fi
+
+	# enable forwarding on both interfaces.
+	# neither needs an ip address, but at least the bridge needs
+	# an ip address in same network segment as ${ns1} and ${ns2} (${nsbr}
+	# needs to be able to determine route for to-be-forwarded packet).
+	ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth0.forwarding=1
+	ip netns exec "$nsbr" sysctl -q net.ipv4.conf.veth1.forwarding=1
+
+	if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then
+		echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2
+		return 1
+	fi
+
+	echo "PASS: ${ns1}/${ns2} connectivity with active broute rule"
+	ip netns exec "$nsbr" ebtables -t broute -F
+
+	# ping net${ns1}, expected to work (frames are bridged)
+	if ! ip netns exec "$ns1" ping -q -c 1 10.0.0.12 > /dev/null; then
+		echo "ERROR: ping did not work, but it should (bridged)" 1>&2
+		return 1
+	fi
+
+	ip netns exec "$nsbr" ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP
+
+	# ping net${ns1}, expected to not work (DROP in bridge forward)
+	if ip netns exec "$ns1" ping -q -c 1 10.0.0.12 -W 0.5 > /dev/null 2>&1; then
+		echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2
+		return 1
+	fi
+
+	# re-activate brouter
+	ip netns exec "$nsbr" ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP
+
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.0.11 > /dev/null; then
+		echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2
+		return 1
+	fi
+
+	echo "PASS: ${ns1}/${ns2} connectivity with active broute rule and bridge forward drop"
+	return 0
+}
+
+# test basic connectivity
+if ! ip netns exec "$ns1" ping -c 1 -q 10.0.0.12 > /dev/null; then
+    echo "ERROR: Could not reach ${ns2} from ${ns1}" 1>&2
+    exit 1
+fi
+
+if ! ip netns exec "$ns2" ping -c 1 -q 10.0.0.11 > /dev/null; then
+    echo "ERROR: Could not reach ${ns1} from ${ns2}" 1>&2
+    exit 1
+fi
+
+test_ebtables_broute
+exit $?
diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config
new file mode 100644
index 000000000000..63ef80ef47a4
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/config
@@ -0,0 +1,89 @@
+CONFIG_AUDIT=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BRIDGE=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_NETFILTER=m
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_CGROUP_BPF=y
+CONFIG_DUMMY=m
+CONFIG_INET_ESP=m
+CONFIG_IP_NF_MATCH_RPFILTER=m
+CONFIG_IP6_NF_MATCH_RPFILTER=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP6_NF_RAW=m
+CONFIG_IP_SCTP=m
+CONFIG_IP_VS=m
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_RR=m
+CONFIG_IPV6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_MACVLAN=m
+CONFIG_NAMESPACES=y
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_L3_MASTER_DEV=y
+CONFIG_NET_NS=y
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_IPIP=m
+CONFIG_NET_VRF=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_SYNPROXY=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_NAT=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_TARGET_REDIRECT=m
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=m
+CONFIG_NF_CONNTRACK_MARK=y
+CONFIG_NF_CONNTRACK_ZONES=y
+CONFIG_NF_CT_NETLINK=m
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_FLOW_TABLE=m
+CONFIG_NF_LOG_IPV4=m
+CONFIG_NF_LOG_IPV6=m
+CONFIG_NF_NAT=m
+CONFIG_NF_NAT_REDIRECT=y
+CONFIG_NF_NAT_MASQUERADE=y
+CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_NF_TABLES_INET=y
+CONFIG_NF_TABLES_IPV4=y
+CONFIG_NF_TABLES_IPV6=y
+CONFIG_NF_TABLES_NETDEV=y
+CONFIG_NF_FLOW_TABLE_INET=m
+CONFIG_NFT_BRIDGE_META=m
+CONFIG_NFT_COMPAT=m
+CONFIG_NFT_CT=m
+CONFIG_NFT_FIB=m
+CONFIG_NFT_FIB_INET=m
+CONFIG_NFT_FIB_IPV4=m
+CONFIG_NFT_FIB_IPV6=m
+CONFIG_NFT_FLOW_OFFLOAD=m
+CONFIG_NFT_LIMIT=m
+CONFIG_NFT_LOG=m
+CONFIG_NFT_MASQ=m
+CONFIG_NFT_NAT=m
+CONFIG_NFT_NUMGEN=m
+CONFIG_NFT_QUEUE=m
+CONFIG_NFT_QUOTA=m
+CONFIG_NFT_REDIR=m
+CONFIG_NFT_SYNPROXY=m
+CONFIG_VETH=m
+CONFIG_VLAN_8021Q=m
+CONFIG_XFRM_USER=m
+CONFIG_XFRM_STATISTICS=y
+CONFIG_NET_PKTGEN=m
+CONFIG_TUN=m
diff --git a/tools/testing/selftests/netfilter/connect_close.c b/tools/testing/selftests/net/netfilter/connect_close.c
index 1c3b0add54c4..1c3b0add54c4 100644
--- a/tools/testing/selftests/netfilter/connect_close.c
+++ b/tools/testing/selftests/net/netfilter/connect_close.c
diff --git a/tools/testing/selftests/netfilter/conntrack_dump_flush.c b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
index b11ea8ee6719..bd9317bf5ada 100644
--- a/tools/testing/selftests/netfilter/conntrack_dump_flush.c
+++ b/tools/testing/selftests/net/netfilter/conntrack_dump_flush.c
@@ -10,7 +10,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
-#include "../kselftest_harness.h"
+#include "../../kselftest_harness.h"
 
 #define TEST_ZONE_ID 123
 #define NF_CT_DEFAULT_ZONE_ID 0
@@ -313,13 +313,11 @@ FIXTURE_SETUP(conntrack_dump_flush)
 	self->sock = mnl_socket_open(NETLINK_NETFILTER);
 	if (!self->sock) {
 		perror("mnl_socket_open");
-		exit(EXIT_FAILURE);
+		SKIP(return, "cannot open netlink_netfilter socket");
 	}
 
-	if (mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID) < 0) {
-		perror("mnl_socket_bind");
-		exit(EXIT_FAILURE);
-	}
+	ret = mnl_socket_bind(self->sock, 0, MNL_SOCKET_AUTOPID);
+	EXPECT_EQ(ret, 0);
 
 	ret = conntracK_count_zone(self->sock, TEST_ZONE_ID);
 	if (ret < 0 && errno == EPERM)
diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh
index 76645aaf2b58..c63d840ead61 100755
--- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_icmp_related.sh
@@ -14,35 +14,32 @@
 # check the icmp errors are propagated to the correct host as per
 # nat of "established" icmp-echo "connection".
 
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
+source lib.sh
 
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
+if ! nft --version > /dev/null 2>&1;then
 	echo "SKIP: Could not run test without nft tool"
 	exit $ksft_skip
 fi
 
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
 cleanup() {
-	for i in 1 2;do ip netns del nsclient$i;done
-	for i in 1 2;do ip netns del nsrouter$i;done
+	cleanup_all_ns
 }
 
 trap cleanup EXIT
 
-ipv4() {
-    echo -n 192.168.$1.2
-}
+setup_ns nsclient1 nsclient2 nsrouter1 nsrouter2
+
+ret=0
+
+add_addr()
+{
+	ns=$1
+	dev=$2
+	i=$3
 
-ipv6 () {
-    echo -n dead:$1::2
+	ip -net "$ns" link set "$dev" up
+	ip -net "$ns" addr add "192.168.$i.2/24" dev "$dev"
+	ip -net "$ns" addr add "dead:$i::2/64" dev "$dev" nodad
 }
 
 check_counter()
@@ -52,10 +49,9 @@ check_counter()
 	expect=$3
 	local lret=0
 
-	cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then
 		echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2
-		ip netns exec $ns nft list counter inet filter "$name" 1>&2
+		ip netns exec "$ns" nft list counter inet filter "$name" 1>&2
 		lret=1
 	fi
 
@@ -65,9 +61,8 @@ check_counter()
 check_unknown()
 {
 	expect="packets 0 bytes 0"
-	for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do
-		check_counter $n "unknown" "$expect"
-		if [ $? -ne 0 ] ;then
+	for n in ${nsclient1} ${nsclient2} ${nsrouter1} ${nsrouter2}; do
+		if ! check_counter "$n" "unknown" "$expect"; then
 			return 1
 		fi
 	done
@@ -75,61 +70,48 @@ check_unknown()
 	return 0
 }
 
-for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do
-  ip netns add $n
-  ip -net $n link set lo up
-done
-
-DEV=veth0
-ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1
 DEV=veth0
-ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2
+ip link add "$DEV" netns "$nsclient1" type veth peer name eth1 netns "$nsrouter1"
+ip link add "$DEV" netns "$nsclient2" type veth peer name eth1 netns "$nsrouter2"
+ip link add "$DEV" netns "$nsrouter1" type veth peer name eth2 netns "$nsrouter2"
 
-DEV=veth0
-ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2
+add_addr "$nsclient1" $DEV 1
+add_addr "$nsclient2" $DEV 2
 
-DEV=veth0
-for i in 1 2; do
-    ip -net nsclient$i link set $DEV up
-    ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV
-    ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV
-done
-
-ip -net nsrouter1 link set eth1 up
-ip -net nsrouter1 link set veth0 up
+ip -net "$nsrouter1" link set eth1 up
+ip -net "$nsrouter1" link set $DEV up
 
-ip -net nsrouter2 link set eth1 up
-ip -net nsrouter2 link set eth2 up
+ip -net "$nsrouter2" link set eth1 mtu 1280 up
+ip -net "$nsrouter2" link set eth2 up
 
-ip -net nsclient1 route add default via 192.168.1.1
-ip -net nsclient1 -6 route add default via dead:1::1
+ip -net "$nsclient1" route add default via 192.168.1.1
+ip -net "$nsclient1" -6 route add default via dead:1::1
 
-ip -net nsclient2 route add default via 192.168.2.1
-ip -net nsclient2 route add default via dead:2::1
+ip -net "$nsclient2" route add default via 192.168.2.1
+ip -net "$nsclient2" route add default via dead:2::1
+ip -net "$nsclient2" link set veth0 mtu 1280
 
-i=3
-ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1
-ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0
-ip -net nsrouter1 addr add dead:1::1/64 dev eth1
-ip -net nsrouter1 addr add dead:3::1/64 dev veth0
-ip -net nsrouter1 route add default via 192.168.3.10
-ip -net nsrouter1 -6 route add default via dead:3::10
+ip -net "$nsrouter1" addr add 192.168.1.1/24 dev eth1
+ip -net "$nsrouter1" addr add 192.168.3.1/24 dev veth0
+ip -net "$nsrouter1" addr add dead:1::1/64 dev eth1 nodad
+ip -net "$nsrouter1" addr add dead:3::1/64 dev veth0 nodad
+ip -net "$nsrouter1" route add default via 192.168.3.10
+ip -net "$nsrouter1" -6 route add default via dead:3::10
 
-ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1
-ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2
-ip -net nsrouter2 addr add dead:2::1/64 dev eth1
-ip -net nsrouter2 addr add dead:3::10/64 dev eth2
-ip -net nsrouter2 route add default via 192.168.3.1
-ip -net nsrouter2 route add default via dead:3::1
+ip -net "$nsrouter2" addr add 192.168.2.1/24 dev eth1
+ip -net "$nsrouter2" addr add 192.168.3.10/24 dev eth2
+ip -net "$nsrouter2" addr add dead:2::1/64  dev eth1 nodad
+ip -net "$nsrouter2" addr add dead:3::10/64 dev eth2 nodad
+ip -net "$nsrouter2" route add default via 192.168.3.1
+ip -net "$nsrouter2" route add default via dead:3::1
 
-sleep 2
 for i in 4 6; do
-	ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1
-	ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1
+	ip netns exec "$nsrouter1" sysctl -q net.ipv$i.conf.all.forwarding=1
+	ip netns exec "$nsrouter2" sysctl -q net.ipv$i.conf.all.forwarding=1
 done
 
-for netns in nsrouter1 nsrouter2; do
-ip netns exec $netns nft -f - <<EOF
+for netns in "$nsrouter1" "$nsrouter2"; do
+ip netns exec "$netns" nft -f - <<EOF
 table inet filter {
 	counter unknown { }
 	counter related { }
@@ -144,7 +126,7 @@ table inet filter {
 EOF
 done
 
-ip netns exec nsclient1 nft -f - <<EOF
+ip netns exec "$nsclient1" nft -f - <<EOF
 table inet filter {
 	counter unknown { }
 	counter related { }
@@ -164,7 +146,7 @@ table inet filter {
 }
 EOF
 
-ip netns exec nsclient2 nft -f - <<EOF
+ip netns exec "$nsclient2" nft -f - <<EOF
 table inet filter {
 	counter unknown { }
 	counter new { }
@@ -189,11 +171,10 @@ table inet filter {
 }
 EOF
 
-
 # make sure NAT core rewrites adress of icmp error if nat is used according to
 # conntrack nat information (icmp error will be directed at nsrouter1 address,
 # but it needs to be routed to nsclient1 address).
-ip netns exec nsrouter1 nft -f - <<EOF
+ip netns exec "$nsrouter1" nft -f - <<EOF
 table ip nat {
 	chain postrouting {
 		type nat hook postrouting priority 0; policy accept;
@@ -208,44 +189,32 @@ table ip6 nat {
 }
 EOF
 
-ip netns exec nsrouter2 ip link set eth1  mtu 1280
-ip netns exec nsclient2 ip link set veth0 mtu 1280
-sleep 1
-
-ip netns exec nsclient1 ping -c 1 -s 1000 -q -M do 192.168.2.2 >/dev/null
-if [ $? -ne 0 ]; then
+if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q -M "do" 192.168.2.2 >/dev/null; then
 	echo "ERROR: netns ip routing/connectivity broken" 1>&2
-	cleanup
 	exit 1
 fi
-ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null
-if [ $? -ne 0 ]; then
+if ! ip netns exec "$nsclient1" ping -c 1 -s 1000 -q dead:2::2 >/dev/null; then
 	echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2
-	cleanup
 	exit 1
 fi
 
-check_unknown
-if [ $? -ne 0 ]; then
+if ! check_unknown; then
 	ret=1
 fi
 
 expect="packets 0 bytes 0"
-for netns in nsrouter1 nsrouter2 nsclient1;do
-	check_counter "$netns" "related" "$expect"
-	if [ $? -ne 0 ]; then
+for netns in "$nsrouter1" "$nsrouter2" "$nsclient1";do
+	if ! check_counter "$netns" "related" "$expect"; then
 		ret=1
 	fi
 done
 
 expect="packets 2 bytes 2076"
-check_counter nsclient2 "new" "$expect"
-if [ $? -ne 0 ]; then
+if ! check_counter "$nsclient2" "new" "$expect"; then
 	ret=1
 fi
 
-ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null
-if [ $? -eq 0 ]; then
+if ip netns exec "$nsclient1" ping -W 0.5 -q -c 1 -s 1300 -M "do" 192.168.2.2 > /dev/null; then
 	echo "ERROR: ping should have failed with PMTU too big error" 1>&2
 	ret=1
 fi
@@ -253,30 +222,26 @@ fi
 # nsrouter2 should have generated the icmp error, so
 # related counter should be 0 (its in forward).
 expect="packets 0 bytes 0"
-check_counter "nsrouter2" "related" "$expect"
-if [ $? -ne 0 ]; then
+if ! check_counter "$nsrouter2" "related" "$expect"; then
 	ret=1
 fi
 
 # but nsrouter1 should have seen it, same for nsclient1.
 expect="packets 1 bytes 576"
-for netns in nsrouter1 nsclient1;do
-	check_counter "$netns" "related" "$expect"
-	if [ $? -ne 0 ]; then
+for netns in ${nsrouter1} ${nsclient1};do
+	if ! check_counter "$netns" "related" "$expect"; then
 		ret=1
 	fi
 done
 
-ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null
-if [ $? -eq 0 ]; then
+if ip netns exec "${nsclient1}" ping6 -W 0.5 -c 1 -s 1300 dead:2::2 > /dev/null; then
 	echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2
 	ret=1
 fi
 
 expect="packets 2 bytes 1856"
-for netns in nsrouter1 nsclient1;do
-	check_counter "$netns" "related" "$expect"
-	if [ $? -ne 0 ]; then
+for netns in "${nsrouter1}" "${nsclient1}";do
+	if ! check_counter "$netns" "related" "$expect"; then
 		ret=1
 	fi
 done
@@ -288,21 +253,19 @@ else
 fi
 
 # add 'bad' route,  expect icmp REDIRECT to be generated
-ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1
-ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1
+ip netns exec "${nsclient1}" ip route add 192.168.1.42 via 192.168.1.1
+ip netns exec "${nsclient1}" ip route add dead:1::42 via dead:1::1
 
-ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null
+ip netns exec "$nsclient1" ping -W 1 -q -i 0.5 -c 2 192.168.1.42 > /dev/null
 
 expect="packets 1 bytes 112"
-check_counter nsclient1 "redir4" "$expect"
-if [ $? -ne 0 ];then
+if ! check_counter "$nsclient1" "redir4" "$expect"; then
 	ret=1
 fi
 
-ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null
+ip netns exec "$nsclient1" ping -W 1 -c 1 dead:1::42 > /dev/null
 expect="packets 1 bytes 192"
-check_counter nsclient1 "redir6" "$expect"
-if [ $? -ne 0 ];then
+if ! check_counter "$nsclient1" "redir6" "$expect"; then
 	ret=1
 fi
 
diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh
index eb9553e4986b..9832a5d0198a 100755
--- a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_ipip_mtu.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
+source lib.sh
 
 # Conntrack needs to reassemble fragments in order to have complete
 # packets for rule matching.  Reassembly can lead to packet loss.
@@ -23,56 +22,44 @@ ksft_skip=4
 # between Client A and Client B over WAN. Wanrouter has MTU 1400 set
 # on its interfaces.
 
-rnd=$(mktemp -u XXXXXXXX)
 rx=$(mktemp)
 
-r_a="ns-ra-$rnd"
-r_b="ns-rb-$rnd"
-r_w="ns-rw-$rnd"
-c_a="ns-ca-$rnd"
-c_b="ns-cb-$rnd"
-
-checktool (){
-	if ! $1 > /dev/null 2>&1; then
-		echo "SKIP: Could not $2"
-		exit $ksft_skip
-	fi
-}
-
 checktool "iptables --version" "run test without iptables"
-checktool "ip -Version" "run test without ip tool"
-checktool "which socat" "run test without socat"
-checktool "ip netns add ${r_a}" "create net namespace"
+checktool "socat -h" "run test without socat"
 
-for n in ${r_b} ${r_w} ${c_a} ${c_b};do
-	ip netns add ${n}
-done
+setup_ns r_a r_b r_w c_a c_b
 
 cleanup() {
-	for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do
-		ip netns del ${n}
-	done
-	rm -f ${rx}
+	cleanup_all_ns
+	rm -f "$rx"
 }
 
 trap cleanup EXIT
 
+listener_ready()
+{
+	ns="$1"
+	port="$2"
+	ss -N "$ns" -lnu -o "sport = :$port" | grep -q "$port"
+}
+
 test_path() {
 	msg="$1"
 
-	ip netns exec ${c_b} socat -t 3 - udp4-listen:5000,reuseaddr > ${rx} < /dev/null &
+	ip netns exec "$c_b" socat -t 3 - udp4-listen:5000,reuseaddr > "$rx" < /dev/null &
+
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$c_b" 5000
 
-	sleep 1
 	for i in 1 2 3; do
 		head -c1400 /dev/zero | tr "\000" "a" | \
-			ip netns exec ${c_a} socat -t 1 -u STDIN UDP:192.168.20.2:5000
+			ip netns exec "$c_a" socat -t 1 -u STDIN UDP:192.168.20.2:5000
 	done
 
 	wait
 
-	bytes=$(wc -c < ${rx})
+	bytes=$(wc -c < "$rx")
 
-	if [ $bytes -eq 1400 ];then
+	if [ "$bytes" -eq 1400 ];then
 		echo "OK: PMTU $msg connection tracking"
 	else
 		echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400"
@@ -91,24 +78,24 @@ test_path() {
 # 10.4.4.1 via 10.2.2.254      (Router B via Wanrouter)
 # No iptables rules at all.
 
-ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w}
-ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a}
+ip link add veth0 netns "$r_a" type veth peer name veth0 netns "$r_w"
+ip link add veth1 netns "$r_a" type veth peer name veth0 netns "$c_a"
 
 l_addr="10.2.2.1"
 r_addr="10.4.4.1"
-ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip
+ip netns exec "$r_a" ip link add ipip0 type ipip local "$l_addr" remote "$r_addr" mode ipip || exit $ksft_skip
 
 for dev in lo veth0 veth1 ipip0; do
-    ip -net ${r_a} link set $dev up
+    ip -net "$r_a" link set "$dev" up
 done
 
-ip -net ${r_a} addr add 10.2.2.1/24 dev veth0
-ip -net ${r_a} addr add 192.168.10.1/24 dev veth1
+ip -net "$r_a" addr add 10.2.2.1/24 dev veth0
+ip -net "$r_a" addr add 192.168.10.1/24 dev veth1
 
-ip -net ${r_a} route add 192.168.20.0/24 dev ipip0
-ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254
+ip -net "$r_a" route add 192.168.20.0/24 dev ipip0
+ip -net "$r_a" route add 10.4.4.0/24 via 10.2.2.254
 
-ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+ip netns exec "$r_a" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
 
 # Detailed setup for Router B
 # ---------------------------
@@ -121,49 +108,46 @@ ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
 # 10.2.2.1 via 10.4.4.254      (Router A via Wanrouter)
 # No iptables rules at all.
 
-ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w}
-ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b}
+ip link add veth0 netns "$r_b" type veth peer name veth1 netns "$r_w"
+ip link add veth1 netns "$r_b" type veth peer name veth0 netns "$c_b"
 
 l_addr="10.4.4.1"
 r_addr="10.2.2.1"
 
-ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip
+ip netns exec "$r_b" ip link add ipip0 type ipip local "${l_addr}" remote "${r_addr}" mode ipip || exit $ksft_skip
 
-for dev in lo veth0 veth1 ipip0; do
-	ip -net ${r_b} link set $dev up
+for dev in veth0 veth1 ipip0; do
+	ip -net "$r_b" link set $dev up
 done
 
-ip -net ${r_b} addr add 10.4.4.1/24 dev veth0
-ip -net ${r_b} addr add 192.168.20.1/24 dev veth1
+ip -net "$r_b" addr add 10.4.4.1/24 dev veth0
+ip -net "$r_b" addr add 192.168.20.1/24 dev veth1
 
-ip -net ${r_b} route add 192.168.10.0/24 dev ipip0
-ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254
-ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+ip -net "$r_b" route add 192.168.10.0/24 dev ipip0
+ip -net "$r_b" route add 10.2.2.0/24 via 10.4.4.254
+ip netns exec "$r_b" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
 
 # Client A
-ip -net ${c_a} addr add 192.168.10.2/24 dev veth0
-ip -net ${c_a} link set dev lo up
-ip -net ${c_a} link set dev veth0 up
-ip -net ${c_a} route add default via 192.168.10.1
+ip -net "$c_a" addr add 192.168.10.2/24 dev veth0
+ip -net "$c_a" link set dev veth0 up
+ip -net "$c_a" route add default via 192.168.10.1
 
 # Client A
-ip -net ${c_b} addr add 192.168.20.2/24 dev veth0
-ip -net ${c_b} link set dev veth0 up
-ip -net ${c_b} link set dev lo up
-ip -net ${c_b} route add default via 192.168.20.1
+ip -net "$c_b" addr add 192.168.20.2/24 dev veth0
+ip -net "$c_b" link set dev veth0 up
+ip -net "$c_b" route add default via 192.168.20.1
 
 # Wan
-ip -net ${r_w} addr add 10.2.2.254/24 dev veth0
-ip -net ${r_w} addr add 10.4.4.254/24 dev veth1
+ip -net "$r_w" addr add 10.2.2.254/24 dev veth0
+ip -net "$r_w" addr add 10.4.4.254/24 dev veth1
 
-ip -net ${r_w} link set dev lo up
-ip -net ${r_w} link set dev veth0 up mtu 1400
-ip -net ${r_w} link set dev veth1 up mtu 1400
+ip -net "$r_w" link set dev veth0 up mtu 1400
+ip -net "$r_w" link set dev veth1 up mtu 1400
 
-ip -net ${r_a} link set dev veth0 mtu 1400
-ip -net ${r_b} link set dev veth0 mtu 1400
+ip -net "$r_a" link set dev veth0 mtu 1400
+ip -net "$r_b" link set dev veth0 mtu 1400
 
-ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
+ip netns exec "$r_w" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
 
 # Path MTU discovery
 # ------------------
@@ -203,5 +187,5 @@ test_path "without"
 #packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is
 #dropped on Router A before sending.
 
-ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW
+ip netns exec "$r_a" iptables -A FORWARD -m conntrack --ctstate NEW
 test_path "with"
diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh
new file mode 100755
index 000000000000..d860f7d9744b
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing For SCTP COLLISION SCENARIO as Below:
+#
+#   14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359]
+#   14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187]
+#   14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359]
+#   14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO]
+#   14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK]
+#   14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970]
+#
+# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS
+
+source lib.sh
+
+CLIENT_IP="198.51.200.1"
+CLIENT_PORT=1234
+
+SERVER_IP="198.51.100.1"
+SERVER_PORT=1234
+
+CLIENT_GW="198.51.200.2"
+SERVER_GW="198.51.100.2"
+
+# setup the topo
+setup() {
+	setup_ns CLIENT_NS SERVER_NS ROUTER_NS
+	ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS"
+	ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS"
+
+	ip -n "$SERVER_NS" link set link0 up
+	ip -n "$SERVER_NS" addr add $SERVER_IP/24 dev link0
+	ip -n "$SERVER_NS" route add $CLIENT_IP dev link0 via $SERVER_GW
+
+	ip -n "$ROUTER_NS" link set link1 up
+	ip -n "$ROUTER_NS" link set link2 up
+	ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1
+	ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2
+	ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1
+
+	ip -n "$CLIENT_NS" link set link3 up
+	ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3
+	ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW
+
+	# simulate the delay on OVS upcall by setting up a delay for INIT_ACK with
+	# tc on $SERVER_NS side
+	tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64
+	tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit
+	tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \
+		0xff match u8 2 0xff at 32 flowid 1:1
+	if ! tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then
+		echo "SKIP: Cannot add netem qdisc"
+		exit $ksft_skip
+	fi
+
+	# simulate the ctstate check on OVS nf_conntrack
+	ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP
+	ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP
+
+	# use a smaller number for assoc's max_retrans to reproduce the issue
+	modprobe -q sctp
+	ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3
+}
+
+cleanup() {
+	ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1
+	ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1
+	cleanup_all_ns
+}
+
+do_test() {
+	ip net exec "$SERVER_NS" ./sctp_collision server \
+		$SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT &
+	ip net exec "$CLIENT_NS" ./sctp_collision client \
+		$CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT
+}
+
+# NOTE: one way to work around the issue is set a smaller hb_interval
+# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500
+
+# run the test case
+trap cleanup EXIT
+setup && \
+echo "Test for SCTP Collision in nf_conntrack:" && \
+do_test && echo "PASS!"
+exit $?
diff --git a/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh
new file mode 100755
index 000000000000..121ea93c0178
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_tcp_unreplied.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that UNREPLIED tcp conntrack will eventually timeout.
+#
+
+source lib.sh
+
+if ! nft --version > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+if ! conntrack --version > /dev/null 2>&1;then
+	echo "SKIP: Could not run test without conntrack tool"
+	exit $ksft_skip
+fi
+
+ret=0
+
+cleanup() {
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+ipv4() {
+    echo -n 192.168."$1".2
+}
+
+check_counter()
+{
+	ns=$1
+	name=$2
+	expect=$3
+	local lret=0
+
+	if ! ip netns exec "$ns2" nft list counter inet filter "$name" | grep -q "$expect"; then
+		echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2
+		ip netns exec "$ns2" nft list counter inet filter "$name" 1>&2
+		lret=1
+	fi
+
+	return $lret
+}
+
+trap cleanup EXIT
+
+# Create test namespaces
+setup_ns ns1 ns2
+
+# Connect the namespace to the host using a veth pair
+ip -net "$ns1" link add name veth1 type veth peer name veth2
+ip -net "$ns1" link set netns "$ns2" dev veth2
+
+ip -net "$ns1" link set up dev lo
+ip -net "$ns2" link set up dev lo
+ip -net "$ns1" link set up dev veth1
+ip -net "$ns2" link set up dev veth2
+
+ip -net "$ns2" addr add 10.11.11.2/24 dev veth2
+ip -net "$ns2" route add default via 10.11.11.1
+
+ip netns exec "$ns2" sysctl -q net.ipv4.conf.veth2.forwarding=1
+
+# add a rule inside NS so we enable conntrack
+ip netns exec "$ns1" nft -f - <<EOF
+table inet filter {
+	chain input {
+		type filter hook input priority 0; policy accept;
+		ct state established accept
+	}
+}
+EOF
+
+ip -net "$ns1" addr add 10.11.11.1/24 dev veth1
+ip -net "$ns1" route add 10.99.99.99 via 10.11.11.2
+
+# Check connectivity works
+ip netns exec "$ns1" ping -q -c 2 10.11.11.2 >/dev/null || exit 1
+
+ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT &
+
+ip netns exec "$ns2" nft -f - <<EOF
+table inet filter {
+	counter connreq { }
+	counter redir { }
+	chain input {
+		type filter hook input priority 0; policy accept;
+		ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
+		ct state new ct status dnat tcp dport 8080 counter name "redir" accept
+	}
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not load nft rules"
+	exit 1
+fi
+
+ip netns exec "$ns2" sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
+
+echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
+ip netns exec "$ns1" bash -c 'for i in $(seq 1 $BUSYWAIT_TIMEOUT) ; do
+	socat -u STDIN TCP:10.99.99.99:80 < /dev/null
+	sleep 0.1
+	done' &
+
+wait_for_attempt()
+{
+	count=$(ip netns exec "$ns2" conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
+	if [ "$count" -gt 0 ]; then
+		return 0
+	fi
+
+	return 1
+}
+
+# wait for conntrack to pick the new connection request up before loading
+# the nat redirect rule.
+if ! busywait "$BUSYWAIT_TIMEOUT" wait_for_attempt; then
+	echo "ERROR: $ns2 did not pick up tcp connection from peer"
+	exit 1
+fi
+
+ip netns exec "$ns2" nft -f - <<EOF
+table inet nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
+	}
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not load nat redirect"
+	exit 1
+fi
+
+wait_for_redirect()
+{
+	count=$(ip netns exec "$ns2" conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
+	if [ "$count" -gt 0 ]; then
+		return 0
+	fi
+
+	return 1
+}
+echo "INFO: NAT redirect added in ns $ns2, waiting for $BUSYWAIT_TIMEOUT ms for nat to take effect"
+
+busywait "$BUSYWAIT_TIMEOUT" wait_for_redirect
+ret=$?
+
+expect="packets 1 bytes 60"
+if ! check_counter "$ns2" "redir" "$expect"; then
+	ret=1
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: redirection counter has expected values"
+else
+	echo "ERROR: no tcp connection was redirected"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
index 8b5ea9234588..073e8e62d350 100755
--- a/tools/testing/selftests/netfilter/conntrack_vrf.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_vrf.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 # This script demonstrates interaction of conntrack and vrf.
 # The vrf driver calls the netfilter hooks again, with oif/iif
@@ -28,84 +28,67 @@
 # that was supposed to be fixed by the commit mentioned above to make sure
 # that any fix to test case 1 won't break masquerade again.
 
-ksft_skip=4
+source lib.sh
 
 IP0=172.30.30.1
 IP1=172.30.30.2
 PFXL=30
 ret=0
 
-sfx=$(mktemp -u "XXXXXXXX")
-ns0="ns0-$sfx"
-ns1="ns1-$sfx"
-
 cleanup()
 {
 	ip netns pids $ns0 | xargs kill 2>/dev/null
 	ip netns pids $ns1 | xargs kill 2>/dev/null
 
-	ip netns del $ns0 $ns1
+	cleanup_all_ns
 }
 
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-ip netns add "$ns0"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $ns0"
-	exit $ksft_skip
-fi
-ip netns add "$ns1"
+checktool "nft --version" "run test without nft"
+checktool "conntrack --version" "run test without conntrack"
+checktool "socat -h" "run test without socat"
 
 trap cleanup EXIT
 
-ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0
-ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
-ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+setup_ns ns0 ns1
+
+ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.default.rp_filter=0
+ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ip netns exec "$ns0" sysctl -q -w net.ipv4.conf.all.rp_filter=0
 
-ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1
-if [ $? -ne 0 ];then
+if ! ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1; then
 	echo "SKIP: Could not add veth device"
 	exit $ksft_skip
 fi
 
-ip -net $ns0 li add tvrf type vrf table 9876
-if [ $? -ne 0 ];then
+if ! ip -net "$ns0" li add tvrf type vrf table 9876; then
 	echo "SKIP: Could not add vrf device"
 	exit $ksft_skip
 fi
 
-ip -net $ns0 li set lo up
+ip -net "$ns0" li set veth0 master tvrf
+ip -net "$ns0" li set tvrf up
+ip -net "$ns0" li set veth0 up
+ip -net "$ns1" li set veth0 up
 
-ip -net $ns0 li set veth0 master tvrf
-ip -net $ns0 li set tvrf up
-ip -net $ns0 li set veth0 up
-ip -net $ns1 li set veth0 up
+ip -net "$ns0" addr add $IP0/$PFXL dev veth0
+ip -net "$ns1" addr add $IP1/$PFXL dev veth0
 
-ip -net $ns0 addr add $IP0/$PFXL dev veth0
-ip -net $ns1 addr add $IP1/$PFXL dev veth0
+listener_ready()
+{
+        local ns="$1"
 
-ip netns exec $ns1 iperf3 -s > /dev/null 2>&1&
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not start iperf3"
-	exit $ksft_skip
-fi
+        ss -N "$ns" -l -n -t -o "sport = :55555" | grep -q "55555"
+}
+
+ip netns exec "$ns1" socat -u -4 TCP-LISTEN:55555,reuseaddr,fork STDOUT > /dev/null &
+busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1"
 
 # test vrf ingress handling.
 # The incoming connection should be placed in conntrack zone 1,
 # as decided by the first iteration of the ruleset.
 test_ct_zone_in()
 {
-ip netns exec $ns0 nft -f - <<EOF
+ip netns exec "$ns0" nft -f - <<EOF
 table testct {
 	chain rawpre {
 		type filter hook prerouting priority raw;
@@ -126,21 +109,21 @@ table testct {
 	}
 }
 EOF
-	ip netns exec $ns1 ping -W 1 -c 1 -I veth0 $IP0 > /dev/null
+	ip netns exec "$ns1" ping -W 1 -c 1 -I veth0 "$IP0" > /dev/null
 
 	# should be in zone 1, not zone 2
-	count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
-	if [ $count -eq 1 ]; then
+	count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
+	if [ "$count" -eq 1 ]; then
 		echo "PASS: entry found in conntrack zone 1"
 	else
 		echo "FAIL: entry not found in conntrack zone 1"
-		count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
-		if [ $count -eq 1 ]; then
+		count=$(ip netns exec "$ns0" conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
+		if [ "$count" -eq 1 ]; then
 			echo "FAIL: entry found in zone 2 instead"
 		else
 			echo "FAIL: entry not in zone 1 or 2, dumping table"
-			ip netns exec $ns0 conntrack -L
-			ip netns exec $ns0 nft list ruleset
+			ip netns exec "$ns0" conntrack -L
+			ip netns exec "$ns0" nft list ruleset
 		fi
 	fi
 }
@@ -153,12 +136,12 @@ test_masquerade_vrf()
 	local qdisc=$1
 
 	if [ "$qdisc" != "default" ]; then
-		tc -net $ns0 qdisc add dev tvrf root $qdisc
+		tc -net "$ns0" qdisc add dev tvrf root "$qdisc"
 	fi
 
-	ip netns exec $ns0 conntrack -F 2>/dev/null
+	ip netns exec "$ns0" conntrack -F 2>/dev/null
 
-ip netns exec $ns0 nft -f - <<EOF
+ip netns exec "$ns0" nft -f - <<EOF
 flush ruleset
 table ip nat {
 	chain rawout {
@@ -179,25 +162,23 @@ table ip nat {
 	}
 }
 EOF
-	ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null
-	if [ $? -ne 0 ]; then
-		echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device"
+	if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then
+		echo "FAIL: connect failure with masquerade + sport rewrite on vrf device"
 		ret=1
 		return
 	fi
 
 	# must also check that nat table was evaluated on second (lower device) iteration.
-	ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' &&
-	ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]'
-	if [ $? -eq 0 ]; then
-		echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)"
+	if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1' &&
+	   ip netns exec "$ns0" nft list table ip nat |grep -q 'untracked counter packets [1-9]'; then
+		echo "PASS: connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)"
 	else
 		echo "FAIL: vrf rules have unexpected counter value"
 		ret=1
 	fi
 
 	if [ "$qdisc" != "default" ]; then
-		tc -net $ns0 qdisc del dev tvrf root
+		tc -net "$ns0" qdisc del dev tvrf root
 	fi
 }
 
@@ -206,8 +187,8 @@ EOF
 # oifname is the lower device (veth0 in this case).
 test_masquerade_veth()
 {
-	ip netns exec $ns0 conntrack -F 2>/dev/null
-ip netns exec $ns0 nft -f - <<EOF
+	ip netns exec "$ns0" conntrack -F 2>/dev/null
+ip netns exec "$ns0" nft -f - <<EOF
 flush ruleset
 table ip nat {
 	chain postrouting {
@@ -216,17 +197,15 @@ table ip nat {
 	}
 }
 EOF
-	ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null
-	if [ $? -ne 0 ]; then
-		echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device"
+	if ! ip netns exec "$ns0" ip vrf exec tvrf socat -u -4 STDIN TCP:"$IP1":55555 < /dev/null > /dev/null;then
+		echo "FAIL: connect failure with masquerade + sport rewrite on veth device"
 		ret=1
 		return
 	fi
 
 	# must also check that nat table was evaluated on second (lower device) iteration.
-	ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
-	if [ $? -eq 0 ]; then
-		echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device"
+	if ip netns exec "$ns0" nft list table ip nat |grep -q 'counter packets 1'; then
+		echo "PASS: connect with masquerade + sport rewrite on veth device"
 	else
 		echo "FAIL: vrf masq rule has unexpected counter value"
 		ret=1
diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh
new file mode 100755
index 000000000000..4ceee9fb3949
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/ipvs.sh
@@ -0,0 +1,211 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# End-to-end ipvs test suite
+# Topology:
+#--------------------------------------------------------------+
+#                      |                                       |
+#         ns0          |         ns1                           |
+#      -----------     |     -----------    -----------        |
+#      | veth01  | --------- | veth10  |    | veth12  |        |
+#      -----------    peer   -----------    -----------        |
+#           |          |                        |              |
+#      -----------     |                        |              |
+#      |  br0    |     |-----------------  peer |--------------|
+#      -----------     |                        |              |
+#           |          |                        |              |
+#      ----------     peer   ----------      -----------       |
+#      |  veth02 | --------- |  veth20 |     | veth21  |       |
+#      ----------      |     ----------      -----------       |
+#                      |         ns2                           |
+#                      |                                       |
+#--------------------------------------------------------------+
+#
+# We assume that all network driver are loaded
+#
+
+source lib.sh
+
+ret=0
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+readonly port=8080
+
+readonly vip_v4=207.175.44.110
+readonly cip_v4=10.0.0.2
+readonly gip_v4=10.0.0.1
+readonly dip_v4=172.16.0.1
+readonly rip_v4=172.16.0.2
+readonly sip_v4=10.0.0.3
+
+readonly infile="$(mktemp)"
+readonly outfile="$(mktemp)"
+readonly datalen=32
+
+sysipvsnet="/proc/sys/net/ipv4/vs/"
+if [ ! -d $sysipvsnet ]; then
+	if ! modprobe -q ip_vs; then
+		echo "skip: could not run test without ipvs module"
+		exit $ksft_skip
+	fi
+fi
+
+checktool "ipvsadm -v" "run test without ipvsadm"
+checktool "socat -h" "run test without socat"
+
+setup() {
+	setup_ns ns0 ns1 ns2
+
+	ip link add veth01 netns "${ns0}" type veth peer name veth10 netns "${ns1}"
+	ip link add veth02 netns "${ns0}" type veth peer name veth20 netns "${ns2}"
+	ip link add veth12 netns "${ns1}" type veth peer name veth21 netns "${ns2}"
+
+	ip netns exec "${ns0}" ip link set veth01 up
+	ip netns exec "${ns0}" ip link set veth02 up
+	ip netns exec "${ns0}" ip link add br0 type bridge
+	ip netns exec "${ns0}" ip link set veth01 master br0
+	ip netns exec "${ns0}" ip link set veth02 master br0
+	ip netns exec "${ns0}" ip link set br0 up
+	ip netns exec "${ns0}" ip addr add "${cip_v4}/24" dev br0
+
+	ip netns exec "${ns1}" ip link set veth10 up
+	ip netns exec "${ns1}" ip addr add "${gip_v4}/24" dev veth10
+	ip netns exec "${ns1}" ip link set veth12 up
+	ip netns exec "${ns1}" ip addr add "${dip_v4}/24" dev veth12
+
+	ip netns exec "${ns2}" ip link set veth21 up
+	ip netns exec "${ns2}" ip addr add "${rip_v4}/24" dev veth21
+	ip netns exec "${ns2}" ip link set veth20 up
+	ip netns exec "${ns2}" ip addr add "${sip_v4}/24" dev veth20
+
+	sleep 1
+
+	dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
+}
+
+cleanup() {
+	cleanup_all_ns
+
+	if [ -f "${outfile}" ]; then
+		rm "${outfile}"
+	fi
+	if [ -f "${infile}" ]; then
+		rm "${infile}"
+	fi
+}
+
+server_listen() {
+	ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" &
+	server_pid=$!
+	sleep 0.2
+}
+
+client_connect() {
+	ip netns exec "${ns0}" timeout 2 socat -u -4 STDIN TCP:"${vip_v4}":"${port}" < "${infile}"
+}
+
+verify_data() {
+	wait "${server_pid}"
+	cmp "$infile" "$outfile" 2>/dev/null
+}
+
+test_service() {
+	server_listen
+	client_connect
+	verify_data
+}
+
+
+test_dr() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	# avoid incorrect arp response
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2
+	# avoid reverse route lookup
+	ip netns exec "${ns2}" sysctl -qw  net.ipv4.conf.all.rp_filter=0
+	ip netns exec "${ns2}" sysctl -qw  net.ipv4.conf.veth21.rp_filter=0
+	ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1
+
+	test_service
+}
+
+test_nat() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=1
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -m -t "${vip_v4}:${port}" -r "${rip_v4}:${port}"
+	ip netns exec "${ns1}" ip addr add "${vip_v4}/32" dev lo:1
+
+	ip netns exec "${ns2}" ip link del veth20
+	ip netns exec "${ns2}" ip route add default via "${dip_v4}" dev veth21
+
+	test_service
+}
+
+test_tun() {
+	ip netns exec "${ns0}" ip route add "${vip_v4}" via "${gip_v4}" dev br0
+
+	ip netns exec "${ns1}" modprobe -q ipip
+	ip netns exec "${ns1}" ip link set tunl0 up
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.ip_forward=0
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.all.send_redirects=0
+	ip netns exec "${ns1}" sysctl -qw net.ipv4.conf.default.send_redirects=0
+	ip netns exec "${ns1}" ipvsadm -A -t "${vip_v4}:${port}" -s rr
+	ip netns exec "${ns1}" ipvsadm -a -i -t "${vip_v4}:${port}" -r ${rip_v4}:${port}
+	ip netns exec "${ns1}" ip addr add ${vip_v4}/32 dev lo:1
+
+	ip netns exec "${ns2}" modprobe -q ipip
+	ip netns exec "${ns2}" ip link set tunl0 up
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_ignore=1
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.arp_announce=2
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.tunl0.rp_filter=0
+	ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.veth21.rp_filter=0
+	ip netns exec "${ns2}" ip addr add "${vip_v4}/32" dev lo:1
+
+	test_service
+}
+
+run_tests() {
+	local errors=
+
+	echo "Testing DR mode..."
+	cleanup
+	setup
+	test_dr
+	errors=$(( $errors + $? ))
+
+	echo "Testing NAT mode..."
+	cleanup
+	setup
+	test_nat
+	errors=$(( $errors + $? ))
+
+	echo "Testing Tunnel mode..."
+	cleanup
+	setup
+	test_tun
+	errors=$(( $errors + $? ))
+
+	return $errors
+}
+
+trap cleanup EXIT
+
+run_tests
+
+if [ $? -ne 0 ]; then
+	echo -e "$(basename $0): ${RED}FAIL${NC}"
+	exit 1
+fi
+echo -e "$(basename $0): ${GREEN}PASS${NC}"
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/lib.sh b/tools/testing/selftests/net/netfilter/lib.sh
new file mode 100644
index 000000000000..bedd35298e15
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/lib.sh
@@ -0,0 +1,10 @@
+net_netfilter_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "$net_netfilter_dir/../lib.sh"
+
+checktool (){
+	if ! $1 > /dev/null 2>&1; then
+		echo "SKIP: Could not $2"
+		exit $ksft_skip
+	fi
+}
diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
new file mode 100755
index 000000000000..c6fdd2079f4d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+checktool "conntrack --version" "run test without conntrack"
+checktool "iptables --version" "run test without iptables"
+checktool "ip6tables --version" "run test without ip6tables"
+
+modprobe -q tun
+modprobe -q nf_conntrack
+# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns
+
+PDRILL_TIMEOUT=10
+
+files="
+conntrack_ack_loss_stall.pkt
+conntrack_inexact_rst.pkt
+conntrack_syn_challenge_ack.pkt
+conntrack_synack_old.pkt
+conntrack_synack_reuse.pkt
+conntrack_rst_invalid.pkt
+"
+
+if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then
+	echo "SKIP: packetdrill not installed"
+	exit ${ksft_skip}
+fi
+
+ret=0
+
+run_packetdrill()
+{
+	filename="$1"
+	ipver="$2"
+	local mtu=1500
+
+	export NFCT_IP_VERSION="$ipver"
+
+	if [ "$ipver" = "ipv4" ];then
+		export xtables="iptables"
+	elif [ "$ipver" = "ipv6" ];then
+		export xtables="ip6tables"
+		mtu=1520
+	fi
+
+	timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu=$mtu \
+		--tolerance_usecs=1000000 --non_fatal packet "$filename"
+}
+
+run_one_test_file()
+{
+	filename="$1"
+
+	for v in ipv4 ipv6;do
+		printf "%-50s(%s)%-20s" "$filename" "$v" ""
+		if run_packetdrill packetdrill/"$f" "$v";then
+			echo OK
+		else
+			echo FAIL
+			ret=1
+		fi
+	done
+}
+
+echo "Replaying packetdrill test cases:"
+for f in $files;do
+	run_one_test_file packetdrill/"$f"
+done
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh
new file mode 100755
index 000000000000..1014551dd769
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_nat_edemux.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test NAT source port clash resolution
+#
+
+source lib.sh
+ret=0
+socatpid=0
+
+cleanup()
+{
+	[ "$socatpid" -gt 0 ] && kill "$socatpid"
+
+	cleanup_all_ns
+}
+
+checktool "socat -h" "run test without socat"
+checktool "iptables --version" "run test without iptables"
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2
+
+# Connect the namespaces using a veth pair
+ip link add name veth2 type veth peer name veth1
+ip link set netns "$ns1" dev veth1
+ip link set netns "$ns2" dev veth2
+
+ip netns exec "$ns1" ip link set up dev lo
+ip netns exec "$ns1" ip link set up dev veth1
+ip netns exec "$ns1" ip addr add 192.168.1.1/24 dev veth1
+
+ip netns exec "$ns2" ip link set up dev lo
+ip netns exec "$ns2" ip link set up dev veth2
+ip netns exec "$ns2" ip addr add 192.168.1.2/24 dev veth2
+
+# Create a server in one namespace
+ip netns exec "$ns1" socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 &
+socatpid=$!
+
+# Restrict source port to just one so we don't have to exhaust
+# all others.
+ip netns exec "$ns2" sysctl -q net.ipv4.ip_local_port_range="10000 10000"
+
+# add a virtual IP using DNAT
+ip netns exec "$ns2" iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201
+
+# ... and route it to the other namespace
+ip netns exec "$ns2" ip route add 10.96.0.1 via 192.168.1.1
+
+# add a persistent connection from the other namespace
+ip netns exec "$ns2" socat -t 10 - TCP:192.168.1.1:5201 > /dev/null &
+
+sleep 1
+
+# ip daddr:dport will be rewritten to 192.168.1.1 5201
+# NAT must reallocate source port 10000 because
+# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use
+echo test | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null
+ret=$?
+
+# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201).
+if [ $ret -eq 0 ]; then
+	echo "PASS: socat can connect via NAT'd address"
+else
+	echo "FAIL: socat cannot connect via NAT'd address"
+fi
+
+# check sport clashres.
+ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201
+ip netns exec "$ns1" iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201
+
+sleep 5 | ip netns exec "$ns2" socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null &
+
+# if connect succeeds, client closes instantly due to EOF on stdin.
+# if connect hangs, it will time out after 5s.
+echo | ip netns exec "$ns2" socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null &
+cpid2=$!
+
+time_then=$(date +%s)
+wait $cpid2
+rv=$?
+time_now=$(date +%s)
+
+# Check how much time has elapsed, expectation is for
+# 'cpid2' to connect and then exit (and no connect delay).
+delta=$((time_now - time_then))
+
+if [ $delta -lt 2 ] && [ $rv -eq 0 ]; then
+	echo "PASS: could connect to service via redirected ports"
+else
+	echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)"
+	ret=1
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/net/netfilter/nf_queue.c
index 9e56b9d47037..9e56b9d47037 100644
--- a/tools/testing/selftests/netfilter/nf-queue.c
+++ b/tools/testing/selftests/net/netfilter/nf_queue.c
diff --git a/tools/testing/selftests/netfilter/nft_audit.sh b/tools/testing/selftests/net/netfilter/nft_audit.sh
index 99ed5bd6e840..902f8114bc80 100755
--- a/tools/testing/selftests/netfilter/nft_audit.sh
+++ b/tools/testing/selftests/net/netfilter/nft_audit.sh
@@ -6,11 +6,34 @@
 SKIP_RC=4
 RC=0
 
+if [ -r /var/run/auditd.pid ];then
+	read pid < /var/run/auditd.pid
+	p=$(pgrep ^auditd$)
+
+	if [ "$pid" -eq "$p" ]; then
+		echo "SKIP: auditd is running"
+		exit $SKIP_RC
+	fi
+fi
+
 nft --version >/dev/null 2>&1 || {
 	echo "SKIP: missing nft tool"
 	exit $SKIP_RC
 }
 
+# nft must be recent enough to support "reset" keyword.
+nft --check -f /dev/stdin >/dev/null 2>&1 <<EOF
+add table t
+add chain t c
+reset rules t c
+EOF
+
+if [ "$?" -ne 0 ];then
+	echo -n "SKIP: nft reset feature test failed: "
+	nft --version
+	exit $SKIP_RC
+fi
+
 # Run everything in a separate network namespace
 [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
 
@@ -73,7 +96,7 @@ done
 
 for ((i = 0; i < 500; i++)); do
 	echo "add rule t2 c3 counter accept comment \"rule $i\""
-done >$rulefile
+done > "$rulefile"
 do_test "nft -f $rulefile" \
 'table=t2 family=2 entries=500 op=nft_register_rule'
 
@@ -101,7 +124,7 @@ do_test 'nft add counter t2 c1; add counter t2 c2' \
 
 for ((i = 3; i <= 500; i++)); do
 	echo "add counter t2 c$i"
-done >$rulefile
+done > "$rulefile"
 do_test "nft -f $rulefile" \
 'table=t2 family=2 entries=498 op=nft_register_obj'
 
@@ -115,7 +138,7 @@ do_test 'nft add quota t2 q1 { 10 bytes }; add quota t2 q2 { 10 bytes }' \
 
 for ((i = 3; i <= 500; i++)); do
 	echo "add quota t2 q$i { 10 bytes }"
-done >$rulefile
+done > "$rulefile"
 do_test "nft -f $rulefile" \
 'table=t2 family=2 entries=498 op=nft_register_obj'
 
@@ -157,7 +180,7 @@ table=t2 family=2 entries=135 op=nft_reset_rule'
 
 # resetting sets and elements
 
-elem=(22 ,80 ,443)
+elem=(22 ",80" ",443")
 relem=""
 for i in {1..3}; do
 	relem+="${elem[((i - 1))]}"
diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
index e908009576c7..6d66240e149c 100755
--- a/tools/testing/selftests/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 #
 # nft_concat_range.sh - Tests for sets with concatenation of ranged fields
@@ -7,10 +7,10 @@
 #
 # Author: Stefano Brivio <sbrivio@redhat.com>
 #
-# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031
+# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031,SC2317
 # ^ Configuration and templates sourced with eval, counters reused in subshells
 
-KSELFTEST_SKIP=4
+source lib.sh
 
 # Available test groups:
 # - reported_issues: check for issues that were reported in the past
@@ -19,7 +19,7 @@ KSELFTEST_SKIP=4
 # - timeout: check that packets match entries until they expire
 # - performance: estimate matching rate, compare with rbtree and hash baselines
 TESTS="reported_issues correctness concurrency timeout"
-[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance"
+[ -n "$NFT_CONCAT_RANGE_TESTS" ] && TESTS="${NFT_CONCAT_RANGE_TESTS}"
 
 # Set types, defined by TYPE_ variables below
 TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto
@@ -31,7 +31,7 @@ BUGS="flush_remove_add reload"
 
 # List of possible paths to pktgen script from kernel tree for performance tests
 PKTGEN_SCRIPT_PATHS="
-	../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
+	../../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
 	pktgen/pktgen_bench_xmit_mode_netif_receive.sh"
 
 # Definition of set types:
@@ -66,7 +66,7 @@ src
 start		1
 count		5
 src_delta	2000
-tools		sendip nc bash
+tools		sendip bash
 proto		udp
 
 race_repeat	3
@@ -91,7 +91,7 @@ src
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	3
@@ -116,7 +116,7 @@ src
 start		10
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp6
 
 race_repeat	3
@@ -141,7 +141,7 @@ src
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	0
@@ -163,7 +163,7 @@ src		mac
 start		10
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp6
 
 race_repeat	0
@@ -185,7 +185,7 @@ src		mac proto
 start		10
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp6
 
 race_repeat	0
@@ -207,7 +207,7 @@ src		addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	3
@@ -227,7 +227,7 @@ src		addr6 port
 start		10
 count		5
 src_delta	2000
-tools		sendip socat nc
+tools		sendip socat
 proto		udp6
 
 race_repeat	3
@@ -247,7 +247,7 @@ src		mac proto addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	0
@@ -264,7 +264,7 @@ src		mac
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	0
@@ -286,7 +286,7 @@ src		mac addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	0
@@ -337,7 +337,7 @@ src		addr4
 start		1
 count		5
 src_delta	2000
-tools		sendip socat nc
+tools		sendip socat
 proto		udp
 
 race_repeat	3
@@ -363,7 +363,7 @@ src		mac
 start		1
 count		1
 src_delta	2000
-tools		sendip socat nc bash
+tools		sendip socat bash
 proto		udp
 
 race_repeat	0
@@ -473,8 +473,6 @@ setup_veth() {
 	B() {
 		ip netns exec B "$@" >/dev/null 2>&1
 	}
-
-	sleep 2
 }
 
 # Fill in set template and initialise set
@@ -488,12 +486,6 @@ check_tools() {
 
 	__tools=
 	for tool in ${tools}; do
-		if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \
-		   ! nc -u -w0 1.1.1.1 1 2>/dev/null; then
-			# Some GNU netcat builds might not support IPv6
-			__tools="${__tools} netcat-openbsd"
-			continue
-		fi
 		__tools="${__tools} ${tool}"
 
 		command -v "${tool}" >/dev/null && return 0
@@ -554,30 +546,7 @@ setup_send_udp() {
 			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
 			[ -z "${dst_port}" ] && dst_port=12345
 
-			echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:${dst_addr4}:${dst_port}"${__socatbind}"
-
-			src_addr4=
-			src_port=
-		}
-	elif command -v nc >/dev/null; then
-		if nc -u -w0 1.1.1.1 1 2>/dev/null; then
-			# OpenBSD netcat
-			nc_opt="-w0"
-		else
-			# GNU netcat
-			nc_opt="-q0"
-		fi
-
-		send_udp() {
-			if [ -n "${src_addr4}" ]; then
-				B ip addr add "${src_addr4}" dev veth_b
-				__src_addr4="-s ${src_addr4}"
-			fi
-			ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
-			[ -n "${src_port}" ] && src_port="-p ${src_port}"
-
-			echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \
-				  "${src_port}" "${dst_addr4}" "${dst_port}"
+			echo "test4" | B socat -t 0.01 STDIN UDP4-DATAGRAM:"$dst_addr4":"$dst_port""${__socatbind}"
 
 			src_addr4=
 			src_port=
@@ -632,11 +601,7 @@ setup_send_udp6() {
 			__socatbind6=
 
 			if [ -n "${src_addr6}" ]; then
-				if [ -n "${src_addr6} != "${src_addr6_added} ]; then
-					B ip addr add "${src_addr6}" dev veth_b nodad
-
-					src_addr6_added=${src_addr6}
-				fi
+				B ip addr add "${src_addr6}" dev veth_b nodad
 
 				__socatbind6=",bind=[${src_addr6}]"
 
@@ -645,26 +610,7 @@ setup_send_udp6() {
 				fi
 			fi
 
-			echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:[${dst_addr6}]:${dst_port}"${__socatbind6}"
-		}
-	elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then
-		# GNU netcat might not work with IPv6, try next tool
-		send_udp6() {
-			ip -6 addr add "${dst_addr6}" dev veth_a nodad \
-				2>/dev/null
-			if [ -n "${src_addr6}" ]; then
-				B ip addr add "${src_addr6}" dev veth_b nodad
-			else
-				src_addr6="2001:db8::2"
-			fi
-			[ -n "${src_port}" ] && src_port="-p ${src_port}"
-
-			# shellcheck disable=SC2086 # this needs split options
-			echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \
-					       ${dst_addr6} ${dst_port}
-
-			src_addr6=
-			src_port=
+			echo "test6" | B socat -t 0.01 STDIN UDP6-DATAGRAM:["$dst_addr6"]:"$dst_port""${__socatbind6}"
 		}
 	elif [ -z "$(bash -c 'type -p')" ]; then
 		send_udp6() {
@@ -679,10 +625,17 @@ setup_send_udp6() {
 	fi
 }
 
+listener_ready()
+{
+	port="$1"
+	ss -lnt -o "sport = :$port" | grep -q "$port"
+}
+
 # Set up function to send TCP traffic on IPv4
 setup_flood_tcp() {
 	if command -v iperf3 >/dev/null; then
 		flood_tcp() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr4}" ]; then
 				B ip addr add "${src_addr4}/16" dev veth_b
@@ -699,7 +652,7 @@ setup_flood_tcp() {
 
 			# shellcheck disable=SC2086 # this needs split options
 			iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \
@@ -711,6 +664,7 @@ setup_flood_tcp() {
 		}
 	elif command -v iperf >/dev/null; then
 		flood_tcp() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr4}" ]; then
 				B ip addr add "${src_addr4}/16" dev veth_b
@@ -727,7 +681,7 @@ setup_flood_tcp() {
 
 			# shellcheck disable=SC2086 # this needs split options
 			iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \
@@ -739,6 +693,7 @@ setup_flood_tcp() {
 		}
 	elif command -v netperf >/dev/null; then
 		flood_tcp() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr4}" ]; then
 				B ip addr add "${src_addr4}/16" dev veth_b
@@ -755,7 +710,7 @@ setup_flood_tcp() {
 			# shellcheck disable=SC2086 # this needs split options
 			netserver -4 ${dst_port} -L "${dst_addr4}" \
 				>/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B netperf -4 -H "${dst_addr4}" ${dst_port} \
@@ -774,6 +729,7 @@ setup_flood_tcp() {
 setup_flood_tcp6() {
 	if command -v iperf3 >/dev/null; then
 		flood_tcp6() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr6}" ]; then
 				B ip addr add "${src_addr6}" dev veth_b nodad
@@ -790,7 +746,7 @@ setup_flood_tcp6() {
 
 			# shellcheck disable=SC2086 # this needs split options
 			iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "${n_port}"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B iperf3 -c "${dst_addr6}" ${dst_port} \
@@ -802,6 +758,7 @@ setup_flood_tcp6() {
 		}
 	elif command -v iperf >/dev/null; then
 		flood_tcp6() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr6}" ]; then
 				B ip addr add "${src_addr6}" dev veth_b nodad
@@ -818,7 +775,7 @@ setup_flood_tcp6() {
 
 			# shellcheck disable=SC2086 # this needs split options
 			iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B iperf -c "${dst_addr6}" -V ${dst_port} \
@@ -830,6 +787,7 @@ setup_flood_tcp6() {
 		}
 	elif command -v netperf >/dev/null; then
 		flood_tcp6() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr6}" ]; then
 				B ip addr add "${src_addr6}" dev veth_b nodad
@@ -846,7 +804,7 @@ setup_flood_tcp6() {
 			# shellcheck disable=SC2086 # this needs split options
 			netserver -6 ${dst_port} -L "${dst_addr6}" \
 				>/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B netperf -6 -H "${dst_addr6}" ${dst_port} \
@@ -865,6 +823,7 @@ setup_flood_tcp6() {
 setup_flood_udp() {
 	if command -v iperf3 >/dev/null; then
 		flood_udp() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr4}" ]; then
 				B ip addr add "${src_addr4}/16" dev veth_b
@@ -881,7 +840,7 @@ setup_flood_udp() {
 
 			# shellcheck disable=SC2086 # this needs split options
 			iperf3 -s -DB "${dst_addr4}" ${dst_port}
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \
@@ -893,6 +852,7 @@ setup_flood_udp() {
 		}
 	elif command -v iperf >/dev/null; then
 		flood_udp() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr4}" ]; then
 				B ip addr add "${src_addr4}/16" dev veth_b
@@ -909,7 +869,7 @@ setup_flood_udp() {
 
 			# shellcheck disable=SC2086 # this needs split options
 			iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \
@@ -921,6 +881,7 @@ setup_flood_udp() {
 		}
 	elif command -v netperf >/dev/null; then
 		flood_udp() {
+			local n_port="${dst_port}"
 			[ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
 			if [ -n "${src_addr4}" ]; then
 				B ip addr add "${src_addr4}/16" dev veth_b
@@ -937,7 +898,7 @@ setup_flood_udp() {
 			# shellcheck disable=SC2086 # this needs split options
 			netserver -4 ${dst_port} -L "${dst_addr4}" \
 				>/dev/null 2>&1
-			sleep 2
+			busywait "$BUSYWAIT_TIMEOUT" listener_ready "$n_port"
 
 			# shellcheck disable=SC2086 # this needs split options
 			B netperf -4 -H "${dst_addr4}" ${dst_port} \
@@ -982,6 +943,7 @@ cleanup() {
 	ip link del dummy0			2>/dev/null
 	ip route del default			2>/dev/null
 	ip -6 route del default			2>/dev/null
+	ip netns pids B				2>/dev/null | xargs kill 2>/dev/null
 	ip netns del B				2>/dev/null
 	ip link del veth_a			2>/dev/null
 	timeout=
@@ -989,15 +951,18 @@ cleanup() {
 	killall iperf				2>/dev/null
 	killall netperf				2>/dev/null
 	killall netserver			2>/dev/null
-	rm -f ${tmp}
-	sleep 2
+}
+
+cleanup_exit() {
+	cleanup
+	rm -f "$tmp"
 }
 
 # Entry point for setup functions
 setup() {
 	if [ "$(id -u)" -ne 0 ]; then
 		echo "  need to run as root"
-		exit ${KSELFTEST_SKIP}
+		exit ${ksft_skip}
 	fi
 
 	cleanup
@@ -1258,7 +1223,7 @@ send_nomatch() {
 # - check that packets outside range don't match it
 # - remove some elements, check that packets don't match anymore
 test_correctness() {
-	setup veth send_"${proto}" set || return ${KSELFTEST_SKIP}
+	setup veth send_"${proto}" set || return ${ksft_skip}
 
 	range_size=1
 	for i in $(seq "${start}" $((start + count))); do
@@ -1273,7 +1238,7 @@ test_correctness() {
 		srcend=$((end + src_delta))
 
 		add "$(format)" || return 1
-		for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
 			send_match "${j}" $((j + src_delta)) || return 1
 		done
 		send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1
@@ -1281,7 +1246,7 @@ test_correctness() {
 		# Delete elements now and then
 		if [ $((i % 3)) -eq 0 ]; then
 			del "$(format)" || return 1
-			for j in $(seq ${start} \
+			for j in $(seq "$start" \
 				   $((range_size / 2 + 1)) ${end}); do
 				send_nomatch "${j}" $((j + src_delta)) \
 					|| return 1
@@ -1307,12 +1272,12 @@ test_concurrency() {
 	proto=${flood_proto}
 	tools=${flood_tools}
 	chain_spec=${flood_spec}
-	setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP}
+	setup veth flood_"${proto}" set || return ${ksft_skip}
 
 	range_size=1
 	cstart=${start}
 	flood_pids=
-	for i in $(seq ${start} $((start + count))); do
+	for i in $(seq "$start" $((start + count))); do
 		end=$((start + range_size))
 		srcstart=$((start + src_delta))
 		srcend=$((end + src_delta))
@@ -1325,7 +1290,7 @@ test_concurrency() {
 		start=$((end + range_size))
 	done
 
-	sleep 10
+	sleep $((RANDOM%10))
 
 	pids=
 	for c in $(seq 1 "$(nproc)"); do (
@@ -1335,7 +1300,7 @@ test_concurrency() {
 			# $start needs to be local to this subshell
 			# shellcheck disable=SC2030
 			start=${cstart}
-			for i in $(seq ${start} $((start + count))); do
+			for i in $(seq "$start" $((start + count))); do
 				end=$((start + range_size))
 				srcstart=$((start + src_delta))
 				srcend=$((end + src_delta))
@@ -1350,7 +1315,7 @@ test_concurrency() {
 
 			range_size=1
 			start=${cstart}
-			for i in $(seq ${start} $((start + count))); do
+			for i in $(seq "$start" $((start + count))); do
 				end=$((start + range_size))
 				srcstart=$((start + src_delta))
 				srcend=$((end + src_delta))
@@ -1366,7 +1331,7 @@ test_concurrency() {
 
 			range_size=1
 			start=${cstart}
-			for i in $(seq ${start} $((start + count))); do
+			for i in $(seq "$start" $((start + count))); do
 				end=$((start + range_size))
 				srcstart=$((start + src_delta))
 				srcend=$((end + src_delta))
@@ -1379,7 +1344,7 @@ test_concurrency() {
 
 			range_size=1
 			start=${cstart}
-			for i in $(seq ${start} $((start + count))); do
+			for i in $(seq "$start" $((start + count))); do
 				end=$((start + range_size))
 				srcstart=$((start + src_delta))
 				srcend=$((end + src_delta))
@@ -1407,31 +1372,34 @@ test_concurrency() {
 # - add all the elements with 3s timeout while checking that packets match
 # - wait 3s after the last insertion, check that packets don't match any entry
 test_timeout() {
-	setup veth send_"${proto}" set || return ${KSELFTEST_SKIP}
+	setup veth send_"${proto}" set || return ${ksft_skip}
 
 	timeout=3
+
+	[ "$KSFT_MACHINE_SLOW" = "yes" ] && timeout=8
+
 	range_size=1
-	for i in $(seq "${start}" $((start + count))); do
+	for i in $(seq "$start" $((start + count))); do
 		end=$((start + range_size))
 		srcstart=$((start + src_delta))
 		srcend=$((end + src_delta))
 
 		add "$(format)" || return 1
 
-		for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
 			send_match "${j}" $((j + src_delta)) || return 1
 		done
 
 		range_size=$((range_size + 1))
 		start=$((end + range_size))
 	done
-	sleep 3
-	for i in $(seq ${start} $((start + count))); do
+	sleep $timeout
+	for i in $(seq "$start" $((start + count))); do
 		end=$((start + range_size))
 		srcstart=$((start + src_delta))
 		srcend=$((end + src_delta))
 
-		for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
 			send_nomatch "${j}" $((j + src_delta)) || return 1
 		done
 
@@ -1450,13 +1418,13 @@ test_performance() {
 	chain_spec=${perf_spec}
 	dst="${perf_dst}"
 	src="${perf_src}"
-	setup veth perf set || return ${KSELFTEST_SKIP}
+	setup veth perf set || return ${ksft_skip}
 
 	first=${start}
 	range_size=1
 	for set in test norange noconcat; do
 		start=${first}
-		for i in $(seq ${start} $((start + perf_entries))); do
+		for i in $(seq "$start" $((start + perf_entries))); do
 			end=$((start + range_size))
 			srcstart=$((start + src_delta))
 			srcend=$((end + src_delta))
@@ -1464,7 +1432,7 @@ test_performance() {
 			if [ $((end / 65534)) -gt $((start / 65534)) ]; then
 				start=${end}
 				end=$((end + 1))
-			elif [ ${start} -eq ${end} ]; then
+			elif [ "$start" -eq "$end" ]; then
 				end=$((start + 1))
 			fi
 
@@ -1475,7 +1443,7 @@ test_performance() {
 		nft -f "${tmp}"
 	done
 
-	perf $((end - 1)) ${srcstart}
+	perf $((end - 1)) "$srcstart"
 
 	sleep 2
 
@@ -1519,14 +1487,17 @@ test_performance() {
 }
 
 test_bug_flush_remove_add() {
+	rounds=100
+	[ "$KSFT_MACHINE_SLOW" = "yes" ] && rounds=10
+
 	set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }'
 	elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }'
 	elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }'
-	for i in `seq 1 100`; do
-		nft add table t ${set_cmd}	|| return ${KSELFTEST_SKIP}
-		nft add element t s ${elem1}	2>/dev/null || return 1
+	for i in $(seq 1 $rounds); do
+		nft add table t "$set_cmd"	|| return ${ksft_skip}
+		nft add element t s "$elem1"	2>/dev/null || return 1
 		nft flush set t s		2>/dev/null || return 1
-		nft add element t s ${elem2}	2>/dev/null || return 1
+		nft add element t s "$elem2"	2>/dev/null || return 1
 	done
 	nft flush ruleset
 }
@@ -1534,7 +1505,7 @@ test_bug_flush_remove_add() {
 # - add ranged element, check that packets match it
 # - reload the set, check packets still match
 test_bug_reload() {
-	setup veth send_"${proto}" set || return ${KSELFTEST_SKIP}
+	setup veth send_"${proto}" set || return ${ksft_skip}
 	rstart=${start}
 
 	range_size=1
@@ -1573,7 +1544,7 @@ test_bug_reload() {
 		srcstart=$((start + src_delta))
 		srcend=$((end + src_delta))
 
-		for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+		for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
 			send_match "${j}" $((j + src_delta)) || return 1
 		done
 
@@ -1591,12 +1562,12 @@ test_reported_issues() {
 # Run everything in a separate network namespace
 [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
 tmp="$(mktemp)"
-trap cleanup EXIT
+trap cleanup_exit EXIT
 
 # Entry point for test runs
 passed=0
 for name in ${TESTS}; do
-	printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')"
+	printf "TEST: %s\n" "$(echo "$name" | tr '_' ' ')"
 	if [ "${name}" = "reported_issues" ]; then
 		SUBTESTS="${BUGS}"
 	else
@@ -1623,10 +1594,16 @@ for name in ${TESTS}; do
 			continue
 		fi
 
-		printf "  %-60s  " "${display}"
+		[ "$KSFT_MACHINE_SLOW" = "yes" ] && count=1
+
+		printf "  %-32s  " "${display}"
+		tthen=$(date +%s)
 		eval test_"${name}"
 		ret=$?
 
+		tnow=$(date +%s)
+		printf "%5ds%-30s" $((tnow-tthen))
+
 		if [ $ret -eq 0 ]; then
 			printf "[ OK ]\n"
 			info_flush
@@ -1635,11 +1612,11 @@ for name in ${TESTS}; do
 			printf "[FAIL]\n"
 			err_flush
 			exit 1
-		elif [ $ret -eq ${KSELFTEST_SKIP} ]; then
+		elif [ $ret -eq ${ksft_skip} ]; then
 			printf "[SKIP]\n"
 			err_flush
 		fi
 	done
 done
 
-[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0
+[ ${passed} -eq 0 ] && exit ${ksft_skip} || exit 0
diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh
new file mode 100755
index 000000000000..5d276995a5c5
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range_perf.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+
+source lib.sh
+
+[ "$KSFT_MACHINE_SLOW" = yes ] && exit ${ksft_skip}
+
+NFT_CONCAT_RANGE_TESTS="performance" exec ./nft_concat_range.sh
diff --git a/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh
new file mode 100755
index 000000000000..abcaa7337197
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_conntrack_helper.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+#
+# This tests connection tracking helper assignment:
+# 1. can attach ftp helper to a connection from nft ruleset.
+# 2. auto-assign still works.
+#
+# Kselftest framework requirement - SKIP code is 4.
+
+source lib.sh
+
+ret=0
+
+testipv6=1
+
+checktool "socat -h" "run test without socat"
+checktool "conntrack --version" "run test without conntrack"
+checktool "nft --version" "run test without nft"
+
+cleanup()
+{
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+
+	ip netns del "$ns1"
+	ip netns del "$ns2"
+}
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2
+
+if ! ip link add veth0 netns "$ns1" type veth peer name veth0 netns "$ns2" > /dev/null 2>&1;then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+
+ip -net "$ns1" link set veth0 up
+ip -net "$ns2" link set veth0 up
+
+ip -net "$ns1" addr add 10.0.1.1/24 dev veth0
+ip -net "$ns1" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$ns2" addr add 10.0.1.2/24 dev veth0
+ip -net "$ns2" addr add dead:1::2/64 dev veth0 nodad
+
+load_ruleset_family() {
+	local family=$1
+	local ns=$2
+
+ip netns exec "$ns" nft -f - <<EOF
+table $family raw {
+	ct helper ftp {
+             type "ftp" protocol tcp
+        }
+	chain pre {
+		type filter hook prerouting priority 0; policy accept;
+		tcp dport 2121 ct helper set "ftp"
+	}
+	chain output {
+		type filter hook output priority 0; policy accept;
+		tcp dport 2121 ct helper set "ftp"
+	}
+}
+EOF
+	return $?
+}
+
+check_for_helper()
+{
+	local netns=$1
+	local message=$2
+	local port=$3
+
+	if echo "$message" |grep -q 'ipv6';then
+		local family="ipv6"
+	else
+		local family="ipv4"
+	fi
+
+	if ! ip netns exec "$netns" conntrack -L -f $family -p tcp --dport "$port" 2> /dev/null |grep -q 'helper=ftp';then
+		if [ "$autoassign" -eq 0 ] ;then
+			echo "FAIL: ${netns} did not show attached helper $message" 1>&2
+			ret=1
+		else
+			echo "PASS: ${netns} did not show attached helper $message" 1>&2
+		fi
+	else
+		if [ "$autoassign" -eq 0 ] ;then
+			echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2
+		else
+			echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2
+			ret=1
+		fi
+	fi
+
+	return 0
+}
+
+listener_ready()
+{
+	ns="$1"
+	port="$2"
+	proto="$3"
+	ss -N "$ns" -lnt -o "sport = :$port" | grep -q "$port"
+}
+
+test_helper()
+{
+	local port=$1
+	local autoassign=$2
+
+	if [ "$autoassign" -eq 0 ] ;then
+		msg="set via ruleset"
+	else
+		msg="auto-assign"
+	fi
+
+	ip netns exec "$ns2" socat -t 3 -u -4 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null &
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" "$port" "-4"
+
+	ip netns exec "$ns1" socat -u -4 STDIN TCP:10.0.1.2:"$port" < /dev/null > /dev/null
+
+	check_for_helper "$ns1" "ip $msg" "$port" "$autoassign"
+	check_for_helper "$ns2" "ip $msg" "$port" "$autoassign"
+
+	if [ $testipv6 -eq 0 ] ;then
+		return 0
+	fi
+
+	ip netns exec "$ns1" conntrack -F 2> /dev/null
+	ip netns exec "$ns2" conntrack -F 2> /dev/null
+
+	ip netns exec "$ns2" socat -t 3 -u -6 TCP-LISTEN:"$port",reuseaddr STDOUT > /dev/null &
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" "$port" "-6"
+
+	ip netns exec "$ns1" socat -t 3 -u -6 STDIN TCP:"[dead:1::2]":"$port" < /dev/null > /dev/null
+
+	check_for_helper "$ns1" "ipv6 $msg" "$port"
+	check_for_helper "$ns2" "ipv6 $msg" "$port"
+}
+
+if ! load_ruleset_family ip "$ns1"; then
+	echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2
+	exit 1
+fi
+
+if ! load_ruleset_family ip6 "$ns1"; then
+	echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2
+	testipv6=0
+fi
+
+if ! load_ruleset_family inet "${ns2}"; then
+	echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2
+	if ! load_ruleset_family ip "${ns2}"; then
+		echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2
+		exit 1
+	fi
+
+	if [ "$testipv6" -eq 1 ] ;then
+		if ! load_ruleset_family ip6 "$ns2"; then
+			echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2
+			exit 1
+		fi
+	fi
+fi
+
+test_helper 2121 0
+ip netns exec "$ns1" sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
+ip netns exec "$ns2" sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
+test_helper 21 1
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_fib.sh b/tools/testing/selftests/net/netfilter/nft_fib.sh
new file mode 100755
index 000000000000..ce1451c275fd
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_fib.sh
@@ -0,0 +1,234 @@
+#!/bin/bash
+#
+# This tests the fib expression.
+#
+# Kselftest framework requirement - SKIP code is 4.
+
+source lib.sh
+
+ret=0
+
+timeout=4
+
+log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
+
+cleanup()
+{
+	cleanup_all_ns
+
+	[ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
+}
+
+checktool "nft --version" "run test without nft"
+
+setup_ns nsrouter ns1 ns2
+
+trap cleanup EXIT
+
+if dmesg | grep -q ' nft_rpfilter: ';then
+	dmesg -c | grep ' nft_rpfilter: '
+	echo "WARN: a previous test run has failed" 1>&2
+fi
+
+sysctl -q net.netfilter.nf_log_all_netns=1
+
+load_ruleset() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain prerouting {
+		type filter hook prerouting priority 0; policy accept;
+	        fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop
+	}
+}
+EOF
+}
+
+load_pbr_ruleset() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain forward {
+		type filter hook forward priority raw;
+		fib saddr . iif oif gt 0 accept
+		log drop
+	}
+}
+EOF
+}
+
+load_ruleset_count() {
+	local netns=$1
+
+ip netns exec "$netns" nft -f /dev/stdin <<EOF
+table inet filter {
+	chain prerouting {
+		type filter hook prerouting priority 0; policy accept;
+		ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop
+		ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop
+	}
+}
+EOF
+}
+
+check_drops() {
+	if dmesg | grep -q ' nft_rpfilter: ';then
+		dmesg | grep ' nft_rpfilter: '
+		echo "FAIL: rpfilter did drop packets"
+		return 1
+	fi
+
+	return 0
+}
+
+check_fib_counter() {
+	local want=$1
+	local ns=$2
+	local address=$3
+
+	if ! ip netns exec "$ns" nft list table inet filter | grep 'fib saddr . iif' | grep "$address" | grep -q "packets $want";then
+		echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2
+		ip netns exec "$ns" nft list table inet filter
+		return 1
+	fi
+
+	if [ "$want" -gt 0 ]; then
+		echo "PASS: fib expression did drop packets for $address"
+	fi
+
+	return 0
+}
+
+load_ruleset "$nsrouter"
+load_ruleset "$ns1"
+load_ruleset "$ns2"
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+test_ping() {
+  local daddr4=$1
+  local daddr6=$2
+
+  if ! ip netns exec "$ns1" ping -c 1 -q "$daddr4" > /dev/null; then
+	check_drops
+	echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q "$daddr6" > /dev/null; then
+	check_drops
+	echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2
+	return 1
+  fi
+
+  return 0
+}
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null
+
+test_ping 10.0.2.1 dead:2::1 || exit 1
+check_drops || exit 1
+
+test_ping 10.0.2.99 dead:2::99 || exit 1
+check_drops || exit 1
+
+echo "PASS: fib expression did not cause unwanted packet drops"
+
+ip netns exec "$nsrouter" nft flush table inet filter
+
+ip -net "$ns1" route del default
+ip -net "$ns1" -6 route del default
+
+ip -net "$ns1" addr del 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr del dead:1::99/64 dev eth0
+
+ip -net "$ns1" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns1" addr add dead:2::99/64 dev eth0 nodad
+
+ip -net "$ns1" route add default via 10.0.2.1
+ip -net "$ns1" -6 route add default via dead:2::1
+
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth0 nodad
+
+# switch to ruleset that doesn't log, this time
+# its expected that this does drop the packets.
+load_ruleset_count "$nsrouter"
+
+# ns1 has a default route, but nsrouter does not.
+# must not check return value, ping to 1.1.1.1 will
+# fail.
+check_fib_counter 0 "$nsrouter" 1.1.1.1 || exit 1
+check_fib_counter 0 "$nsrouter" 1c3::c01d || exit 1
+
+ip netns exec "$ns1" ping -W 0.5 -c 1 -q 1.1.1.1 > /dev/null
+check_fib_counter 1 "$nsrouter" 1.1.1.1 || exit 1
+
+ip netns exec "$ns1" ping -W 0.5 -i 0.1 -c 3 -q 1c3::c01d > /dev/null
+check_fib_counter 3 "$nsrouter" 1c3::c01d || exit 1
+
+# delete all rules
+ip netns exec "$ns1" nft flush ruleset
+ip netns exec "$ns2" nft flush ruleset
+ip netns exec "$nsrouter" nft flush ruleset
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+
+ip -net "$ns1" addr del 10.0.2.99/24 dev eth0
+ip -net "$ns1" addr del dead:2::99/64 dev eth0
+
+ip -net "$nsrouter" addr del dead:2::1/64 dev veth0
+
+# ... pbr ruleset for the router, check iif+oif.
+if ! load_pbr_ruleset "$nsrouter";then
+	echo "SKIP: Could not load fib forward ruleset"
+	exit $ksft_skip
+fi
+
+ip -net "$nsrouter" rule add from all table 128
+ip -net "$nsrouter" rule add from all iif veth0 table 129
+ip -net "$nsrouter" route add table 128 to 10.0.1.0/24 dev veth0
+ip -net "$nsrouter" route add table 129 to 10.0.2.0/24 dev veth1
+
+# drop main ipv4 table
+ip -net "$nsrouter" -4 rule delete table main
+
+if ! test_ping 10.0.2.99 dead:2::99;then
+	ip -net "$nsrouter" nft list ruleset
+	echo "FAIL: fib mismatch in pbr setup"
+	exit 1
+fi
+
+echo "PASS: fib expression forward check with policy based routing"
+exit 0
diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index a32f490f7539..b3995550856a 100755
--- a/tools/testing/selftests/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -14,15 +14,10 @@
 # nft_flowtable.sh -o8000 -l1500 -r2000
 #
 
-sfx=$(mktemp -u "XXXXXXXX")
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-nsr1="nsr1-$sfx"
-nsr2="nsr2-$sfx"
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
+source lib.sh
+
 ret=0
+SOCAT_TIMEOUT=60
 
 nsin=""
 ns1out=""
@@ -30,52 +25,41 @@ ns2out=""
 
 log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
 
-checktool (){
-	if ! $1 > /dev/null 2>&1; then
-		echo "SKIP: Could not $2"
-		exit $ksft_skip
-	fi
-}
-
 checktool "nft --version" "run test without nft tool"
-checktool "ip -Version" "run test without ip tool"
-checktool "which nc" "run test without nc (netcat)"
-checktool "ip netns add $nsr1" "create net namespace $nsr1"
+checktool "socat -h" "run test without socat"
 
-ip netns add $ns1
-ip netns add $ns2
-ip netns add $nsr2
+setup_ns ns1 ns2 nsr1 nsr2
 
 cleanup() {
-	ip netns del $ns1
-	ip netns del $ns2
-	ip netns del $nsr1
-	ip netns del $nsr2
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
 
 	rm -f "$nsin" "$ns1out" "$ns2out"
 
-	[ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
+	[ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns"
 }
 
 trap cleanup EXIT
 
 sysctl -q net.netfilter.nf_log_all_netns=1
 
-ip link add veth0 netns $nsr1 type veth peer name eth0 netns $ns1
-ip link add veth1 netns $nsr1 type veth peer name veth0 netns $nsr2
+ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1"
+ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2"
 
-ip link add veth1 netns $nsr2 type veth peer name eth0 netns $ns2
+ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2"
 
-for dev in lo veth0 veth1; do
-    ip -net $nsr1 link set $dev up
-    ip -net $nsr2 link set $dev up
+for dev in veth0 veth1; do
+    ip -net "$nsr1" link set "$dev" up
+    ip -net "$nsr2" link set "$dev" up
 done
 
-ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
-ip -net $nsr1 addr add dead:1::1/64 dev veth0
+ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad
 
-ip -net $nsr2 addr add 10.0.2.1/24 dev veth1
-ip -net $nsr2 addr add dead:2::1/64 dev veth1
+ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsr2" addr add dead:2::1/64 dev veth1 nodad
 
 # set different MTUs so we need to push packets coming from ns1 (large MTU)
 # to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
@@ -107,56 +91,63 @@ do
 	esac
 done
 
-if ! ip -net $nsr1 link set veth0 mtu $omtu; then
+if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then
+	exit 1
+fi
+
+ip -net "$ns1" link set eth0 mtu "$omtu"
+
+if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then
 	exit 1
 fi
 
-ip -net $ns1 link set eth0 mtu $omtu
+if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then
+	exit 1
+fi
 
-if ! ip -net $nsr2 link set veth1 mtu $rmtu; then
+if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then
 	exit 1
 fi
 
-ip -net $ns2 link set eth0 mtu $rmtu
+ip -net "$ns2" link set eth0 mtu "$rmtu"
 
 # transfer-net between nsr1 and nsr2.
 # these addresses are not used for connections.
-ip -net $nsr1 addr add 192.168.10.1/24 dev veth1
-ip -net $nsr1 addr add fee1:2::1/64 dev veth1
+ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1
+ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad
 
-ip -net $nsr2 addr add 192.168.10.2/24 dev veth0
-ip -net $nsr2 addr add fee1:2::2/64 dev veth0
+ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0
+ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad
 
 for i in 0 1; do
-  ip netns exec $nsr1 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
-  ip netns exec $nsr2 sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
+  ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
+  ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
 done
 
-for ns in $ns1 $ns2;do
-  ip -net $ns link set lo up
-  ip -net $ns link set eth0 up
+for ns in "$ns1" "$ns2";do
+  ip -net "$ns" link set eth0 up
 
-  if ! ip netns exec $ns sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
+  if ! ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
 	echo "ERROR: Check Originator/Responder values (problem during address addition)"
 	exit 1
   fi
   # don't set ip DF bit for first two tests
-  ip netns exec $ns sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
+  ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
 done
 
-ip -net $ns1 addr add 10.0.1.99/24 dev eth0
-ip -net $ns2 addr add 10.0.2.99/24 dev eth0
-ip -net $ns1 route add default via 10.0.1.1
-ip -net $ns2 route add default via 10.0.2.1
-ip -net $ns1 addr add dead:1::99/64 dev eth0
-ip -net $ns2 addr add dead:2::99/64 dev eth0
-ip -net $ns1 route add default via dead:1::1
-ip -net $ns2 route add default via dead:2::1
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via dead:1::1
+ip -net "$ns2" route add default via dead:2::1
 
-ip -net $nsr1 route add default via 192.168.10.2
-ip -net $nsr2 route add default via 192.168.10.1
+ip -net "$nsr1" route add default via 192.168.10.2
+ip -net "$nsr2" route add default via 192.168.10.1
 
-ip netns exec $nsr1 nft -f - <<EOF
+ip netns exec "$nsr1" nft -f - <<EOF
 table inet filter {
   flowtable f1 {
      hook ingress priority 0
@@ -188,7 +179,7 @@ if [ $? -ne 0 ]; then
 	exit $ksft_skip
 fi
 
-ip netns exec $ns2 nft -f - <<EOF
+ip netns exec "$ns2" nft -f - <<EOF
 table inet filter {
    counter ip4dscp0 { }
    counter ip4dscp3 { }
@@ -204,25 +195,22 @@ table inet filter {
 EOF
 
 if [ $? -ne 0 ]; then
-	echo "SKIP: Could not load nft ruleset"
+	echo -n "SKIP: Could not load ruleset: "
+	nft --version
 	exit $ksft_skip
 fi
 
 # test basic connectivity
-if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
+if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
   echo "ERROR: $ns1 cannot reach ns2" 1>&2
   exit 1
 fi
 
-if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
+if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
   echo "ERROR: $ns2 cannot reach $ns1" 1>&2
   exit 1
 fi
 
-if [ $ret -eq 0 ];then
-	echo "PASS: netns routing/connectivity: $ns1 can reach $ns2"
-fi
-
 nsin=$(mktemp)
 ns1out=$(mktemp)
 ns2out=$(mktemp)
@@ -248,23 +236,27 @@ check_counters()
 	local what=$1
 	local ok=1
 
-	local orig=$(ip netns exec $nsr1 nft reset counter inet filter routed_orig | grep packets)
-	local repl=$(ip netns exec $nsr1 nft reset counter inet filter routed_repl | grep packets)
+	local orig repl
+	orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets)
+	repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets)
 
 	local orig_cnt=${orig#*bytes}
 	local repl_cnt=${repl#*bytes}
 
-	local fs=$(du -sb $nsin)
+	local fs
+	fs=$(du -sb "$nsin")
 	local max_orig=${fs%%/*}
 	local max_repl=$((max_orig/4))
 
-	if [ $orig_cnt -gt $max_orig ];then
+	# flowtable fastpath should bypass normal routing one, i.e. the counters in forward hook
+	# should always be lower than the size of the transmitted file (max_orig).
+	if [ "$orig_cnt" -gt "$max_orig" ];then
 		echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2
 		ret=1
 		ok=0
 	fi
 
-	if [ $repl_cnt -gt $max_repl ];then
+	if [ "$repl_cnt" -gt $max_repl ];then
 		echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2
 		ret=1
 		ok=0
@@ -280,39 +272,40 @@ check_dscp()
 	local what=$1
 	local ok=1
 
-	local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp3 | grep packets)
+	local counter
+	counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets)
 
 	local pc4=${counter%*bytes*}
 	local pc4=${pc4#*packets}
 
-	local counter=$(ip netns exec $ns2 nft reset counter inet filter ip4dscp0 | grep packets)
+	counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets)
 	local pc4z=${counter%*bytes*}
 	local pc4z=${pc4z#*packets}
 
 	case "$what" in
 	"dscp_none")
-		if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then
+		if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then
 			echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2
 			ret=1
 			ok=0
 		fi
 		;;
 	"dscp_fwd")
-		if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then
+		if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then
 			echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2
 			ret=1
 			ok=0
 		fi
 		;;
 	"dscp_ingress")
-		if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then
+		if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then
 			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
 			ret=1
 			ok=0
 		fi
 		;;
 	"dscp_egress")
-		if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then
+		if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then
 			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
 			ret=1
 			ok=0
@@ -324,7 +317,7 @@ check_dscp()
 		ok=0
 	esac
 
-	if [ $ok -eq 1 ] ;then
+	if [ "$ok" -eq 1 ] ;then
 		echo "PASS: $what: dscp packet counters match"
 	fi
 }
@@ -345,6 +338,11 @@ check_transfer()
 	return 0
 }
 
+listener_ready()
+{
+	ss -N "$nsb" -lnt -o "sport = :12345" | grep -q 12345
+}
+
 test_tcp_forwarding_ip()
 {
 	local nsa=$1
@@ -353,40 +351,23 @@ test_tcp_forwarding_ip()
 	local dstport=$4
 	local lret=0
 
-	ip netns exec $nsb nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" &
+	timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" &
 	lpid=$!
 
-	sleep 1
-	ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" &
-	cpid=$!
-
-	sleep 1
-
-	prev="$(ls -l $ns1out $ns2out)"
-	sleep 1
-
-	while [[ "$prev" != "$(ls -l $ns1out $ns2out)" ]]; do
-		sleep 1;
-		prev="$(ls -l $ns1out $ns2out)"
-	done
+	busywait 1000 listener_ready
 
-	if test -d /proc/"$lpid"/; then
-		kill $lpid
-	fi
-
-	if test -d /proc/"$cpid"/; then
-		kill $cpid
-	fi
+	timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out"
 
 	wait $lpid
-	wait $cpid
 
 	if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
 		lret=1
+		ret=1
 	fi
 
 	if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
 		lret=1
+		ret=1
 	fi
 
 	return $lret
@@ -403,7 +384,7 @@ test_tcp_forwarding_set_dscp()
 {
 	check_dscp "dscp_none"
 
-ip netns exec $nsr1 nft -f - <<EOF
+ip netns exec "$nsr1" nft -f - <<EOF
 table netdev dscpmangle {
    chain setdscp0 {
       type filter hook ingress device "veth0" priority 0; policy accept
@@ -415,12 +396,12 @@ if [ $? -eq 0 ]; then
 	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
 	check_dscp "dscp_ingress"
 
-	ip netns exec $nsr1 nft delete table netdev dscpmangle
+	ip netns exec "$nsr1" nft delete table netdev dscpmangle
 else
 	echo "SKIP: Could not load netdev:ingress for veth0"
 fi
 
-ip netns exec $nsr1 nft -f - <<EOF
+ip netns exec "$nsr1" nft -f - <<EOF
 table netdev dscpmangle {
    chain setdscp0 {
       type filter hook egress device "veth1" priority 0; policy accept
@@ -432,14 +413,14 @@ if [ $? -eq 0 ]; then
 	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
 	check_dscp "dscp_egress"
 
-	ip netns exec $nsr1 nft flush table netdev dscpmangle
+	ip netns exec "$nsr1" nft flush table netdev dscpmangle
 else
 	echo "SKIP: Could not load netdev:egress for veth1"
 fi
 
 	# partial.  If flowtable really works, then both dscp-is-0 and dscp-is-cs3
 	# counters should have seen packets (before and after ft offload kicks in).
-	ip netns exec $nsr1 nft -a insert rule inet filter forward ip dscp set cs3
+	ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3
 	test_tcp_forwarding_ip "$1" "$2"  10.0.2.99 12345
 	check_dscp "dscp_fwd"
 }
@@ -455,8 +436,8 @@ test_tcp_forwarding_nat()
 	pmtu=$3
 	what=$4
 
-	if [ $lret -eq 0 ] ; then
-		if [ $pmtu -eq 1 ] ;then
+	if [ "$lret" -eq 0 ] ; then
+		if [ "$pmtu" -eq 1 ] ;then
 			check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what"
 		else
 			echo "PASS: flow offload for ns1/ns2 with masquerade $what"
@@ -464,9 +445,9 @@ test_tcp_forwarding_nat()
 
 		test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
 		lret=$?
-		if [ $pmtu -eq 1 ] ;then
+		if [ "$pmtu" -eq 1 ] ;then
 			check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what"
-		elif [ $lret -eq 0 ] ; then
+		elif [ "$lret" -eq 0 ] ; then
 			echo "PASS: flow offload for ns1/ns2 with dnat $what"
 		fi
 	fi
@@ -481,25 +462,25 @@ make_file "$nsin"
 # Due to MTU mismatch in both directions, all packets (except small packets like pure
 # acks) have to be handled by normal forwarding path.  Therefore, packet counters
 # are not checked.
-if test_tcp_forwarding $ns1 $ns2; then
+if test_tcp_forwarding "$ns1" "$ns2"; then
 	echo "PASS: flow offloaded for ns1/ns2"
 else
 	echo "FAIL: flow offload for ns1/ns2:" 1>&2
-	ip netns exec $nsr1 nft list ruleset
+	ip netns exec "$nsr1" nft list ruleset
 	ret=1
 fi
 
 # delete default route, i.e. ns2 won't be able to reach ns1 and
 # will depend on ns1 being masqueraded in nsr1.
 # expect ns1 has nsr1 address.
-ip -net $ns2 route del default via 10.0.2.1
-ip -net $ns2 route del default via dead:2::1
-ip -net $ns2 route add 192.168.10.1 via 10.0.2.1
+ip -net "$ns2" route del default via 10.0.2.1
+ip -net "$ns2" route del default via dead:2::1
+ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1
 
 # Second test:
 # Same, but with NAT enabled.  Same as in first test: we expect normal forward path
 # to handle most packets.
-ip netns exec $nsr1 nft -f - <<EOF
+ip netns exec "$nsr1" nft -f - <<EOF
 table ip nat {
    chain prerouting {
       type nat hook prerouting priority 0; policy accept;
@@ -513,14 +494,14 @@ table ip nat {
 }
 EOF
 
-if ! test_tcp_forwarding_set_dscp $ns1 $ns2 0 ""; then
+if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then
 	echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2
 	exit 0
 fi
 
-if ! test_tcp_forwarding_nat $ns1 $ns2 0 ""; then
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then
 	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
-	ip netns exec $nsr1 nft list ruleset
+	ip netns exec "$nsr1" nft list ruleset
 	ret=1
 fi
 
@@ -528,35 +509,40 @@ fi
 # Same as second test, but with PMTU discovery enabled. This
 # means that we expect the fastpath to handle packets as soon
 # as the endpoints adjust the packet size.
-ip netns exec $ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
-ip netns exec $ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
 
 # reset counters.
 # With pmtu in-place we'll also check that nft counters
 # are lower than file size and packets were forwarded via flowtable layer.
 # For earlier tests (large mtus), packets cannot be handled via flowtable
 # (except pure acks and other small packets).
-ip netns exec $nsr1 nft reset counters table inet filter >/dev/null
+ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null
 
-if ! test_tcp_forwarding_nat $ns1 $ns2 1 ""; then
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then
 	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
-	ip netns exec $nsr1 nft list ruleset
+	ip netns exec "$nsr1" nft list ruleset
 fi
 
 # Another test:
 # Add bridge interface br0 to Router1, with NAT enabled.
-ip -net $nsr1 link add name br0 type bridge
-ip -net $nsr1 addr flush dev veth0
-ip -net $nsr1 link set up dev veth0
-ip -net $nsr1 link set veth0 master br0
-ip -net $nsr1 addr add 10.0.1.1/24 dev br0
-ip -net $nsr1 addr add dead:1::1/64 dev br0
-ip -net $nsr1 link set up dev br0
+test_bridge() {
+if ! ip -net "$nsr1" link add name br0 type bridge 2>/dev/null;then
+	echo "SKIP: could not add bridge br0"
+	[ "$ret" -eq 0 ] && ret=$ksft_skip
+	return
+fi
+ip -net "$nsr1" addr flush dev veth0
+ip -net "$nsr1" link set up dev veth0
+ip -net "$nsr1" link set veth0 master br0
+ip -net "$nsr1" addr add 10.0.1.1/24 dev br0
+ip -net "$nsr1" addr add dead:1::1/64 dev br0 nodad
+ip -net "$nsr1" link set up dev br0
 
-ip netns exec $nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null
+ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null
 
 # br0 with NAT enabled.
-ip netns exec $nsr1 nft -f - <<EOF
+ip netns exec "$nsr1" nft -f - <<EOF
 flush table ip nat
 table ip nat {
    chain prerouting {
@@ -571,56 +557,59 @@ table ip nat {
 }
 EOF
 
-if ! test_tcp_forwarding_nat $ns1 $ns2 1 "on bridge"; then
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "on bridge"; then
 	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
-	ip netns exec $nsr1 nft list ruleset
+	ip netns exec "$nsr1" nft list ruleset
 	ret=1
 fi
 
 
 # Another test:
 # Add bridge interface br0 to Router1, with NAT and VLAN.
-ip -net $nsr1 link set veth0 nomaster
-ip -net $nsr1 link set down dev veth0
-ip -net $nsr1 link add link veth0 name veth0.10 type vlan id 10
-ip -net $nsr1 link set up dev veth0
-ip -net $nsr1 link set up dev veth0.10
-ip -net $nsr1 link set veth0.10 master br0
-
-ip -net $ns1 addr flush dev eth0
-ip -net $ns1 link add link eth0 name eth0.10 type vlan id 10
-ip -net $ns1 link set eth0 up
-ip -net $ns1 link set eth0.10 up
-ip -net $ns1 addr add 10.0.1.99/24 dev eth0.10
-ip -net $ns1 route add default via 10.0.1.1
-ip -net $ns1 addr add dead:1::99/64 dev eth0.10
-
-if ! test_tcp_forwarding_nat $ns1 $ns2 1 "bridge and VLAN"; then
+ip -net "$nsr1" link set veth0 nomaster
+ip -net "$nsr1" link set down dev veth0
+ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10
+ip -net "$nsr1" link set up dev veth0
+ip -net "$nsr1" link set up dev veth0.10
+ip -net "$nsr1" link set veth0.10 master br0
+
+ip -net "$ns1" addr flush dev eth0
+ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10
+ip -net "$ns1" link set eth0 up
+ip -net "$ns1" link set eth0.10 up
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" addr add dead:1::99/64 dev eth0.10 nodad
+
+if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then
 	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
-	ip netns exec $nsr1 nft list ruleset
+	ip netns exec "$nsr1" nft list ruleset
 	ret=1
 fi
 
 # restore test topology (remove bridge and VLAN)
-ip -net $nsr1 link set veth0 nomaster
-ip -net $nsr1 link set veth0 down
-ip -net $nsr1 link set veth0.10 down
-ip -net $nsr1 link delete veth0.10 type vlan
-ip -net $nsr1 link delete br0 type bridge
-ip -net $ns1 addr flush dev eth0.10
-ip -net $ns1 link set eth0.10 down
-ip -net $ns1 link set eth0 down
-ip -net $ns1 link delete eth0.10 type vlan
+ip -net "$nsr1" link set veth0 nomaster
+ip -net "$nsr1" link set veth0 down
+ip -net "$nsr1" link set veth0.10 down
+ip -net "$nsr1" link delete veth0.10 type vlan
+ip -net "$nsr1" link delete br0 type bridge
+ip -net "$ns1" addr flush dev eth0.10
+ip -net "$ns1" link set eth0.10 down
+ip -net "$ns1" link set eth0 down
+ip -net "$ns1" link delete eth0.10 type vlan
 
 # restore address in ns1 and nsr1
-ip -net $ns1 link set eth0 up
-ip -net $ns1 addr add 10.0.1.99/24 dev eth0
-ip -net $ns1 route add default via 10.0.1.1
-ip -net $ns1 addr add dead:1::99/64 dev eth0
-ip -net $ns1 route add default via dead:1::1
-ip -net $nsr1 addr add 10.0.1.1/24 dev veth0
-ip -net $nsr1 addr add dead:1::1/64 dev veth0
-ip -net $nsr1 link set up dev veth0
+ip -net "$ns1" link set eth0 up
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via dead:1::1
+ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad
+ip -net "$nsr1" link set up dev veth0
+}
+
+test_bridge
 
 KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1)
 KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1)
@@ -640,33 +629,43 @@ do_esp() {
     local spi_out=$6
     local spi_in=$7
 
-    ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in  enc aes $KEY_AES  auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet
-    ip -net $ns xfrm state add src $me  dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet
+    ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in"  enc aes "$KEY_AES"  auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet"
+    ip -net "$ns" xfrm state add src "$me"  dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet"
 
     # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
-    ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow
+    ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow
     # to fwd decrypted packets after esp processing:
-    ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow
-
+    ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow
 }
 
-do_esp $nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
+do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2"
 
-do_esp $nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
+do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1"
 
-ip netns exec $nsr1 nft delete table ip nat
+ip netns exec "$nsr1" nft delete table ip nat
 
 # restore default routes
-ip -net $ns2 route del 192.168.10.1 via 10.0.2.1
-ip -net $ns2 route add default via 10.0.2.1
-ip -net $ns2 route add default via dead:2::1
+ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
 
-if test_tcp_forwarding $ns1 $ns2; then
+if test_tcp_forwarding "$ns1" "$ns2"; then
 	check_counters "ipsec tunnel mode for ns1/ns2"
 else
 	echo "FAIL: ipsec tunnel mode for ns1/ns2"
-	ip netns exec $nsr1 nft list ruleset 1>&2
-	ip netns exec $nsr1 cat /proc/net/xfrm_stat 1>&2
+	ip netns exec "$nsr1" nft list ruleset 1>&2
+	ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2
+fi
+
+if [ "$1" = "" ]; then
+	low=1280
+	mtu=$((65536 - low))
+	o=$(((RANDOM%mtu) + low))
+	l=$(((RANDOM%mtu) + low))
+	r=$(((RANDOM%mtu) + low))
+
+	echo "re-run with random mtus: -o $o -l $l -r $r"
+	$0 -o "$o" -l "$l" -r "$r"
 fi
 
 exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/net/netfilter/nft_meta.sh
index f33154c04d34..71505b6cb252 100755
--- a/tools/testing/selftests/netfilter/nft_meta.sh
+++ b/tools/testing/selftests/net/netfilter/nft_meta.sh
@@ -91,10 +91,10 @@ check_one_counter()
 	local want="packets $2"
 	local verbose="$3"
 
-	if ! ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then
+	if ! ip netns exec "$ns0" nft list counter inet filter "$cname" | grep -q "$want"; then
 		echo "FAIL: $cname, want \"$want\", got"
 		ret=1
-		ip netns exec "$ns0" nft list counter inet filter $cname
+		ip netns exec "$ns0" nft list counter inet filter "$cname"
 	fi
 }
 
diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh
index dd40d9f6f259..9e39de26455f 100755
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/net/netfilter/nft_nat.sh
@@ -3,77 +3,60 @@
 # This test is for basic NAT functionality: snat, dnat, redirect, masquerade.
 #
 
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
+source lib.sh
+
 ret=0
 test_inet_nat=true
 
-sfx=$(mktemp -u "XXXXXXXX")
-ns0="ns0-$sfx"
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
+checktool "nft --version" "run test without nft tool"
+checktool "socat -h" "run test without socat"
 
 cleanup()
 {
-	for i in 0 1 2; do ip netns del ns$i-"$sfx";done
-}
+	ip netns pids "$ns0" | xargs kill 2>/dev/null
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
 
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
+	rm -f "$INFILE" "$OUTFILE"
 
-ip netns add "$ns0"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $ns0"
-	exit $ksft_skip
-fi
+	cleanup_all_ns
+}
 
 trap cleanup EXIT
 
-ip netns add "$ns1"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $ns1"
-	exit $ksft_skip
-fi
+INFILE=$(mktemp)
+OUTFILE=$(mktemp)
 
-ip netns add "$ns2"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $ns2"
-	exit $ksft_skip
-fi
+setup_ns ns0 ns1 ns2
 
-ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1
-if [ $? -ne 0 ];then
+if ! ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1;then
     echo "SKIP: No virtual ethernet pair device support in kernel"
     exit $ksft_skip
 fi
 ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2"
 
-ip -net "$ns0" link set lo up
 ip -net "$ns0" link set veth0 up
 ip -net "$ns0" addr add 10.0.1.1/24 dev veth0
-ip -net "$ns0" addr add dead:1::1/64 dev veth0
+ip -net "$ns0" addr add dead:1::1/64 dev veth0 nodad
 
 ip -net "$ns0" link set veth1 up
 ip -net "$ns0" addr add 10.0.2.1/24 dev veth1
-ip -net "$ns0" addr add dead:2::1/64 dev veth1
-
-for i in 1 2; do
-  ip -net ns$i-$sfx link set lo up
-  ip -net ns$i-$sfx link set eth0 up
-  ip -net ns$i-$sfx addr add 10.0.$i.99/24 dev eth0
-  ip -net ns$i-$sfx route add default via 10.0.$i.1
-  ip -net ns$i-$sfx addr add dead:$i::99/64 dev eth0
-  ip -net ns$i-$sfx route add default via dead:$i::1
-done
+ip -net "$ns0" addr add dead:2::1/64 dev veth1 nodad
+
+do_config()
+{
+	ns="$1"
+	subnet="$2"
+
+	ip -net "$ns" link set eth0 up
+	ip -net "$ns" addr add "10.0.$subnet.99/24" dev eth0
+	ip -net "$ns" route add default via "10.0.$subnet.1"
+	ip -net "$ns" addr add "dead:$subnet::99/64" dev eth0 nodad
+	ip -net "$ns" route add default via "dead:$subnet::1"
+}
+
+do_config "$ns1" 1
+do_config "$ns2" 2
 
 bad_counter()
 {
@@ -83,7 +66,7 @@ bad_counter()
 	local tag=$4
 
 	echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag" 1>&2
-	ip netns exec $ns nft list counter inet filter $counter 1>&2
+	ip netns exec "$ns" nft list counter inet filter "$counter" 1>&2
 }
 
 check_counters()
@@ -91,26 +74,23 @@ check_counters()
 	ns=$1
 	local lret=0
 
-	cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84")
-	if [ $? -ne 0 ]; then
-		bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1"
+	if ! ip netns exec "$ns" nft list counter inet filter ns0in | grep -q "packets 1 bytes 84";then
+		bad_counter "$ns" ns0in "packets 1 bytes 84" "check_counters 1"
 		lret=1
 	fi
-	cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84")
-	if [ $? -ne 0 ]; then
-		bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2"
+
+	if ! ip netns exec "$ns" nft list counter inet filter ns0out | grep -q "packets 1 bytes 84";then
+		bad_counter "$ns" ns0out "packets 1 bytes 84" "check_counters 2"
 		lret=1
 	fi
 
 	expect="packets 1 bytes 104"
-	cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect")
-	if [ $? -ne 0 ]; then
-		bad_counter $ns ns0in6 "$expect" "check_counters 3"
+	if ! ip netns exec "$ns" nft list counter inet filter ns0in6 | grep -q "$expect";then
+		bad_counter "$ns" ns0in6 "$expect" "check_counters 3"
 		lret=1
 	fi
-	cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect")
-	if [ $? -ne 0 ]; then
-		bad_counter $ns ns0out6 "$expect" "check_counters 4"
+	if ! ip netns exec "$ns" nft list counter inet filter ns0out6 | grep -q "$expect";then
+		bad_counter "$ns" ns0out6 "$expect" "check_counters 4"
 		lret=1
 	fi
 
@@ -122,41 +102,35 @@ check_ns0_counters()
 	local ns=$1
 	local lret=0
 
-	cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0";then
 		bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1"
 		lret=1
 	fi
 
-	cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0";then
 		bad_counter "$ns0" ns0in6 "packets 0 bytes 0"
 		lret=1
 	fi
 
-	cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0";then
 		bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2"
 		lret=1
 	fi
-	cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0";then
 		bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 "
 		lret=1
 	fi
 
 	for dir in "in" "out" ; do
 		expect="packets 1 bytes 84"
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4"
+		if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "$ns${dir}" "$expect" "check_ns0_counters 4"
 			lret=1
 		fi
 
 		expect="packets 1 bytes 104"
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5"
+		if ! ip netns exec "$ns0" nft list counter inet filter "${ns}${dir}6" | grep -q "$expect";then
+			bad_counter "$ns0" "$ns${dir}6" "$expect" "check_ns0_counters 5"
 			lret=1
 		fi
 	done
@@ -166,8 +140,8 @@ check_ns0_counters()
 
 reset_counters()
 {
-	for i in 0 1 2;do
-		ip netns exec ns$i-$sfx nft reset counters inet > /dev/null
+	for i in "$ns0" "$ns1" "$ns2" ;do
+		ip netns exec "$i" nft reset counters inet > /dev/null
 	done
 }
 
@@ -177,7 +151,7 @@ test_local_dnat6()
 	local lret=0
 	local IPF=""
 
-	if [ $family = "inet" ];then
+	if [ "$family" = "inet" ];then
 		IPF="ip6"
 	fi
 
@@ -195,8 +169,7 @@ EOF
 	fi
 
 	# ping netns1, expect rewrite to netns2
-	ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null;then
 		lret=1
 		echo "ERROR: ping6 failed"
 		return $lret
@@ -204,8 +177,7 @@ EOF
 
 	expect="packets 0 bytes 0"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
 			bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1"
 			lret=1
 		fi
@@ -213,8 +185,7 @@ EOF
 
 	expect="packets 1 bytes 104"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2"
 			lret=1
 		fi
@@ -223,8 +194,7 @@ EOF
 	# expect 0 count in ns1
 	expect="packets 0 bytes 0"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3"
 			lret=1
 		fi
@@ -233,8 +203,7 @@ EOF
 	# expect 1 packet in ns2
 	expect="packets 1 bytes 104"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
 			bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4"
 			lret=1
 		fi
@@ -252,7 +221,7 @@ test_local_dnat()
 	local lret=0
 	local IPF=""
 
-	if [ $family = "inet" ];then
+	if [ "$family" = "inet" ];then
 		IPF="ip"
 	fi
 
@@ -265,7 +234,7 @@ table $family nat {
 }
 EOF
 	if [ $? -ne 0 ]; then
-		if [ $family = "inet" ];then
+		if [ "$family" = "inet" ];then
 			echo "SKIP: inet nat tests"
 			test_inet_nat=false
 			return $ksft_skip
@@ -275,8 +244,7 @@ EOF
 	fi
 
 	# ping netns1, expect rewrite to netns2
-	ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then
 		lret=1
 		echo "ERROR: ping failed"
 		return $lret
@@ -284,18 +252,16 @@ EOF
 
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1"
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns1$dir" "$expect" "test_local_dnat 1"
 			lret=1
 		fi
 	done
 
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2"
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns2$dir" "$expect" "test_local_dnat 2"
 			lret=1
 		fi
 	done
@@ -303,9 +269,8 @@ EOF
 	# expect 0 count in ns1
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3"
+		if ! ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect";then
+			bad_counter "$ns1" "ns0$dir" "$expect" "test_local_dnat 3"
 			lret=1
 		fi
 	done
@@ -313,20 +278,18 @@ EOF
 	# expect 1 packet in ns2
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4"
+		if ! ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect";then
+			bad_counter "$ns2" "ns0$dir" "$expect" "test_local_dnat 4"
 			lret=1
 		fi
 	done
 
 	test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2"
 
-	ip netns exec "$ns0" nft flush chain $family nat output
+	ip netns exec "$ns0" nft flush chain "$family" nat output
 
 	reset_counters
-	ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null;then
 		lret=1
 		echo "ERROR: ping failed"
 		return $lret
@@ -334,16 +297,14 @@ EOF
 
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5"
 			lret=1
 		fi
 	done
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6"
 			lret=1
 		fi
@@ -352,8 +313,7 @@ EOF
 	# expect 1 count in ns1
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
 			bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7"
 			lret=1
 		fi
@@ -362,8 +322,7 @@ EOF
 	# expect 0 packet in ns2
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
 			bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8"
 			lret=1
 		fi
@@ -374,13 +333,19 @@ EOF
 	return $lret
 }
 
+listener_ready()
+{
+	local ns="$1"
+	local port="$2"
+	local proto="$3"
+	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
 test_local_dnat_portonly()
 {
 	local family=$1
 	local daddr=$2
 	local lret=0
-	local sr_s
-	local sr_r
 
 ip netns exec "$ns0" nft -f /dev/stdin <<EOF
 table $family nat {
@@ -392,7 +357,7 @@ table $family nat {
 }
 EOF
 	if [ $? -ne 0 ]; then
-		if [ $family = "inet" ];then
+		if [ "$family" = "inet" ];then
 			echo "SKIP: inet port test"
 			test_inet_nat=false
 			return
@@ -401,17 +366,16 @@ EOF
 		return
 	fi
 
-	echo SERVER-$family | ip netns exec "$ns1" timeout 5 socat -u STDIN TCP-LISTEN:2000 &
-	sc_s=$!
+	echo "SERVER-$family" | ip netns exec "$ns1" timeout 3 socat -u STDIN TCP-LISTEN:2000 &
 
-	sleep 1
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 2000 "-t"
 
-	result=$(ip netns exec "$ns0" timeout 1 socat TCP:$daddr:2000 STDOUT)
+	result=$(ip netns exec "$ns0" timeout 1 socat -u TCP:"$daddr":2000 STDOUT)
 
 	if [ "$result" = "SERVER-inet" ];then
 		echo "PASS: inet port rewrite without l3 address"
 	else
-		echo "ERROR: inet port rewrite"
+		echo "ERROR: inet port rewrite without l3 address, got $result"
 		ret=1
 	fi
 }
@@ -424,24 +388,20 @@ test_masquerade6()
 
 	ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
 
-	ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 via ipv6"
 		return 1
-		lret=1
 	fi
 
 	expect="packets 1 bytes 104"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1"
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade6 1"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2"
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade6 2"
 			lret=1
 		fi
 	done
@@ -462,8 +422,7 @@ EOF
 		return $ksft_skip
 	fi
 
-	ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags"
 		lret=1
 	fi
@@ -471,14 +430,12 @@ EOF
 	# ns1 should have seen packets from ns0, due to masquerade
 	expect="packets 1 bytes 104"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
 			bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4"
 			lret=1
 		fi
@@ -487,27 +444,23 @@ EOF
 	# ns1 should not have seen packets from ns2, due to masquerade
 	expect="packets 0 bytes 0"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6"
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade6 6"
 			lret=1
 		fi
 	done
 
-	ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)"
 		lret=1
 	fi
 
-	ip netns exec "$ns0" nft flush chain $family nat postrouting
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting;then
 		echo "ERROR: Could not flush $family nat postrouting" 1>&2
 		lret=1
 	fi
@@ -526,23 +479,20 @@ test_masquerade()
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
-		echo "ERROR: cannot ping $ns1 from "$ns2" $natflags"
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
+		echo "ERROR: cannot ping $ns1 from $ns2 $natflags"
 		lret=1
 	fi
 
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1"
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns2$dir" "$expect" "test_masquerade 1"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2"
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 2"
 			lret=1
 		fi
 	done
@@ -563,8 +513,7 @@ EOF
 		return $ksft_skip
 	fi
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags"
 		lret=1
 	fi
@@ -572,15 +521,13 @@ EOF
 	# ns1 should have seen packets from ns0, due to masquerade
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3"
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns0${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 3"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4"
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns2" "ns1$dir" "$expect" "test_masquerade 4"
 			lret=1
 		fi
 	done
@@ -588,27 +535,23 @@ EOF
 	# ns1 should not have seen packets from ns2, due to masquerade
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5"
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "ns0$dir" "$expect" "test_masquerade 5"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6"
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
+			bad_counter "$ns0" "ns1$dir" "$expect" "test_masquerade 6"
 			lret=1
 		fi
 	done
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)"
 		lret=1
 	fi
 
-	ip netns exec "$ns0" nft flush chain $family nat postrouting
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft flush chain "$family" nat postrouting; then
 		echo "ERROR: Could not flush $family nat postrouting" 1>&2
 		lret=1
 	fi
@@ -625,22 +568,19 @@ test_redirect6()
 
 	ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
 
-	ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
 		echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6"
 		lret=1
 	fi
 
 	expect="packets 1 bytes 104"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns2" nft list counter inet filter "ns1${dir}" | grep -q "$expect";then
 			bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2"
 			lret=1
 		fi
@@ -662,8 +602,7 @@ EOF
 		return $ksft_skip
 	fi
 
-	ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect"
 		lret=1
 	fi
@@ -671,8 +610,7 @@ EOF
 	# ns1 should have seen no packets from ns2, due to redirection
 	expect="packets 0 bytes 0"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3"
 			lret=1
 		fi
@@ -681,15 +619,13 @@ EOF
 	# ns0 should have seen packets from ns2, due to masquerade
 	expect="packets 1 bytes 104"
 	for dir in "in6" "out6" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4"
 			lret=1
 		fi
 	done
 
-	ip netns exec "$ns0" nft delete table $family nat
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft delete table "$family" nat;then
 		echo "ERROR: Could not delete $family nat table" 1>&2
 		lret=1
 	fi
@@ -707,22 +643,19 @@ test_redirect()
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2"
 		lret=1
 	fi
 
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
-			bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1"
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
+			bad_counter "$ns1" "$ns2$dir" "$expect" "test_redirect 1"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then
 			bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2"
 			lret=1
 		fi
@@ -744,8 +677,7 @@ EOF
 		return $ksft_skip
 	fi
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect"
 		lret=1
 	fi
@@ -754,8 +686,7 @@ EOF
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
 
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3"
 			lret=1
 		fi
@@ -764,15 +695,13 @@ EOF
 	# ns0 should have seen packets from ns2, due to masquerade
 	expect="packets 1 bytes 84"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter "ns2${dir}" | grep -q "$expect";then
 			bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4"
 			lret=1
 		fi
 	done
 
-	ip netns exec "$ns0" nft delete table $family nat
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft delete table "$family" nat;then
 		echo "ERROR: Could not delete $family nat table" 1>&2
 		lret=1
 	fi
@@ -803,13 +732,13 @@ test_port_shadow()
 	# make shadow entry, from client (ns2), going to (ns1), port 41404, sport 1405.
 	echo "fake-entry" | ip netns exec "$ns2" timeout 1 socat -u STDIN UDP:"$daddrc":41404,sourceport=1405
 
-	echo ROUTER | ip netns exec "$ns0" timeout 5 socat -u STDIN UDP4-LISTEN:1405 &
-	sc_r=$!
+	echo ROUTER | ip netns exec "$ns0" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405 2>/dev/null &
+	local sc_r=$!
+	echo CLIENT | ip netns exec "$ns2" timeout 3 socat -T 3 -u STDIN UDP4-LISTEN:1405,reuseport 2>/dev/null &
+	local sc_c=$!
 
-	echo CLIENT | ip netns exec "$ns2" timeout 5 socat -u STDIN UDP4-LISTEN:1405,reuseport &
-	sc_c=$!
-
-	sleep 0.3
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1405 "-u"
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns2" 1405 "-u"
 
 	# ns1 tries to connect to ns0:1405.  With default settings this should connect
 	# to client, it matches the conntrack entry created above.
@@ -846,7 +775,7 @@ table $family filter {
 EOF
 	test_port_shadow "port-filter" "ROUTER"
 
-	ip netns exec "$ns0" nft delete table $family filter
+	ip netns exec "$ns0" nft delete table "$family" filter
 }
 
 # This prevents port shadow of router service via notrack.
@@ -868,7 +797,7 @@ table $family raw {
 EOF
 	test_port_shadow "port-notrack" "ROUTER"
 
-	ip netns exec "$ns0" nft delete table $family raw
+	ip netns exec "$ns0" nft delete table "$family" raw
 }
 
 # This prevents port shadow of router service via sport remap.
@@ -886,21 +815,19 @@ table $family pat {
 EOF
 	test_port_shadow "pat" "ROUTER"
 
-	ip netns exec "$ns0" nft delete table $family pat
+	ip netns exec "$ns0" nft delete table "$family" pat
 }
 
 test_port_shadowing()
 {
 	local family="ip"
 
-	conntrack -h >/dev/null 2>&1
-	if [ $? -ne 0 ];then
+	if ! conntrack -h >/dev/null 2>&1;then
 		echo "SKIP: Could not run nat port shadowing test without conntrack tool"
 		return
 	fi
 
-	socat -h > /dev/null 2>&1
-	if [ $? -ne 0 ];then
+	if ! socat -h > /dev/null 2>&1;then
 		echo "SKIP: Could not run nat port shadowing test without socat tool"
 		return
 	fi
@@ -946,8 +873,7 @@ test_stateless_nat_ip()
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
 	ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null;then
 		echo "ERROR: cannot ping $ns1 from $ns2 before loading stateless rules"
 		return 1
 	fi
@@ -981,23 +907,20 @@ EOF
 
 	reset_counters
 
-	ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null; then
 		echo "ERROR: cannot ping $ns1 from $ns2 with stateless rules"
 		lret=1
 	fi
 
 	# ns1 should have seen packets from .2.2, due to stateless rewrite.
 	expect="packets 1 bytes 84"
-	cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then
 		bad_counter "$ns1" ns0insl "$expect" "test_stateless 1"
 		lret=1
 	fi
 
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect";then
 			bad_counter "$ns2" ns1$dir "$expect" "test_stateless 2"
 			lret=1
 		fi
@@ -1006,14 +929,12 @@ EOF
 	# ns1 should not have seen packets from ns2, due to masquerade
 	expect="packets 0 bytes 0"
 	for dir in "in" "out" ; do
-		cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect";then
 			bad_counter "$ns1" ns0$dir "$expect" "test_stateless 3"
 			lret=1
 		fi
 
-		cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
-		if [ $? -ne 0 ]; then
+		if ! ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect";then
 			bad_counter "$ns0" ns1$dir "$expect" "test_stateless 4"
 			lret=1
 		fi
@@ -1021,8 +942,7 @@ EOF
 
 	reset_counters
 
-	socat -h > /dev/null 2>&1
-	if [ $? -ne 0 ];then
+	if ! socat -h > /dev/null 2>&1;then
 		echo "SKIP: Could not run stateless nat frag test without socat tool"
 		if [ $lret -eq 0 ]; then
 			return $ksft_skip
@@ -1032,42 +952,36 @@ EOF
 		return $lret
 	fi
 
-	local tmpfile=$(mktemp)
-	dd if=/dev/urandom of=$tmpfile bs=4096 count=1 2>/dev/null
+	dd if=/dev/urandom of="$INFILE" bs=4096 count=1 2>/dev/null
 
-	local outfile=$(mktemp)
-	ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:$outfile < /dev/null &
-	sc_r=$!
+	ip netns exec "$ns1" timeout 3 socat -u UDP4-RECV:4233 OPEN:"$OUTFILE" < /dev/null 2>/dev/null &
+
+	busywait $BUSYWAIT_TIMEOUT listener_ready "$ns1" 4233 "-u"
 
-	sleep 1
 	# re-do with large ping -> ip fragmentation
-	ip netns exec "$ns2" timeout 3 socat - UDP4-SENDTO:"10.0.1.99:4233" < "$tmpfile" > /dev/null
-	if [ $? -ne 0 ] ; then
+	if ! ip netns exec "$ns2" timeout 3 socat -u STDIN UDP4-SENDTO:"10.0.1.99:4233" < "$INFILE" > /dev/null;then
 		echo "ERROR: failed to test udp $ns1 to $ns2 with stateless ip nat" 1>&2
 		lret=1
 	fi
 
 	wait
 
-	cmp "$tmpfile" "$outfile"
-	if [ $? -ne 0 ]; then
-		ls -l "$tmpfile" "$outfile"
+	if ! cmp "$INFILE" "$OUTFILE";then
+		ls -l "$INFILE" "$OUTFILE"
 		echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2
 		lret=1
 	fi
 
-	rm -f "$tmpfile" "$outfile"
+	:> "$OUTFILE"
 
 	# ns1 should have seen packets from 2.2, due to stateless rewrite.
 	expect="packets 3 bytes 4164"
-	cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect")
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns1" nft list counter inet filter ns0insl | grep -q "$expect";then
 		bad_counter "$ns1" ns0insl "$expect" "test_stateless 5"
 		lret=1
 	fi
 
-	ip netns exec "$ns0" nft delete table ip stateless
-	if [ $? -ne 0 ]; then
+	if ! ip netns exec "$ns0" nft delete table ip stateless; then
 		echo "ERROR: Could not delete table ip stateless" 1>&2
 		lret=1
 	fi
@@ -1078,8 +992,8 @@ EOF
 }
 
 # ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99
-for i in 0 1 2; do
-ip netns exec ns$i-$sfx nft -f /dev/stdin <<EOF
+for i in "$ns0" "$ns1" "$ns2" ;do
+ip netns exec "$i" nft -f /dev/stdin <<EOF
 table inet filter {
 	counter ns0in {}
 	counter ns1in {}
@@ -1145,7 +1059,7 @@ done
 
 # special case for stateless nat check, counter needs to
 # be done before (input) ip defragmentation
-ip netns exec ns1-$sfx nft -f /dev/stdin <<EOF
+ip netns exec "$ns1" nft -f /dev/stdin <<EOF
 table inet filter {
 	counter ns0insl {}
 
@@ -1156,31 +1070,49 @@ table inet filter {
 }
 EOF
 
-sleep 3
-# test basic connectivity
-for i in 1 2; do
-  ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 > /dev/null
-  if [ $? -ne 0 ];then
-  	echo "ERROR: Could not reach other namespace(s)" 1>&2
-	ret=1
-  fi
-
-  ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null
-  if [ $? -ne 0 ];then
-	echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2
-	ret=1
-  fi
-  check_counters ns$i-$sfx
-  if [ $? -ne 0 ]; then
-	ret=1
-  fi
-
-  check_ns0_counters ns$i
-  if [ $? -ne 0 ]; then
-	ret=1
-  fi
-  reset_counters
-done
+ping_basic()
+{
+	i="$1"
+	if ! ip netns exec "$ns0" ping -c 1 -q 10.0."$i".99 > /dev/null;then
+		echo "ERROR: Could not reach other namespace(s)" 1>&2
+		ret=1
+	fi
+
+	if ! ip netns exec "$ns0" ping -c 1 -q dead:"$i"::99 > /dev/null;then
+		echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2
+		ret=1
+	fi
+}
+
+test_basic_conn()
+{
+	local nsexec
+	name="$1"
+
+	nsexec=$(eval echo \$"$1")
+
+	ping_basic 1
+	ping_basic 2
+
+	if ! check_counters "$nsexec";then
+		return 1
+	fi
+
+	if ! check_ns0_counters "$name";then
+		return 1
+	fi
+
+	reset_counters
+	return 0
+}
+
+if ! test_basic_conn "ns1" ; then
+	echo "ERROR: basic test for ns1 failed" 1>&2
+	exit 1
+fi
+if ! test_basic_conn "ns2"; then
+	echo "ERROR: basic test for ns1 failed" 1>&2
+fi
 
 if [ $ret -eq 0 ];then
 	echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2"
diff --git a/tools/testing/selftests/netfilter/nft_nat_zones.sh b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh
index b9ab37380f33..3b81d88bdde3 100755
--- a/tools/testing/selftests/netfilter/nft_nat_zones.sh
+++ b/tools/testing/selftests/net/netfilter/nft_nat_zones.sh
@@ -3,17 +3,17 @@
 # Test connection tracking zone and NAT source port reallocation support.
 #
 
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
+source lib.sh
 
 # Don't increase too much, 2000 clients should work
 # just fine but script can then take several minutes with
 # KASAN/debug builds.
 maxclients=100
 
-have_iperf=1
+have_socat=0
 ret=0
 
+[ "$KSFT_MACHINE_SLOW" = yes ] && maxclients=40
 # client1---.
 #            veth1-.
 #                  |
@@ -31,12 +31,6 @@ ret=0
 #   NAT Gateway is supposed to do port reallocation for each of the
 #   connections.
 
-sfx=$(mktemp -u "XXXXXXXX")
-gw="ns-gw-$sfx"
-cl1="ns-cl1-$sfx"
-cl2="ns-cl2-$sfx"
-srv="ns-srv-$sfx"
-
 v4gc1=$(sysctl -n net.ipv4.neigh.default.gc_thresh1 2>/dev/null)
 v4gc2=$(sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null)
 v4gc3=$(sysctl -n net.ipv4.neigh.default.gc_thresh3 2>/dev/null)
@@ -46,61 +40,29 @@ v6gc3=$(sysctl -n net.ipv6.neigh.default.gc_thresh3 2>/dev/null)
 
 cleanup()
 {
-	ip netns del $gw
-	ip netns del $srv
-	for i in $(seq 1 $maxclients); do
-		ip netns del ns-cl$i-$sfx 2>/dev/null
-	done
-
-	sysctl -q net.ipv4.neigh.default.gc_thresh1=$v4gc1 2>/dev/null
-	sysctl -q net.ipv4.neigh.default.gc_thresh2=$v4gc2 2>/dev/null
-	sysctl -q net.ipv4.neigh.default.gc_thresh3=$v4gc3 2>/dev/null
-	sysctl -q net.ipv6.neigh.default.gc_thresh1=$v6gc1 2>/dev/null
-	sysctl -q net.ipv6.neigh.default.gc_thresh2=$v6gc2 2>/dev/null
-	sysctl -q net.ipv6.neigh.default.gc_thresh3=$v6gc3 2>/dev/null
+	cleanup_all_ns
+
+	sysctl -q net.ipv4.neigh.default.gc_thresh1="$v4gc1" 2>/dev/null
+	sysctl -q net.ipv4.neigh.default.gc_thresh2="$v4gc2" 2>/dev/null
+	sysctl -q net.ipv4.neigh.default.gc_thresh3="$v4gc3" 2>/dev/null
+	sysctl -q net.ipv6.neigh.default.gc_thresh1="$v6gc1" 2>/dev/null
+	sysctl -q net.ipv6.neigh.default.gc_thresh2="$v6gc2" 2>/dev/null
+	sysctl -q net.ipv6.neigh.default.gc_thresh3="$v6gc3" 2>/dev/null
 }
 
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
+checktool "nft --version" echo "run test without nft tool"
+checktool "conntrack -V" "run test without conntrack tool"
 
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
+if socat -h >/dev/null 2>&1; then
+	have_socat=1
 fi
 
-conntrack -V > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without conntrack tool"
-	exit $ksft_skip
-fi
-
-iperf3 -v >/dev/null 2>&1
-if [ $? -ne 0 ];then
-	have_iperf=0
-fi
-
-ip netns add "$gw"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $gw"
-	exit $ksft_skip
-fi
-ip -net "$gw" link set lo up
+setup_ns gw srv
 
 trap cleanup EXIT
 
-ip netns add "$srv"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create server netns $srv"
-	exit $ksft_skip
-fi
-
 ip link add veth0 netns "$gw" type veth peer name eth0 netns "$srv"
 ip -net "$gw" link set veth0 up
-ip -net "$srv" link set lo up
 ip -net "$srv" link set eth0 up
 
 sysctl -q net.ipv6.neigh.default.gc_thresh1=512  2>/dev/null
@@ -110,55 +72,49 @@ sysctl -q net.ipv4.neigh.default.gc_thresh1=512  2>/dev/null
 sysctl -q net.ipv4.neigh.default.gc_thresh2=1024 2>/dev/null
 sysctl -q net.ipv4.neigh.default.gc_thresh3=4096 2>/dev/null
 
-for i in $(seq 1 $maxclients);do
-  cl="ns-cl$i-$sfx"
+for i in $(seq 1 "$maxclients");do
+  setup_ns "cl$i"
 
-  ip netns add "$cl"
-  if [ $? -ne 0 ];then
-     echo "SKIP: Could not create client netns $cl"
-     exit $ksft_skip
-  fi
-  ip link add veth$i netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1
-  if [ $? -ne 0 ];then
+  cl=$(eval echo \$cl"$i")
+  if ! ip link add veth"$i" netns "$gw" type veth peer name eth0 netns "$cl" > /dev/null 2>&1;then
     echo "SKIP: No virtual ethernet pair device support in kernel"
     exit $ksft_skip
   fi
 done
 
-for i in $(seq 1 $maxclients);do
-  cl="ns-cl$i-$sfx"
-  echo netns exec "$cl" ip link set lo up
+for i in $(seq 1 "$maxclients");do
+  cl=$(eval echo \$cl"$i")
   echo netns exec "$cl" ip link set eth0 up
   echo netns exec "$cl" sysctl -q net.ipv4.tcp_syn_retries=2
-  echo netns exec "$gw" ip link set veth$i up
-  echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.arp_ignore=2
-  echo netns exec "$gw" sysctl -q net.ipv4.conf.veth$i.rp_filter=0
+  echo netns exec "$gw" ip link set "veth$i" up
+  echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".arp_ignore=2
+  echo netns exec "$gw" sysctl -q net.ipv4.conf.veth"$i".rp_filter=0
 
   # clients have same IP addresses.
   echo netns exec "$cl" ip addr add 10.1.0.3/24 dev eth0
-  echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0
+  echo netns exec "$cl" ip addr add dead:1::3/64 dev eth0 nodad
   echo netns exec "$cl" ip route add default via 10.1.0.2 dev eth0
   echo netns exec "$cl" ip route add default via dead:1::2 dev eth0
 
   # NB: same addresses on client-facing interfaces.
-  echo netns exec "$gw" ip addr add 10.1.0.2/24 dev veth$i
-  echo netns exec "$gw" ip addr add dead:1::2/64 dev veth$i
+  echo netns exec "$gw" ip addr add 10.1.0.2/24 dev "veth$i"
+  echo netns exec "$gw" ip addr add dead:1::2/64 dev "veth$i" nodad
 
   # gw: policy routing
-  echo netns exec "$gw" ip route add 10.1.0.0/24 dev veth$i table $((1000+i))
-  echo netns exec "$gw" ip route add dead:1::0/64 dev veth$i table $((1000+i))
+  echo netns exec "$gw" ip route add 10.1.0.0/24 dev "veth$i" table $((1000+i))
+  echo netns exec "$gw" ip route add dead:1::0/64 dev "veth$i" table $((1000+i))
   echo netns exec "$gw" ip route add 10.3.0.0/24 dev veth0 table $((1000+i))
   echo netns exec "$gw" ip route add dead:3::0/64 dev veth0 table $((1000+i))
-  echo netns exec "$gw" ip rule add fwmark $i lookup $((1000+i))
+  echo netns exec "$gw" ip rule add fwmark "$i" lookup $((1000+i))
 done | ip -batch /dev/stdin
 
 ip -net "$gw" addr add 10.3.0.1/24 dev veth0
-ip -net "$gw" addr add dead:3::1/64 dev veth0
+ip -net "$gw" addr add dead:3::1/64 dev veth0 nodad
 
 ip -net "$srv" addr add 10.3.0.99/24 dev eth0
-ip -net "$srv" addr add dead:3::99/64 dev eth0
+ip -net "$srv" addr add dead:3::99/64 dev eth0 nodad
 
-ip netns exec $gw nft -f /dev/stdin<<EOF
+ip netns exec "$gw" nft -f /dev/stdin<<EOF
 table inet raw {
 	map iiftomark {
 		type ifname : mark
@@ -203,18 +159,22 @@ table inet raw {
 	}
 }
 EOF
+if [ "$?" -ne 0 ];then
+	echo "SKIP: Could not add nftables rules"
+	exit $ksft_skip
+fi
 
 ( echo add element inet raw iiftomark \{
 	for i in $(seq 1 $((maxclients-1))); do
-		echo \"veth$i\" : $i,
+		echo \"veth"$i"\" : "$i",
 	done
-	echo \"veth$maxclients\" : $maxclients \}
+	echo \"veth"$maxclients"\" : "$maxclients" \}
 	echo add element inet raw iiftozone \{
 	for i in $(seq 1 $((maxclients-1))); do
-		echo \"veth$i\" : $i,
+		echo \"veth"$i"\" : "$i",
 	done
 	echo \"veth$maxclients\" : $maxclients \}
-) | ip netns exec $gw nft -f /dev/stdin
+) | ip netns exec "$gw" nft -f /dev/stdin
 
 ip netns exec "$gw" sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null
 ip netns exec "$gw" sysctl -q net.ipv6.conf.all.forwarding=1 > /dev/null
@@ -224,73 +184,72 @@ ip netns exec "$gw" sysctl -q net.ipv4.conf.all.rp_filter=0 >/dev/null
 ip netns exec "$gw" sysctl -q net.ipv4.fwmark_reflect=1 > /dev/null
 ip netns exec "$gw" sysctl -q net.ipv6.fwmark_reflect=1 > /dev/null
 
-for i in $(seq 1 $maxclients); do
-  cl="ns-cl$i-$sfx"
-  ip netns exec $cl ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 &
-  if [ $? -ne 0 ]; then
-     echo FAIL: Ping failure from $cl 1>&2
-     ret=1
-     break
-  fi
+for i in $(seq 1 "$maxclients"); do
+  cl=$(eval echo \$cl"$i")
+  ip netns exec "$cl" ping -i 0.5 -q -c 3 10.3.0.99 > /dev/null 2>&1 &
 done
 
-wait
+wait || ret=1
 
-for i in $(seq 1 $maxclients); do
-   ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"
-   if [ $? -ne 0 ];then
+[ "$ret" -ne 0 ] && "FAIL: Ping failure from $cl" 1>&2
+
+for i in $(seq 1 "$maxclients"); do
+   if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" | grep -q "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 counter packets 3 bytes 252 }"; then
       ret=1
       echo "FAIL: counter icmp mismatch for veth$i" 1>&2
-      ip netns exec $gw nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2
+      ip netns exec "$gw" nft get element inet raw inicmp "{ 10.1.0.3 . \"veth$i\" . 10.3.0.99 }" 1>&2
       break
    fi
 done
 
-ip netns exec $gw nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }"
-if [ $? -ne 0 ];then
+if ! ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 }" | grep -q "{ 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"; then
     ret=1
-    echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * $maxclients)) bytes $((252 * $maxclients)) }"
-    ip netns exec $gw nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2
+    echo "FAIL: counter icmp mismatch for veth0: { 10.3.0.99 . \"veth0\" . 10.3.0.1 counter packets $((3 * maxclients)) bytes $((252 * maxclients)) }"
+    ip netns exec "$gw" nft get element inet raw inicmp "{ 10.3.99 . \"veth0\" . 10.3.0.1 }" 1>&2
 fi
 
-if  [ $ret -eq 0 ]; then
+if [ $ret -eq 0 ]; then
 	echo "PASS: ping test from all $maxclients namespaces"
 fi
 
-if [ $have_iperf -eq 0 ];then
-	echo "SKIP: iperf3 not installed"
+if [ $have_socat -eq 0 ];then
+	echo "SKIP: socat not installed"
 	if [ $ret -ne 0 ];then
 	    exit $ret
 	fi
 	exit $ksft_skip
 fi
 
-ip netns exec $srv iperf3 -s > /dev/null 2>&1 &
-iperfpid=$!
-sleep 1
+listener_ready()
+{
+	ss -N "$1" -lnt -o "sport = :5201" | grep -q 5201
+}
+
+ip netns exec "$srv" socat -u TCP-LISTEN:5201,fork STDOUT > /dev/null 2>/dev/null &
+socatpid=$!
+
+busywait 1000 listener_ready "$srv"
 
-for i in $(seq 1 $maxclients); do
+for i in $(seq 1 "$maxclients"); do
   if [ $ret -ne 0 ]; then
      break
   fi
-  cl="ns-cl$i-$sfx"
-  ip netns exec $cl iperf3 -c 10.3.0.99 --cport 10000 -n 1 > /dev/null
-  if [ $? -ne 0 ]; then
-     echo FAIL: Failure to connect for $cl 1>&2
-     ip netns exec $gw conntrack -S 1>&2
+  cl=$(eval echo \$cl"$i")
+  if ! ip netns exec "$cl" socat -4 -u STDIN TCP:10.3.0.99:5201,sourceport=10000 < /dev/null > /dev/null; then
+     echo "FAIL: Failure to connect for $cl" 1>&2
+     ip netns exec "$gw" conntrack -S 1>&2
      ret=1
   fi
 done
 if [ $ret -eq 0 ];then
-	echo "PASS: iperf3 connections for all $maxclients net namespaces"
+	echo "PASS: socat connections for all $maxclients net namespaces"
 fi
 
-kill $iperfpid
+kill $socatpid
 wait
 
-for i in $(seq 1 $maxclients); do
-   ip netns exec $gw nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null
-   if [ $? -ne 0 ];then
+for i in $(seq 1 "$maxclients"); do
+   if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.1.0.3 . 10000 . \"veth$i\" . 10.3.0.99 . 5201 }" > /dev/null;then
       ret=1
       echo "FAIL: can't find expected tcp entry for veth$i" 1>&2
       break
@@ -300,8 +259,7 @@ if [ $ret -eq 0 ];then
 	echo "PASS: Found client connection for all $maxclients net namespaces"
 fi
 
-ip netns exec $gw nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null
-if [ $? -ne 0 ];then
+if ! ip netns exec "$gw" nft get element inet raw inflows "{ 10.3.0.99 . 5201 . \"veth0\" . 10.3.0.1 . 10000 }" > /dev/null;then
     ret=1
     echo "FAIL: cannot find return entry on veth0" 1>&2
 fi
diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh
new file mode 100755
index 000000000000..8538f08c64c2
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_queue.sh
@@ -0,0 +1,417 @@
+#!/bin/bash
+#
+# This tests nf_queue:
+# 1. can process packets from all hooks
+# 2. support running nfqueue from more than one base chain
+#
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+timeout=2
+
+cleanup()
+{
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+	ip netns pids "$nsrouter" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+
+	rm -f "$TMPINPUT"
+	rm -f "$TMPFILE0"
+	rm -f "$TMPFILE1"
+	rm -f "$TMPFILE2" "$TMPFILE3"
+}
+
+checktool "nft --version" "test without nft tool"
+
+trap cleanup EXIT
+
+setup_ns ns1 ns2 nsrouter
+
+TMPFILE0=$(mktemp)
+TMPFILE1=$(mktemp)
+TMPFILE2=$(mktemp)
+TMPFILE3=$(mktemp)
+
+TMPINPUT=$(mktemp)
+dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT"
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+load_ruleset() {
+	local name=$1
+	local prio=$2
+
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+table inet $name {
+	chain nfq {
+		ip protocol icmp queue bypass
+		icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass
+	}
+	chain pre {
+		type filter hook prerouting priority $prio; policy accept;
+		jump nfq
+	}
+	chain input {
+		type filter hook input priority $prio; policy accept;
+		jump nfq
+	}
+	chain forward {
+		type filter hook forward priority $prio; policy accept;
+		tcp dport 12345 queue num 2
+		jump nfq
+	}
+	chain output {
+		type filter hook output priority $prio; policy accept;
+		tcp dport 12345 queue num 3
+		tcp sport 23456 queue num 3
+		jump nfq
+	}
+	chain post {
+		type filter hook postrouting priority $prio; policy accept;
+		jump nfq
+	}
+}
+EOF
+}
+
+load_counter_ruleset() {
+	local prio=$1
+
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+table inet countrules {
+	chain pre {
+		type filter hook prerouting priority $prio; policy accept;
+		counter
+	}
+	chain input {
+		type filter hook input priority $prio; policy accept;
+		counter
+	}
+	chain forward {
+		type filter hook forward priority $prio; policy accept;
+		counter
+	}
+	chain output {
+		type filter hook output priority $prio; policy accept;
+		counter
+	}
+	chain post {
+		type filter hook postrouting priority $prio; policy accept;
+		counter
+	}
+}
+EOF
+}
+
+test_ping() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+	return 1
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+	return 2
+  fi
+
+  return 0
+}
+
+test_ping_router() {
+  if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+	return 3
+  fi
+
+  if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+	return 4
+  fi
+
+  return 0
+}
+
+test_queue_blackhole() {
+	local proto=$1
+
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+table $proto blackh {
+	chain forward {
+	type filter hook forward priority 0; policy accept;
+		queue num 600
+	}
+}
+EOF
+	if [ "$proto" = "ip" ] ;then
+		ip netns exec "$ns1" ping -W 2 -c 1 -q 10.0.2.99 > /dev/null
+		lret=$?
+	elif [ "$proto" = "ip6" ]; then
+		ip netns exec "$ns1" ping -W 2 -c 1 -q dead:2::99 > /dev/null
+		lret=$?
+	else
+		lret=111
+	fi
+
+	# queue without bypass keyword should drop traffic if no listener exists.
+	if [ "$lret" -eq 0 ];then
+		echo "FAIL: $proto expected failure, got $lret" 1>&2
+		exit 1
+	fi
+
+	if ! ip netns exec "$nsrouter" nft delete table "$proto" blackh; then
+	        echo "FAIL: $proto: Could not delete blackh table"
+	        exit 1
+	fi
+
+        echo "PASS: $proto: statement with no listener results in packet drop"
+}
+
+nf_queue_wait()
+{
+	local procfile="/proc/self/net/netfilter/nfnetlink_queue"
+	local netns id
+
+	netns="$1"
+	id="$2"
+
+	# if this file doesn't exist, nfnetlink_module isn't loaded.
+	# rather than loading it ourselves, wait for kernel module autoload
+	# completion, nfnetlink should do so automatically because nf_queue
+	# helper program, spawned in the background, asked for this functionality.
+	test -f "$procfile" &&
+		ip netns exec "$netns" cat "$procfile" | grep -q "^ *$id "
+}
+
+test_queue()
+{
+	local expected="$1"
+	local last=""
+
+	# spawn nf_queue listeners
+	ip netns exec "$nsrouter" ./nf_queue -c -q 0 -t $timeout > "$TMPFILE0" &
+	ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t $timeout > "$TMPFILE1" &
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 0
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1
+
+	if ! test_ping;then
+		echo "FAIL: netns routing/connectivity with active listener on queues 0 and 1: $ret" 1>&2
+		exit $ret
+	fi
+
+	if ! test_ping_router;then
+		echo "FAIL: netns router unreachable listener on queue 0 and 1: $ret" 1>&2
+		exit $ret
+	fi
+
+	wait
+	ret=$?
+
+	for file in $TMPFILE0 $TMPFILE1; do
+		last=$(tail -n1 "$file")
+		if [ x"$last" != x"$expected packets total" ]; then
+			echo "FAIL: Expected $expected packets total, but got $last" 1>&2
+			ip netns exec "$nsrouter" nft list ruleset
+			exit 1
+		fi
+	done
+
+	echo "PASS: Expected and received $last"
+}
+
+listener_ready()
+{
+	ss -N "$1" -lnt -o "sport = :12345" | grep -q 12345
+}
+
+test_tcp_forward()
+{
+	ip netns exec "$nsrouter" ./nf_queue -q 2 -t "$timeout" &
+	local nfqpid=$!
+
+	timeout 5 ip netns exec "$ns2" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
+	local rpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2"
+
+	ip netns exec "$ns1" socat -u STDIN TCP:10.0.2.99:12345 <"$TMPINPUT" >/dev/null
+
+	wait "$rpid" && echo "PASS: tcp and nfqueue in forward chain"
+}
+
+test_tcp_localhost()
+{
+	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of="$TMPINPUT"
+	timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
+	local rpid=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" &
+	local nfqpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter"
+
+	ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" >/dev/null
+
+	wait "$rpid" && echo "PASS: tcp via loopback"
+	wait 2>/dev/null
+}
+
+test_tcp_localhost_connectclose()
+{
+	ip netns exec "$nsrouter" ./connect_close -p 23456 -t "$timeout" &
+	ip netns exec "$nsrouter" ./nf_queue -q 3 -t "$timeout" &
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 3
+
+	wait && echo "PASS: tcp via loopback with connect/close"
+	wait 2>/dev/null
+}
+
+test_tcp_localhost_requeue()
+{
+ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+	}
+	chain post {
+		type filter hook postrouting priority 0; policy accept;
+		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+	}
+}
+EOF
+	timeout 5 ip netns exec "$nsrouter" socat -u TCP-LISTEN:12345 STDOUT >/dev/null &
+	local rpid=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -c -q 1 -t "$timeout" > "$TMPFILE2" &
+
+	# nfqueue 1 will be called via output hook.  But this time,
+        # re-queue the packet to nfqueue program on queue 2.
+	ip netns exec "$nsrouter" ./nf_queue -G -d 150 -c -q 0 -Q 1 -t "$timeout" > "$TMPFILE3" &
+
+	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter"
+	ip netns exec "$nsrouter" socat -u STDIN TCP:127.0.0.1:12345 <"$TMPINPUT" > /dev/null
+
+	wait
+
+	if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then
+		echo "FAIL: lost packets during requeue?!" 1>&2
+		return
+	fi
+
+	echo "PASS: tcp via loopback and re-queueing"
+}
+
+test_icmp_vrf() {
+	if ! ip -net "$ns1" link add tvrf type vrf table 9876;then
+		echo "SKIP: Could not add vrf device"
+		return
+	fi
+
+	ip -net "$ns1" li set eth0 master tvrf
+	ip -net "$ns1" li set tvrf up
+
+	ip -net "$ns1" route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876
+ip netns exec "$ns1" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		meta oifname "tvrf" icmp type echo-request counter queue num 1
+		meta oifname "eth0" icmp type echo-request counter queue num 1
+	}
+	chain post {
+		type filter hook postrouting priority 0; policy accept;
+		meta oifname "tvrf" icmp type echo-request counter queue num 1
+		meta oifname "eth0" icmp type echo-request counter queue num 1
+	}
+}
+EOF
+	ip netns exec "$ns1" ./nf_queue -q 1 -t "$timeout" &
+	local nfqpid=$!
+
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$ns1" 1
+
+	ip netns exec "$ns1" ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null
+
+	for n in output post; do
+		for d in tvrf eth0; do
+			if ! ip netns exec "$ns1" nft list chain inet filter "$n" | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"; then
+				echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2
+				ip netns exec "$ns1" nft list ruleset
+				ret=1
+				return
+			fi
+		done
+	done
+
+	wait "$nfqpid" && echo "PASS: icmp+nfqueue via vrf"
+	wait 2>/dev/null
+}
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+load_ruleset "filter" 0
+
+if test_ping; then
+	# queue bypass works (rules were skipped, no listener)
+	echo "PASS: ${ns1} can reach ${ns2}"
+else
+	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+	exit $ret
+fi
+
+test_queue_blackhole ip
+test_queue_blackhole ip6
+
+# dummy ruleset to add base chains between the
+# queueing rules.  We don't want the second reinject
+# to re-execute the old hooks.
+load_counter_ruleset 10
+
+# we are hooking all: prerouting/input/forward/output/postrouting.
+# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so:
+# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply).
+# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply.
+# so we expect that userspace program receives 10 packets.
+test_queue 10
+
+# same.  We queue to a second program as well.
+load_ruleset "filter2" 20
+test_queue 20
+
+test_tcp_forward
+test_tcp_localhost
+test_tcp_localhost_connectclose
+test_tcp_localhost_requeue
+test_icmp_vrf
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_synproxy.sh b/tools/testing/selftests/net/netfilter/nft_synproxy.sh
new file mode 100755
index 000000000000..293f667a6aec
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_synproxy.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+ret=0
+
+checktool "nft --version" "run test without nft tool"
+checktool "iperf3 --version" "run test without iperf3"
+
+setup_ns nsr ns1 ns2
+
+modprobe -q nf_conntrack
+
+cleanup() {
+	ip netns pids "$ns1" | xargs kill 2>/dev/null
+	ip netns pids "$ns2" | xargs kill 2>/dev/null
+
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+ip link add veth0 netns "$nsr" type veth peer name eth0 netns "$ns1"
+ip link add veth1 netns "$nsr" type veth peer name eth0 netns "$ns2"
+
+for dev in veth0 veth1; do
+	ip -net "$nsr" link set "$dev" up
+done
+
+ip -net "$nsr" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsr" addr add 10.0.2.1/24 dev veth1
+
+ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth0.forwarding=1
+ip netns exec "$nsr" sysctl -q net.ipv4.conf.veth1.forwarding=1
+ip netns exec "$nsr" sysctl -q net.netfilter.nf_conntrack_tcp_loose=0
+
+for n in $ns1 $ns2; do
+  ip -net "$n" link set eth0 up
+done
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns2" route add default via 10.0.2.1
+
+# test basic connectivity
+if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+  echo "ERROR: $ns1 cannot reach $ns2" 1>&2
+  exit 1
+fi
+
+if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
+  echo "ERROR: $ns2 cannot reach $ns1" 1>&2
+  exit 1
+fi
+
+ip netns exec "$ns2" iperf3 -s > /dev/null 2>&1 &
+# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 &
+
+sleep 1
+
+ip netns exec "$nsr" nft -f - <<EOF
+table inet filter {
+   chain prerouting {
+      type filter hook prerouting priority -300; policy accept;
+      meta iif veth0 tcp flags syn counter notrack
+   }
+
+  chain forward {
+      type filter hook forward priority 0; policy accept;
+
+      ct state new,established counter accept
+
+      meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp
+
+      ct state invalid counter drop
+
+      # make ns2 unreachable w.o. tcp synproxy
+      tcp flags syn counter drop
+   }
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "SKIP: Cannot add nft synproxy"
+	exit $ksft_skip
+fi
+
+if ! ip netns exec "$ns1" timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null; then
+	echo "FAIL: iperf3 returned an error" 1>&2
+	ret=1
+	ip netns exec "$nsr" nft list ruleset
+else
+	echo "PASS: synproxy connection successful"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_zones_many.sh b/tools/testing/selftests/net/netfilter/nft_zones_many.sh
index 5a8db0b48928..7db9982ba5a6 100755
--- a/tools/testing/selftests/netfilter/nft_zones_many.sh
+++ b/tools/testing/selftests/net/netfilter/nft_zones_many.sh
@@ -3,47 +3,34 @@
 # Test insertion speed for packets with identical addresses/ports
 # that are all placed in distinct conntrack zones.
 
-sfx=$(mktemp -u "XXXXXXXX")
-ns="ns-$sfx"
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
+source lib.sh
 
 zones=2000
+[ "$KSFT_MACHINE_SLOW" = yes ] && zones=500
+
 have_ct_tool=0
 ret=0
 
 cleanup()
 {
-	ip netns del $ns
-}
-
-checktool (){
-	if ! $1 > /dev/null 2>&1; then
-		echo "SKIP: Could not $2"
-		exit $ksft_skip
-	fi
+	cleanup_all_ns
 }
 
 checktool "nft --version" "run test without nft tool"
-checktool "ip -Version" "run test without ip tool"
 checktool "socat -V" "run test without socat tool"
-checktool "ip netns add $ns" "create net namespace"
+
+setup_ns ns1
 
 trap cleanup EXIT
 
-conntrack -V > /dev/null 2>&1
-if [ $? -eq 0 ];then
+if conntrack -V > /dev/null 2>&1; then
 	have_ct_tool=1
 fi
 
-ip -net "$ns" link set lo up
-
 test_zones() {
 	local max_zones=$1
 
-ip netns exec $ns sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600
-ip netns exec $ns nft -f /dev/stdin<<EOF
+ip netns exec "$ns1" nft -f /dev/stdin<<EOF
 flush ruleset
 table inet raw {
 	map rndzone {
@@ -56,29 +43,39 @@ table inet raw {
 	}
 }
 EOF
+if [ "$?" -ne 0 ];then
+	echo "SKIP: Cannot add nftables rules"
+	exit $ksft_skip
+fi
+
+	ip netns exec "$ns1" sysctl -q net.netfilter.nf_conntrack_udp_timeout=3600
+
 	(
 		echo "add element inet raw rndzone {"
-	for i in $(seq 1 $max_zones);do
+	for i in $(seq 1 "$max_zones");do
 		echo -n "$i : $i"
-		if [ $i -lt $max_zones ]; then
+		if [ "$i" -lt "$max_zones" ]; then
 			echo ","
 		else
 			echo "}"
 		fi
 	done
-	) | ip netns exec $ns nft -f /dev/stdin
+	) | ip netns exec "$ns1" nft -f /dev/stdin
 
 	local i=0
 	local j=0
-	local outerstart=$(date +%s%3N)
-	local stop=$outerstart
-
-	while [ $i -lt $max_zones ]; do
-		local start=$(date +%s%3N)
+	local outerstart
+	local stop
+	outerstart=$(date +%s%3N)
+	stop=$outerstart
+
+	while [ "$i" -lt "$max_zones" ]; do
+		local start
+		start=$(date +%s%3N)
 		i=$((i + 1000))
 		j=$((j + 1))
 		# nft rule in output places each packet in a different zone.
-		dd if=/dev/zero of=/dev/stdout bs=8k count=1000 2>/dev/null | ip netns exec "$ns" socat STDIN UDP:127.0.0.1:12345,sourceport=12345
+		dd if=/dev/zero bs=8k count=1000 2>/dev/null | ip netns exec "$ns1" socat -u STDIN UDP:127.0.0.1:12345,sourceport=12345
 		if [ $? -ne 0 ] ;then
 			ret=1
 			break
@@ -89,14 +86,15 @@ EOF
 		echo "PASS: added 1000 entries in $duration ms (now $i total, loop $j)"
 	done
 
-	if [ $have_ct_tool -eq 1 ]; then
-		local count=$(ip netns exec "$ns" conntrack -C)
-		local duration=$((stop-outerstart))
+	if [ "$have_ct_tool" -eq 1 ]; then
+		local count duration
+		count=$(ip netns exec "$ns1" conntrack -C)
+		duration=$((stop-outerstart))
 
-		if [ $count -eq $max_zones ]; then
+		if [ "$count" -ge "$max_zones" ]; then
 			echo "PASS: inserted $count entries from packet path in $duration ms total"
 		else
-			ip netns exec $ns conntrack -S 1>&2
+			ip netns exec "$ns1" conntrack -S 1>&2
 			echo "FAIL: inserted $count entries from packet path in $duration ms total, expected $max_zones entries"
 			ret=1
 		fi
@@ -110,18 +108,19 @@ EOF
 test_conntrack_tool() {
 	local max_zones=$1
 
-	ip netns exec $ns conntrack -F >/dev/null 2>/dev/null
+	ip netns exec "$ns1" conntrack -F >/dev/null 2>/dev/null
 
-	local outerstart=$(date +%s%3N)
-	local start=$(date +%s%3N)
-	local stop=$start
-	local i=0
-	while [ $i -lt $max_zones ]; do
+	local outerstart start stop i
+	outerstart=$(date +%s%3N)
+	start=$(date +%s%3N)
+	stop="$start"
+	i=0
+	while [ "$i" -lt "$max_zones" ]; do
 		i=$((i + 1))
-		ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
+		ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
 	                 --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i >/dev/null 2>&1
 		if [ $? -ne 0 ];then
-			ip netns exec "$ns" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
+			ip netns exec "$ns1" conntrack -I -s 1.1.1.1 -d 2.2.2.2 --protonum 6 \
 	                 --timeout 3600 --state ESTABLISHED --sport 12345 --dport 1000 --zone $i > /dev/null
 			echo "FAIL: conntrack -I returned an error"
 			ret=1
@@ -137,13 +136,15 @@ test_conntrack_tool() {
 		fi
 	done
 
-	local count=$(ip netns exec "$ns" conntrack -C)
-	local duration=$((stop-outerstart))
+	local count
+	local duration
+	count=$(ip netns exec "$ns1" conntrack -C)
+	duration=$((stop-outerstart))
 
-	if [ $count -eq $max_zones ]; then
+	if [ "$count" -eq "$max_zones" ]; then
 		echo "PASS: inserted $count entries via ctnetlink in $duration ms"
 	else
-		ip netns exec $ns conntrack -S 1>&2
+		ip netns exec "$ns1" conntrack -S 1>&2
 		echo "FAIL: inserted $count entries via ctnetlink in $duration ms, expected $max_zones entries ($duration ms)"
 		ret=1
 	fi
@@ -151,7 +152,7 @@ test_conntrack_tool() {
 
 test_zones $zones
 
-if [ $have_ct_tool -eq 1 ];then
+if [ "$have_ct_tool" -eq 1 ];then
 	test_conntrack_tool $zones
 else
 	echo "SKIP: Could not run ctnetlink insertion test without conntrack tool"
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh
new file mode 100755
index 000000000000..ed36d535196d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# for debugging set net.netfilter.nf_log_all_netns=1 in init_net
+# or do not use net namespaces.
+modprobe -q nf_conntrack
+sysctl -q net.netfilter.nf_conntrack_log_invalid=6
+
+# Flush old cached data (fastopen cookies).
+ip tcp_metrics flush all > /dev/null 2>&1
+
+# TCP min, default, and max receive and send buffer sizes.
+sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))"
+sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304"
+
+# TCP congestion control.
+sysctl -q net.ipv4.tcp_congestion_control=cubic
+
+# TCP slow start after idle.
+sysctl -q net.ipv4.tcp_slow_start_after_idle=0
+
+# TCP Explicit Congestion Notification (ECN)
+sysctl -q net.ipv4.tcp_ecn=0
+
+sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1
+
+# Override the default qdisc on the tun device.
+# Many tests fail with timing errors if the default
+# is FQ and that paces their flows.
+tc qdisc add dev tun0 root pfifo
+
+# Enable conntrack
+$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt
new file mode 100644
index 000000000000..d755bd64c54f
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt
@@ -0,0 +1,118 @@
+// check that already-acked (retransmitted) packet is let through rather
+// than tagged as INVALID.
+
+`packetdrill/common.sh`
+
+// should set -P DROP but it disconnects VM w.o. extra netns
++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 10) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000>
++0 > S. 0:0(0) ack 1 <mss 1460>
++.01 < . 1:1(0) ack 1 win 65535
++0 accept(3, ..., ...) = 4
+
++0.0001 < P. 1:1461(1460) ack 1 win 257
++.0 > . 1:1(0) ack 1461 win 65535
++0.0001 < P. 1461:2921(1460) ack 1 win 257
++.0 > . 1:1(0) ack 2921 win 65535
++0.0001 < P. 2921:4381(1460) ack 1 win 257
++.0 > . 1:1(0) ack 4381 win 65535
++0.0001 < P. 4381:5841(1460) ack 1 win 257
++.0 > . 1:1(0) ack 5841 win 65535
++0.0001 < P. 5841:7301(1460) ack 1 win 257
++.0 > . 1:1(0) ack 7301 win 65535
++0.0001 < P. 7301:8761(1460) ack 1 win 257
++.0 > . 1:1(0) ack 8761 win 65535
++0.0001 < P. 8761:10221(1460) ack 1 win 257
++.0 > . 1:1(0) ack 10221 win 65535
++0.0001 < P. 10221:11681(1460) ack 1 win 257
++.0 > . 1:1(0) ack 11681 win 65535
++0.0001 < P. 11681:13141(1460) ack 1 win 257
++.0 > . 1:1(0) ack 13141 win 65535
++0.0001 < P. 13141:14601(1460) ack 1 win 257
++.0 > . 1:1(0) ack 14601 win 65535
++0.0001 < P. 14601:16061(1460) ack 1 win 257
++.0 > . 1:1(0) ack 16061 win 65535
++0.0001 < P. 16061:17521(1460) ack 1 win 257
++.0 > . 1:1(0) ack 17521 win 65535
++0.0001 < P. 17521:18981(1460) ack 1 win 257
++.0 > . 1:1(0) ack 18981 win 65535
++0.0001 < P. 18981:20441(1460) ack 1 win 257
++.0 > . 1:1(0) ack 20441 win 65535
++0.0001 < P. 20441:21901(1460) ack 1 win 257
++.0 > . 1:1(0) ack 21901 win 65535
++0.0001 < P. 21901:23361(1460) ack 1 win 257
++.0 > . 1:1(0) ack 23361 win 65535
++0.0001 < P. 23361:24821(1460) ack 1 win 257
+0.055 > . 1:1(0) ack 24821 win 65535
++0.0001 < P. 24821:26281(1460) ack 1 win 257
++.0 > . 1:1(0) ack 26281 win 65535
++0.0001 < P. 26281:27741(1460) ack 1 win 257
++.0 > . 1:1(0) ack 27741 win 65535
++0.0001 < P. 27741:29201(1460) ack 1 win 257
++.0 > . 1:1(0) ack 29201 win 65535
++0.0001 < P. 29201:30661(1460) ack 1 win 257
++.0 > . 1:1(0) ack 30661 win 65535
++0.0001 < P. 30661:32121(1460) ack 1 win 257
++.0 > . 1:1(0) ack 32121 win 65535
++0.0001 < P. 32121:33581(1460) ack 1 win 257
++.0 > . 1:1(0) ack 33581 win 65535
++0.0001 < P. 33581:35041(1460) ack 1 win 257
++.0 > . 1:1(0) ack 35041 win 65535
++0.0001 < P. 35041:36501(1460) ack 1 win 257
++.0 > . 1:1(0) ack 36501 win 65535
++0.0001 < P. 36501:37961(1460) ack 1 win 257
++.0 > . 1:1(0) ack 37961 win 65535
++0.0001 < P. 37961:39421(1460) ack 1 win 257
++.0 > . 1:1(0) ack 39421 win 65535
++0.0001 < P. 39421:40881(1460) ack 1 win 257
++.0 > . 1:1(0) ack 40881 win 65535
++0.0001 < P. 40881:42341(1460) ack 1 win 257
++.0 > . 1:1(0) ack 42341 win 65535
++0.0001 < P. 42341:43801(1460) ack 1 win 257
++.0 > . 1:1(0) ack 43801 win 65535
++0.0001 < P. 43801:45261(1460) ack 1 win 257
++.0 > . 1:1(0) ack 45261 win 65535
++0.0001 < P. 45261:46721(1460) ack 1 win 257
++.0 > . 1:1(0) ack 46721 win 65535
++0.0001 < P. 46721:48181(1460) ack 1 win 257
++.0 > . 1:1(0) ack 48181 win 65535
++0.0001 < P. 48181:49641(1460) ack 1 win 257
++.0 > . 1:1(0) ack 49641 win 65535
++0.0001 < P. 49641:51101(1460) ack 1 win 257
++.0 > . 1:1(0) ack 51101 win 65535
++0.0001 < P. 51101:52561(1460) ack 1 win 257
++.0 > . 1:1(0) ack 52561 win 65535
++0.0001 < P. 52561:54021(1460) ack 1 win 257
++.0 > . 1:1(0) ack 54021 win 65535
++0.0001 < P. 54021:55481(1460) ack 1 win 257
++.0 > . 1:1(0) ack 55481 win 65535
++0.0001 < P. 55481:56941(1460) ack 1 win 257
++.0 > . 1:1(0) ack 56941 win 65535
++0.0001 < P. 56941:58401(1460) ack 1 win 257
++.0 > . 1:1(0) ack 58401 win 65535
++0.0001 < P. 58401:59861(1460) ack 1 win 257
++.0 > . 1:1(0) ack 59861 win 65535
++0.0001 < P. 59861:61321(1460) ack 1 win 257
++.0 > . 1:1(0) ack 61321 win 65535
++0.0001 < P. 61321:62781(1460) ack 1 win 257
++.0 > . 1:1(0) ack 62781 win 65535
++0.0001 < P. 62781:64241(1460) ack 1 win 257
++.0 > . 1:1(0) ack 64241 win 65535
++0.0001 < P. 64241:65701(1460) ack 1 win 257
++.0 > . 1:1(0) ack 65701 win 65535
++0.0001 < P. 65701:67161(1460) ack 1 win 257
++.0 > . 1:1(0) ack 67161 win 65535
+
+// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0
++0.0001 < P. 1:1461(1460) ack 1 win 257
+
+// only sent if above packet isn't flagged as invalid
++.0 > . 1:1(0) ack 67161 win 65535
+
++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt
new file mode 100644
index 000000000000..dccdd4c009c6
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt
@@ -0,0 +1,62 @@
+// check RST packet that doesn't exactly match expected next sequence
+// number still transitions conntrack state to CLOSE iff its already in
+// FIN/CLOSE_WAIT.
+
+`packetdrill/common.sh`
+
+//  5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture]
+//  5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture]
+//  5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334
+//  5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture]
+//  5.788126 server_ip > client_ip TLSv1.2 90 Application Data
+//  5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872
+//  5.788447 client_ip > server_ip TLSv1.2 90 Application Data
+//  5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350
+//  5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0
+
++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460>
+
++0 > . 1:1(0) ack 1 win 65535
++0 < . 1:1001(1000) ack 1 win 65535
++0 < . 1001:2001(1000) ack 1 win 65535
++0 < . 2001:3001(1000) ack 1 win 65535
+
++0 > . 1:1(0) ack 1001 win 65535
++0 > . 1:1(0) ack 2001 win 65535
++0 > . 1:1(0) ack 3001 win 65535
+
++0 write(3, ..., 1000) = 1000
+
++0.0 > P. 1:1001(1000) ack 3001 win 65535
+
++0.1 read(3, ..., 1000) = 1000
+
+// Conntrack should move to FIN_WAIT, then CLOSE_WAIT.
++0 < F. 3001:3001(0) ack 1001 win 65535
++0 >  . 1001:1001(0) ack 3002 win 65535
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT`
+
++1 close(3) = 0
+// RST: unread data. FIN was seen, hence ack + 1
++0 > R. 1001:1001(0) ack 3002 win 65535
+// ... and then, CLOSE.
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ `
+
+// Spurious RST from peer -- no sk state.  Should NOT get
+// marked INVALID, because conntrack is already closing.
++0.1 < R 2001:2001(0) win 0
+
+// No packets should have been marked INVALID
++0 `iptables -v -S INPUT  | grep INVALID | grep -q -- "-c 0 0"`
++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt
new file mode 100644
index 000000000000..686f18a3d9ef
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt
@@ -0,0 +1,59 @@
+// check that out of window resets are marked as INVALID and conntrack remains
+// in ESTABLISHED state.
+
+`packetdrill/common.sh`
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460>
+
++0 > . 1:1(0) ack 1 win 65535
++0 < . 1:1001(1000) ack 1 win 65535
++0 < . 1001:2001(1000) ack 1 win 65535
++0 < . 2001:3001(1000) ack 1 win 65535
+
++0 > . 1:1(0) ack 1001 win 65535
++0 > . 1:1(0) ack 2001 win 65535
++0 > . 1:1(0) ack 3001 win 65535
+
++0 write(3, ..., 1000) = 1000
+
+// out of window
++0.0 < R	0:0(0)	win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// out of window
++0.0 < R	1000000:1000000(0)	win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// in-window but not exact match
++0.0 < R	42:42(0)	win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
++0.0 > P. 1:1001(1000) ack 3001 win 65535
+
++0.1 read(3, ..., 1000) = 1000
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
++0 < . 3001:3001(0) ack 1001 win 65535
+
++0.0 < R. 3000:3000(0) ack 1001 win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// exact next sequence
++0.0 < R. 3001:3001(0) ack 1001 win 0
+// Conntrack should move to CLOSE
+
+// Expect four invalid RSTs
++0 `$xtables -v -S INPUT  | grep INVALID | grep -q -- "-c 4 "`
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ `
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt
new file mode 100644
index 000000000000..3442cd29bc93
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt
@@ -0,0 +1,44 @@
+// Check connection re-use, i.e. peer that receives the SYN answers with
+// a challenge-ACK.
+// Check that conntrack lets all packets pass, including the challenge ack,
+// and that a new connection is established.
+
+`packetdrill/common.sh`
+
+// S  >
+//  . < (challnge-ack)
+// R. >
+// S  >
+// S. <
+// Expected outcome: established connection.
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// Challenge ACK, old incarnation.
+0.1 < . 145824453:145824453(0) ack 643160523 win 240 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
++0.01 > R 643160523:643160523(0) win 0
+
++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT`
+
+// Must go through.
++0.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// correct synack
++0.1 < S. 0:0(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
+// 3whs completes.
++0.01 > . 1:1(0) ack 1 win 256 <nop,nop,TS val 1 ecr 1>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED`
+
+// No packets should have been marked INVALID
++0 `$xtables -v -S INPUT  | grep INVALID | grep -q -- "-c 0 0"`
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt
new file mode 100644
index 000000000000..3047160c4bf3
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt
@@ -0,0 +1,51 @@
+// Check conntrack copes with syn/ack reply for a previous, old incarnation.
+
+// tcpdump with buggy sequence
+// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0
+// OLD synack, for old/previous S
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0
+// This reset never makes it to the endpoint, elided in the packetdrill script
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0
+// Syn retransmit, no change
+// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0
+// CORRECT synack, should be accepted, but conntrack classified this as INVALID:
+// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 ..
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0
+
+`packetdrill/common.sh`
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// bogus/outdated synack, invalid ack value
+0.1 < S. 145824453:145824453(0) ack 643160523 win 240 <mss 1440,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
+// syn retransmitted
+1.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8>
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT`
+
+// correct synack
++0 < S. 145758918:145758918(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
++0 write(3, ..., 1) = 1
+
+// with buggy conntrack above packet is dropped, so SYN rtx is seen:
+// script packet:  1.054007 . 1:1(0) ack 16777958 win 256 <nop,nop,TS val 1033 ecr 1>
+// actual packet:  3.010000 S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8>
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED`
+
++0 > P. 1:2(1) ack 4294901762 win 256 <nop,nop,TS val 1067 ecr 1>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED`
+
+// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
++0 `$xtables -v -S INPUT  | grep INVALID | grep -q -- "-c 1 "`
+
++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt
new file mode 100644
index 000000000000..842242f8ccf7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt
@@ -0,0 +1,34 @@
+// Check reception of another SYN while we have an established conntrack state.
+// Challenge ACK is supposed to pass through, RST reply should clear conntrack
+// state and SYN retransmit should give us new 'SYN_RECV' connection state.
+
+`packetdrill/common.sh`
+
+// should show a match if bug is present:
++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 10) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7, TS val 1 ecr 0,nop,nop>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,TS val 100 ecr 1,nop,wscale 8>
++.01 < . 1:1(0) ack 1 win 257 <TS val 1 ecr 100,nop,nop>
++0 accept(3, ..., ...) = 4
+
++0 < P. 1:101(100) ack 1 win 257 <TS val 2 ecr 100,nop,nop>
++.001 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 110 ecr 2>
++0 read(4, ..., 101) = 100
+
+1.0 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 7, TS val 233 ecr 0,nop,nop>
+// Won't expect this: challenge ack.
+
++0 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 112 ecr 2>
++0 < R. 101:101(0) ack 1 win 257
++0 close(4) = 0
+
+1.5 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 0, TS val 233 ecr 0,nop,nop>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV`
++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/netfilter/rpath.sh b/tools/testing/selftests/net/netfilter/rpath.sh
index 5289c8447a41..4485fd7675ed 100755
--- a/tools/testing/selftests/netfilter/rpath.sh
+++ b/tools/testing/selftests/net/netfilter/rpath.sh
@@ -64,12 +64,18 @@ ip -net "$ns2" a a fec0:42::1/64 dev d0 nodad
 # firewall matches to test
 [ -n "$iptables" ] && {
 	common='-t raw -A PREROUTING -s 192.168.0.0/16'
-	ip netns exec "$ns2" "$iptables" $common -m rpfilter
+	if ! ip netns exec "$ns2" "$iptables" $common -m rpfilter;then
+		echo "Cannot add rpfilter rule"
+		exit $ksft_skip
+	fi
 	ip netns exec "$ns2" "$iptables" $common -m rpfilter --invert
 }
 [ -n "$ip6tables" ] && {
 	common='-t raw -A PREROUTING -s fec0::/16'
-	ip netns exec "$ns2" "$ip6tables" $common -m rpfilter
+	if ! ip netns exec "$ns2" "$ip6tables" $common -m rpfilter;then
+		echo "Cannot add rpfilter rule"
+		exit $ksft_skip
+	fi
 	ip netns exec "$ns2" "$ip6tables" $common -m rpfilter --invert
 }
 [ -n "$nft" ] && ip netns exec "$ns2" $nft -f - <<EOF
diff --git a/tools/testing/selftests/netfilter/sctp_collision.c b/tools/testing/selftests/net/netfilter/sctp_collision.c
index 21bb1cfd8a85..21bb1cfd8a85 100644
--- a/tools/testing/selftests/netfilter/sctp_collision.c
+++ b/tools/testing/selftests/net/netfilter/sctp_collision.c
diff --git a/tools/testing/selftests/net/netfilter/settings b/tools/testing/selftests/net/netfilter/settings
new file mode 100644
index 000000000000..abc5648b59ab
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/settings
@@ -0,0 +1 @@
+timeout=1800
diff --git a/tools/testing/selftests/netfilter/xt_string.sh b/tools/testing/selftests/net/netfilter/xt_string.sh
index 1802653a4728..8d401c69e317 100755
--- a/tools/testing/selftests/netfilter/xt_string.sh
+++ b/tools/testing/selftests/net/netfilter/xt_string.sh
@@ -5,53 +5,57 @@
 ksft_skip=4
 rc=0
 
-if ! iptables --version >/dev/null 2>&1; then
-	echo "SKIP: Test needs iptables"
-	exit $ksft_skip
-fi
-if ! ip -V >/dev/null 2>&1; then
-	echo "SKIP: Test needs iproute2"
-	exit $ksft_skip
-fi
-if ! nc -h >/dev/null 2>&1; then
-	echo "SKIP: Test needs netcat"
-	exit $ksft_skip
-fi
+source lib.sh
+
+checktool "socat -h" "run test without socat"
+checktool "iptables --version" "test needs iptables"
+
+infile=$(mktemp)
+
+cleanup()
+{
+	ip netns del "$netns"
+	rm -f "$infile"
+}
+
+trap cleanup EXIT
+
+setup_ns netns
+
+ip -net "$netns" link add d0 type dummy
+ip -net "$netns" link set d0 up
+ip -net "$netns" addr add 10.1.2.1/24 dev d0
 
 pattern="foo bar baz"
 patlen=11
 hdrlen=$((20 + 8)) # IPv4 + UDP
-ns="ns-$(mktemp -u XXXXXXXX)"
-trap 'ip netns del $ns' EXIT
-ip netns add "$ns"
-ip -net "$ns" link add d0 type dummy
-ip -net "$ns" link set d0 up
-ip -net "$ns" addr add 10.1.2.1/24 dev d0
-
-#ip netns exec "$ns" tcpdump -npXi d0 &
+
+#ip netns exec "$netns" tcpdump -npXi d0 &
 #tcpdump_pid=$!
-#trap 'kill $tcpdump_pid; ip netns del $ns' EXIT
+#trap 'kill $tcpdump_pid; ip netns del $netns' EXIT
 
 add_rule() { # (alg, from, to)
-	ip netns exec "$ns" \
+	ip netns exec "$netns" \
 		iptables -A OUTPUT -o d0 -m string \
-			--string "$pattern" --algo $1 --from $2 --to $3
+			--string "$pattern" --algo "$1" --from "$2" --to "$3"
 }
 showrules() { # ()
-	ip netns exec "$ns" iptables -v -S OUTPUT | grep '^-A'
+	ip netns exec "$netns" iptables -v -S OUTPUT | grep '^-A'
 }
 zerorules() {
-	ip netns exec "$ns" iptables -Z OUTPUT
+	ip netns exec "$netns" iptables -Z OUTPUT
 }
 countrule() { # (pattern)
 	showrules | grep -c -- "$*"
 }
 send() { # (offset)
-	( for ((i = 0; i < $1 - $hdrlen; i++)); do
-		printf " "
+	( for ((i = 0; i < $1 - hdrlen; i++)); do
+		echo -n " "
 	  done
-	  printf "$pattern"
-	) | ip netns exec "$ns" nc -w 1 -u 10.1.2.2 27374
+	  echo -n "$pattern"
+	) > "$infile"
+
+	ip netns exec "$netns" socat -t 1 -u STDIN UDP-SENDTO:10.1.2.2:27374 < "$infile"
 }
 
 add_rule bm 1000 1500
@@ -61,8 +65,8 @@ add_rule kmp 1400 1600
 
 zerorules
 send 0
-send $((1000 - $patlen))
-if [ $(countrule -c 0 0) -ne 4 ]; then
+send $((1000 - patlen))
+if [ "$(countrule -c 0 0)" -ne 4 ]; then
 	echo "FAIL: rules match data before --from"
 	showrules
 	((rc--))
@@ -70,16 +74,16 @@ fi
 
 zerorules
 send 1000
-send $((1400 - $patlen))
-if [ $(countrule -c 2) -ne 2 ]; then
+send $((1400 - patlen))
+if [ "$(countrule -c 2)" -ne 2 ]; then
 	echo "FAIL: only two rules should match at low offset"
 	showrules
 	((rc--))
 fi
 
 zerorules
-send $((1500 - $patlen))
-if [ $(countrule -c 1) -ne 4 ]; then
+send $((1500 - patlen))
+if [ "$(countrule -c 1)" -ne 4 ]; then
 	echo "FAIL: all rules should match at end of packet"
 	showrules
 	((rc--))
@@ -87,7 +91,7 @@ fi
 
 zerorules
 send 1495
-if [ $(countrule -c 1) -ne 1 ]; then
+if [ "$(countrule -c 1)" -ne 1 ]; then
 	echo "FAIL: only kmp with proper --to should match pattern spanning fragments"
 	showrules
 	((rc--))
@@ -95,23 +99,23 @@ fi
 
 zerorules
 send 1500
-if [ $(countrule -c 1) -ne 2 ]; then
+if [ "$(countrule -c 1)" -ne 2 ]; then
 	echo "FAIL: two rules should match pattern at start of second fragment"
 	showrules
 	((rc--))
 fi
 
 zerorules
-send $((1600 - $patlen))
-if [ $(countrule -c 1) -ne 2 ]; then
+send $((1600 - patlen))
+if [ "$(countrule -c 1)" -ne 2 ]; then
 	echo "FAIL: two rules should match pattern at end of largest --to"
 	showrules
 	((rc--))
 fi
 
 zerorules
-send $((1600 - $patlen + 1))
-if [ $(countrule -c 1) -ne 0 ]; then
+send $((1600 - patlen + 1))
+if [ "$(countrule -c 1)" -ne 0 ]; then
 	echo "FAIL: no rules should match pattern extending largest --to"
 	showrules
 	((rc--))
@@ -119,10 +123,11 @@ fi
 
 zerorules
 send 1600
-if [ $(countrule -c 1) -ne 0 ]; then
+if [ "$(countrule -c 1)" -ne 0 ]; then
 	echo "FAIL: no rule should match pattern past largest --to"
 	showrules
 	((rc--))
 fi
 
+[ $rc -eq 0 ] && echo "PASS: string match tests"
 exit $rc
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
new file mode 100755
index 000000000000..93d9d914529b
--- /dev/null
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import ksft_eq, ksft_ge, ksft_busy_wait
+from lib.py import NetdevFamily, NetdevSimDev, ip
+
+
+def empty_check(nf) -> None:
+    devs = nf.dev_get({}, dump=True)
+    ksft_ge(len(devs), 1)
+
+
+def lo_check(nf) -> None:
+    lo_info = nf.dev_get({"ifindex": 1})
+    ksft_eq(len(lo_info['xdp-features']), 0)
+    ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0)
+
+
+def page_pool_check(nf) -> None:
+    with NetdevSimDev() as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        def up():
+            ip(f"link set dev {nsim.ifname} up")
+
+        def down():
+            ip(f"link set dev {nsim.ifname} down")
+
+        def get_pp():
+            pp_list = nf.page_pool_get({}, dump=True)
+            return [pp for pp in pp_list if pp.get("ifindex") == nsim.ifindex]
+
+        # No page pools when down
+        down()
+        ksft_eq(len(get_pp()), 0)
+
+        # Up, empty page pool appears
+        up()
+        pp_list = get_pp()
+        ksft_ge(len(pp_list), 0)
+        refs = sum([pp["inflight"] for pp in pp_list])
+        ksft_eq(refs, 0)
+
+        # Down, it disappears, again
+        down()
+        pp_list = get_pp()
+        ksft_eq(len(pp_list), 0)
+
+        # Up, allocate a page
+        up()
+        nsim.dfs_write("pp_hold", "y")
+        pp_list = nf.page_pool_get({}, dump=True)
+        refs = sum([pp["inflight"] for pp in pp_list if pp.get("ifindex") == nsim.ifindex])
+        ksft_ge(refs, 1)
+
+        # Now let's leak a page
+        down()
+        pp_list = get_pp()
+        ksft_eq(len(pp_list), 1)
+        refs = sum([pp["inflight"] for pp in pp_list])
+        ksft_eq(refs, 1)
+        attached = [pp for pp in pp_list if "detach-time" not in pp]
+        ksft_eq(len(attached), 0)
+
+        # New pp can get created, and we'll have two
+        up()
+        pp_list = get_pp()
+        attached = [pp for pp in pp_list if "detach-time" not in pp]
+        detached = [pp for pp in pp_list if "detach-time" in pp]
+        ksft_eq(len(attached), 1)
+        ksft_eq(len(detached), 1)
+
+        # Free the old page and the old pp is gone
+        nsim.dfs_write("pp_hold", "n")
+        # Freeing check is once a second so we may need to retry
+        ksft_busy_wait(lambda: len(get_pp()) == 1, deadline=2)
+
+        # And down...
+        down()
+        ksft_eq(len(get_pp()), 0)
+
+        # Last, leave the page hanging for destroy, nothing to check
+        # we're trying to exercise the orphaning path in the kernel
+        up()
+        nsim.dfs_write("pp_hold", "y")
+
+
+def main() -> None:
+    nf = NetdevFamily()
+    ksft_run([empty_check, lo_check, page_pool_check],
+             args=(nf, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
index 5e0e539a323d..1dd057afd3fb 100644
--- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
+++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
@@ -489,7 +489,7 @@ class ovsactions(nla):
                     actstr, reason = parse_extract_field(
                         actstr,
                         "drop(",
-                        "([0-9]+)",
+                        r"([0-9]+)",
                         lambda x: int(x, 0),
                         False,
                         None,
@@ -502,9 +502,9 @@ class ovsactions(nla):
                     actstr = actstr[len("drop"): ]
                     return (totallen - len(actstr))
 
-            elif parse_starts_block(actstr, "^(\d+)", False, True):
+            elif parse_starts_block(actstr, r"^(\d+)", False, True):
                 actstr, output = parse_extract_field(
-                    actstr, None, "(\d+)", lambda x: int(x), False, "0"
+                    actstr, None, r"(\d+)", lambda x: int(x), False, "0"
                 )
                 self["attrs"].append(["OVS_ACTION_ATTR_OUTPUT", output])
                 parsed = True
@@ -512,7 +512,7 @@ class ovsactions(nla):
                 actstr, recircid = parse_extract_field(
                     actstr,
                     "recirc(",
-                    "([0-9a-fA-Fx]+)",
+                    r"([0-9a-fA-Fx]+)",
                     lambda x: int(x, 0),
                     False,
                     0,
@@ -588,17 +588,17 @@ class ovsactions(nla):
                                 actstr = actstr[3:]
 
                             actstr, ip_block_min = parse_extract_field(
-                                actstr, "=", "([0-9a-fA-F\.]+)", str, False
+                                actstr, "=", r"([0-9a-fA-F\.]+)", str, False
                             )
                             actstr, ip_block_max = parse_extract_field(
-                                actstr, "-", "([0-9a-fA-F\.]+)", str, False
+                                actstr, "-", r"([0-9a-fA-F\.]+)", str, False
                             )
 
                             actstr, proto_min = parse_extract_field(
-                                actstr, ":", "(\d+)", int, False
+                                actstr, ":", r"(\d+)", int, False
                             )
                             actstr, proto_max = parse_extract_field(
-                                actstr, "-", "(\d+)", int, False
+                                actstr, "-", r"(\d+)", int, False
                             )
 
                             if t is not None:
diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c
index 7c5b12664b03..bfb07dc49518 100644
--- a/tools/testing/selftests/net/reuseaddr_conflict.c
+++ b/tools/testing/selftests/net/reuseaddr_conflict.c
@@ -109,6 +109,6 @@ int main(void)
 	fd1 = open_port(0, 1);
 	if (fd1 >= 0)
 		error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6");
-	fprintf(stderr, "Success");
+	fprintf(stderr, "Success\n");
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/net/sample_map_ret0.bpf.c
index 495990d355ef..43ca92594926 100644
--- a/tools/testing/selftests/bpf/progs/sample_map_ret0.c
+++ b/tools/testing/selftests/net/sample_map_ret0.bpf.c
@@ -17,7 +17,7 @@ struct {
 } array SEC(".maps");
 
 /* Sample program which should always load for testing control paths. */
-SEC(".text") int func()
+SEC("xdp") int func()
 {
 	__u64 key64 = 0;
 	__u32 key = 0;
diff --git a/tools/testing/selftests/bpf/progs/sample_ret0.c b/tools/testing/selftests/net/sample_ret0.bpf.c
index fec99750d6ea..1df5ca98bb65 100644
--- a/tools/testing/selftests/bpf/progs/sample_ret0.c
+++ b/tools/testing/selftests/net/sample_ret0.bpf.c
@@ -1,6 +1,9 @@
 /* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
 
+#define SEC(name) __attribute__((section(name), used))
+
 /* Sample program which should always load for testing control paths. */
+SEC("xdp")
 int func()
 {
 	return 0;
diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c
index 2fb6dd8adba6..8b984fa04286 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/proc.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c
@@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line)
 
 	pos = strchr(line, ' ') + 1;
 
-	if (fscanf(fnetstat, type->header_name) == EOF)
+	if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF)
 		test_error("fscanf(%s)", type->header_name);
 	if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':')
 		test_error("Unexpected netstat format (%c)", tmp);
diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c
index 92276f916f2f..e408b9243b2c 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/setup.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c
@@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER;
 void __test_msg(const char *buf)
 {
 	pthread_mutex_lock(&ksft_print_lock);
-	ksft_print_msg(buf);
+	ksft_print_msg("%s", buf);
 	pthread_mutex_unlock(&ksft_print_lock);
 }
 void __test_ok(const char *buf)
 {
 	pthread_mutex_lock(&ksft_print_lock);
-	ksft_test_result_pass(buf);
+	ksft_test_result_pass("%s", buf);
 	pthread_mutex_unlock(&ksft_print_lock);
 }
 void __test_fail(const char *buf)
 {
 	pthread_mutex_lock(&ksft_print_lock);
-	ksft_test_result_fail(buf);
+	ksft_test_result_fail("%s", buf);
 	pthread_mutex_unlock(&ksft_print_lock);
 }
 void __test_xfail(const char *buf)
 {
 	pthread_mutex_lock(&ksft_print_lock);
-	ksft_test_result_xfail(buf);
+	ksft_test_result_xfail("%s", buf);
 	pthread_mutex_unlock(&ksft_print_lock);
 }
 void __test_error(const char *buf)
 {
 	pthread_mutex_lock(&ksft_print_lock);
-	ksft_test_result_error(buf);
+	ksft_test_result_error("%s", buf);
 	pthread_mutex_unlock(&ksft_print_lock);
 }
 void __test_skip(const char *buf)
 {
 	pthread_mutex_lock(&ksft_print_lock);
-	ksft_test_result_skip(buf);
+	ksft_test_result_skip("%s", buf);
 	pthread_mutex_unlock(&ksft_print_lock);
 }
 
diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c
index 7df8b8700e39..a2fe88d35ac0 100644
--- a/tools/testing/selftests/net/tcp_ao/rst.c
+++ b/tools/testing/selftests/net/tcp_ao/rst.c
@@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[],
 
 static void test_client_active_rst(unsigned int port)
 {
-	/* one in queue, another accept()ed */
-	unsigned int wait_for = backlog + 2;
 	int i, sk[3], err;
 	bool is_writable[ARRAY_SIZE(sk)] = {false};
 	unsigned int last = ARRAY_SIZE(sk) - 1;
@@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port)
 	for (i = 0; i < last; i++) {
 		err = _test_connect_socket(sk[i], this_ip_dest, port,
 					       (i == 0) ? TEST_TIMEOUT_SEC : -1);
-
 		if (err < 0)
 			test_error("failed to connect()");
 	}
 
-	synchronize_threads(); /* 2: connection accept()ed, another queued */
-	err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC);
+	synchronize_threads(); /* 2: two connections: one accept()ed, another queued */
+	err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC);
 	if (err < 0)
 		test_error("test_wait_fds(): %d", err);
 
+	/* async connect() with third sk to get into request_sock_queue */
+	err = _test_connect_socket(sk[last], this_ip_dest, port, -1);
+	if (err < 0)
+		test_error("failed to connect()");
+
 	synchronize_threads(); /* 3: close listen socket */
 	if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC))
 		test_fail("Failed to send data on connected socket");
@@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port)
 		test_ok("Verified established tcp connection");
 
 	synchronize_threads(); /* 4: finishing up */
-	err = _test_connect_socket(sk[last], this_ip_dest, port, -1);
-	if (err < 0)
-		test_error("failed to connect()");
 
 	synchronize_threads(); /* 5: closed active sk */
-	err = test_wait_fds(sk, ARRAY_SIZE(sk), NULL,
-			    wait_for, TEST_TIMEOUT_SEC);
+	/*
+	 * Wait for 2 connections: one accepted, another in the accept queue,
+	 * the one in request_sock_queue won't get fully established, so
+	 * doesn't receive an active RST, see inet_csk_listen_stop().
+	 */
+	err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC);
 	if (err < 0)
 		test_error("select(): %d", err);
 
diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
index 452de131fa3a..517930f9721b 100644
--- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
+++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c
@@ -21,7 +21,7 @@ static void make_listen(int sk)
 static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info,
 				const char *tst)
 {
-	struct tcp_ao_info_opt tmp;
+	struct tcp_ao_info_opt tmp = {};
 	socklen_t len = sizeof(tmp);
 
 	if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len))
diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh
index 8533393a4f18..02b986c9c247 100755
--- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh
+++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh
@@ -154,17 +154,9 @@ setup_topo()
 		setup_topo_ns $ns
 	done
 
-	ip link add name veth0 type veth peer name veth1
-	ip link set dev veth0 netns $h1 name eth0
-	ip link set dev veth1 netns $sw1 name swp1
-
-	ip link add name veth0 type veth peer name veth1
-	ip link set dev veth0 netns $sw1 name veth0
-	ip link set dev veth1 netns $sw2 name veth0
-
-	ip link add name veth0 type veth peer name veth1
-	ip link set dev veth0 netns $h2 name eth0
-	ip link set dev veth1 netns $sw2 name swp1
+	ip -n $h1 link add name eth0 type veth peer name swp1 netns $sw1
+	ip -n $sw1 link add name veth0 type veth peer name veth0 netns $sw2
+	ip -n $h2 link add name eth0 type veth peer name swp1 netns $sw2
 }
 
 setup_host_common()
diff --git a/tools/testing/selftests/net/test_vxlan_mdb.sh b/tools/testing/selftests/net/test_vxlan_mdb.sh
index 74ff9fb2a6f0..58da5de99ac4 100755
--- a/tools/testing/selftests/net/test_vxlan_mdb.sh
+++ b/tools/testing/selftests/net/test_vxlan_mdb.sh
@@ -1177,6 +1177,7 @@ encap_params_common()
 	local plen=$1; shift
 	local enc_ethtype=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local src=$1; shift
 	local mz=$1; shift
 
@@ -1195,11 +1196,11 @@ encap_params_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep2_ip src_vni 10020"
 
 	run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_dst_ip $vtep1_ip action pass"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Destination IP - match"
 
-	run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Destination IP - no match"
 
@@ -1212,20 +1213,20 @@ encap_params_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip dst_port 1111 src_vni 10020"
 
 	run_cmd "tc -n $ns2 filter replace dev veth0 ingress pref 1 handle 101 proto $enc_ethtype flower ip_proto udp dst_port 4789 action pass"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev veth0 ingress" 101 1
 	log_test $? 0 "Default destination port - match"
 
-	run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev veth0 ingress" 101 1
 	log_test $? 0 "Default destination port - no match"
 
 	run_cmd "tc -n $ns2 filter replace dev veth0 ingress pref 1 handle 101 proto $enc_ethtype flower ip_proto udp dst_port 1111 action pass"
-	run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev veth0 ingress" 101 1
 	log_test $? 0 "Non-default destination port - match"
 
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev veth0 ingress" 101 1
 	log_test $? 0 "Non-default destination port - no match"
 
@@ -1238,11 +1239,11 @@ encap_params_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip src_vni 10020"
 
 	run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_key_id 10010 action pass"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Default destination VNI - match"
 
-	run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Default destination VNI - no match"
 
@@ -1250,11 +1251,11 @@ encap_params_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip vni 10010 src_vni 10020"
 
 	run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_key_id 10020 action pass"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Non-default destination VNI - match"
 
-	run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Non-default destination VNI - no match"
 
@@ -1272,6 +1273,7 @@ encap_params_ipv4_ipv4()
 	local plen=32
 	local enc_ethtype="ip"
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local src=192.0.2.129
 
 	echo
@@ -1279,7 +1281,7 @@ encap_params_ipv4_ipv4()
 	echo "------------------------------------------------------------------"
 
 	encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \
-		$grp $src "mausezahn"
+		$grp $grp_dmac $src "mausezahn"
 }
 
 encap_params_ipv6_ipv4()
@@ -1291,6 +1293,7 @@ encap_params_ipv6_ipv4()
 	local plen=32
 	local enc_ethtype="ip"
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local src=2001:db8:100::1
 
 	echo
@@ -1298,7 +1301,7 @@ encap_params_ipv6_ipv4()
 	echo "------------------------------------------------------------------"
 
 	encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \
-		$grp $src "mausezahn -6"
+		$grp $grp_dmac $src "mausezahn -6"
 }
 
 encap_params_ipv4_ipv6()
@@ -1310,6 +1313,7 @@ encap_params_ipv4_ipv6()
 	local plen=128
 	local enc_ethtype="ipv6"
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local src=192.0.2.129
 
 	echo
@@ -1317,7 +1321,7 @@ encap_params_ipv4_ipv6()
 	echo "------------------------------------------------------------------"
 
 	encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \
-		$grp $src "mausezahn"
+		$grp $grp_dmac $src "mausezahn"
 }
 
 encap_params_ipv6_ipv6()
@@ -1329,6 +1333,7 @@ encap_params_ipv6_ipv6()
 	local plen=128
 	local enc_ethtype="ipv6"
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local src=2001:db8:100::1
 
 	echo
@@ -1336,7 +1341,7 @@ encap_params_ipv6_ipv6()
 	echo "------------------------------------------------------------------"
 
 	encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \
-		$grp $src "mausezahn -6"
+		$grp $grp_dmac $src "mausezahn -6"
 }
 
 starg_exclude_ir_common()
@@ -1347,6 +1352,7 @@ starg_exclude_ir_common()
 	local vtep2_ip=$1; shift
 	local plen=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local valid_src=$1; shift
 	local invalid_src=$1; shift
 	local mz=$1; shift
@@ -1368,14 +1374,14 @@ starg_exclude_ir_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode exclude source_list $invalid_src dst $vtep2_ip src_vni 10010"
 
 	# Check that invalid source is not forwarded to any VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 0
 	log_test $? 0 "Block excluded source - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 0
 	log_test $? 0 "Block excluded source - second VTEP"
 
 	# Check that valid source is forwarded to both VTEPs.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Forward valid source - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
@@ -1385,14 +1391,14 @@ starg_exclude_ir_common()
 	run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep2_ip src_vni 10010"
 
 	# Check that invalid source is not forwarded to any VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Block excluded source after removal - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
 	log_test $? 0 "Block excluded source after removal - second VTEP"
 
 	# Check that valid source is forwarded to the remaining VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 2
 	log_test $? 0 "Forward valid source after removal - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
@@ -1407,6 +1413,7 @@ starg_exclude_ir_ipv4_ipv4()
 	local vtep2_ip=198.51.100.200
 	local plen=32
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1415,7 +1422,7 @@ starg_exclude_ir_ipv4_ipv4()
 	echo "-------------------------------------------------------------"
 
 	starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn"
+		$grp_dmac $valid_src $invalid_src "mausezahn"
 }
 
 starg_exclude_ir_ipv6_ipv4()
@@ -1426,6 +1433,7 @@ starg_exclude_ir_ipv6_ipv4()
 	local vtep2_ip=198.51.100.200
 	local plen=32
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1434,7 +1442,7 @@ starg_exclude_ir_ipv6_ipv4()
 	echo "-------------------------------------------------------------"
 
 	starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn -6"
+		$grp_dmac $valid_src $invalid_src "mausezahn -6"
 }
 
 starg_exclude_ir_ipv4_ipv6()
@@ -1445,6 +1453,7 @@ starg_exclude_ir_ipv4_ipv6()
 	local vtep2_ip=2001:db8:2000::1
 	local plen=128
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1453,7 +1462,7 @@ starg_exclude_ir_ipv4_ipv6()
 	echo "-------------------------------------------------------------"
 
 	starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn"
+		$grp_dmac $valid_src $invalid_src "mausezahn"
 }
 
 starg_exclude_ir_ipv6_ipv6()
@@ -1464,6 +1473,7 @@ starg_exclude_ir_ipv6_ipv6()
 	local vtep2_ip=2001:db8:2000::1
 	local plen=128
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1472,7 +1482,7 @@ starg_exclude_ir_ipv6_ipv6()
 	echo "-------------------------------------------------------------"
 
 	starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn -6"
+		$grp_dmac $valid_src $invalid_src "mausezahn -6"
 }
 
 starg_include_ir_common()
@@ -1483,6 +1493,7 @@ starg_include_ir_common()
 	local vtep2_ip=$1; shift
 	local plen=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local valid_src=$1; shift
 	local invalid_src=$1; shift
 	local mz=$1; shift
@@ -1504,14 +1515,14 @@ starg_include_ir_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode include source_list $valid_src dst $vtep2_ip src_vni 10010"
 
 	# Check that invalid source is not forwarded to any VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 0
 	log_test $? 0 "Block excluded source - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 0
 	log_test $? 0 "Block excluded source - second VTEP"
 
 	# Check that valid source is forwarded to both VTEPs.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Forward valid source - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
@@ -1521,14 +1532,14 @@ starg_include_ir_common()
 	run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep2_ip src_vni 10010"
 
 	# Check that invalid source is not forwarded to any VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Block excluded source after removal - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
 	log_test $? 0 "Block excluded source after removal - second VTEP"
 
 	# Check that valid source is forwarded to the remaining VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 2
 	log_test $? 0 "Forward valid source after removal - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
@@ -1543,6 +1554,7 @@ starg_include_ir_ipv4_ipv4()
 	local vtep2_ip=198.51.100.200
 	local plen=32
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1551,7 +1563,7 @@ starg_include_ir_ipv4_ipv4()
 	echo "-------------------------------------------------------------"
 
 	starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn"
+		$grp_dmac $valid_src $invalid_src "mausezahn"
 }
 
 starg_include_ir_ipv6_ipv4()
@@ -1562,6 +1574,7 @@ starg_include_ir_ipv6_ipv4()
 	local vtep2_ip=198.51.100.200
 	local plen=32
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1570,7 +1583,7 @@ starg_include_ir_ipv6_ipv4()
 	echo "-------------------------------------------------------------"
 
 	starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn -6"
+		$grp_dmac $valid_src $invalid_src "mausezahn -6"
 }
 
 starg_include_ir_ipv4_ipv6()
@@ -1581,6 +1594,7 @@ starg_include_ir_ipv4_ipv6()
 	local vtep2_ip=2001:db8:2000::1
 	local plen=128
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1589,7 +1603,7 @@ starg_include_ir_ipv4_ipv6()
 	echo "-------------------------------------------------------------"
 
 	starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn"
+		$grp_dmac $valid_src $invalid_src "mausezahn"
 }
 
 starg_include_ir_ipv6_ipv6()
@@ -1600,6 +1614,7 @@ starg_include_ir_ipv6_ipv6()
 	local vtep2_ip=2001:db8:2000::1
 	local plen=128
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1608,7 +1623,7 @@ starg_include_ir_ipv6_ipv6()
 	echo "-------------------------------------------------------------"
 
 	starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \
-		$valid_src $invalid_src "mausezahn -6"
+		$grp_dmac $valid_src $invalid_src "mausezahn -6"
 }
 
 starg_exclude_p2mp_common()
@@ -1618,6 +1633,7 @@ starg_exclude_p2mp_common()
 	local mcast_grp=$1; shift
 	local plen=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local valid_src=$1; shift
 	local invalid_src=$1; shift
 	local mz=$1; shift
@@ -1635,12 +1651,12 @@ starg_exclude_p2mp_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode exclude source_list $invalid_src dst $mcast_grp src_vni 10010 via veth0"
 
 	# Check that invalid source is not forwarded.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 0
 	log_test $? 0 "Block excluded source"
 
 	# Check that valid source is forwarded.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Forward valid source"
 
@@ -1648,7 +1664,7 @@ starg_exclude_p2mp_common()
 	run_cmd "ip -n $ns2 address del $mcast_grp/$plen dev veth0"
 
 	# Check that valid source is not received anymore.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Receive of valid source after removal from group"
 }
@@ -1660,6 +1676,7 @@ starg_exclude_p2mp_ipv4_ipv4()
 	local mcast_grp=238.1.1.1
 	local plen=32
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1667,7 +1684,7 @@ starg_exclude_p2mp_ipv4_ipv4()
 	echo "Data path: (*, G) EXCLUDE - P2MP - IPv4 overlay / IPv4 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn"
 }
 
@@ -1678,6 +1695,7 @@ starg_exclude_p2mp_ipv6_ipv4()
 	local mcast_grp=238.1.1.1
 	local plen=32
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1685,7 +1703,7 @@ starg_exclude_p2mp_ipv6_ipv4()
 	echo "Data path: (*, G) EXCLUDE - P2MP - IPv6 overlay / IPv4 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn -6"
 }
 
@@ -1696,6 +1714,7 @@ starg_exclude_p2mp_ipv4_ipv6()
 	local mcast_grp=ff0e::2
 	local plen=128
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1703,7 +1722,7 @@ starg_exclude_p2mp_ipv4_ipv6()
 	echo "Data path: (*, G) EXCLUDE - P2MP - IPv4 overlay / IPv6 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn"
 }
 
@@ -1714,6 +1733,7 @@ starg_exclude_p2mp_ipv6_ipv6()
 	local mcast_grp=ff0e::2
 	local plen=128
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1721,7 +1741,7 @@ starg_exclude_p2mp_ipv6_ipv6()
 	echo "Data path: (*, G) EXCLUDE - P2MP - IPv6 overlay / IPv6 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn -6"
 }
 
@@ -1732,6 +1752,7 @@ starg_include_p2mp_common()
 	local mcast_grp=$1; shift
 	local plen=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local valid_src=$1; shift
 	local invalid_src=$1; shift
 	local mz=$1; shift
@@ -1749,12 +1770,12 @@ starg_include_p2mp_common()
 	run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode include source_list $valid_src dst $mcast_grp src_vni 10010 via veth0"
 
 	# Check that invalid source is not forwarded.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 0
 	log_test $? 0 "Block excluded source"
 
 	# Check that valid source is forwarded.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Forward valid source"
 
@@ -1762,7 +1783,7 @@ starg_include_p2mp_common()
 	run_cmd "ip -n $ns2 address del $mcast_grp/$plen dev veth0"
 
 	# Check that valid source is not received anymore.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Receive of valid source after removal from group"
 }
@@ -1774,6 +1795,7 @@ starg_include_p2mp_ipv4_ipv4()
 	local mcast_grp=238.1.1.1
 	local plen=32
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1781,7 +1803,7 @@ starg_include_p2mp_ipv4_ipv4()
 	echo "Data path: (*, G) INCLUDE - P2MP - IPv4 overlay / IPv4 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn"
 }
 
@@ -1792,6 +1814,7 @@ starg_include_p2mp_ipv6_ipv4()
 	local mcast_grp=238.1.1.1
 	local plen=32
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1799,7 +1822,7 @@ starg_include_p2mp_ipv6_ipv4()
 	echo "Data path: (*, G) INCLUDE - P2MP - IPv6 overlay / IPv4 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn -6"
 }
 
@@ -1810,6 +1833,7 @@ starg_include_p2mp_ipv4_ipv6()
 	local mcast_grp=ff0e::2
 	local plen=128
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local valid_src=192.0.2.129
 	local invalid_src=192.0.2.145
 
@@ -1817,7 +1841,7 @@ starg_include_p2mp_ipv4_ipv6()
 	echo "Data path: (*, G) INCLUDE - P2MP - IPv4 overlay / IPv6 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn"
 }
 
@@ -1828,6 +1852,7 @@ starg_include_p2mp_ipv6_ipv6()
 	local mcast_grp=ff0e::2
 	local plen=128
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local valid_src=2001:db8:100::1
 	local invalid_src=2001:db8:200::1
 
@@ -1835,7 +1860,7 @@ starg_include_p2mp_ipv6_ipv6()
 	echo "Data path: (*, G) INCLUDE - P2MP - IPv6 overlay / IPv6 underlay"
 	echo "---------------------------------------------------------------"
 
-	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \
+	starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \
 		$valid_src $invalid_src "mausezahn -6"
 }
 
@@ -1847,6 +1872,7 @@ egress_vni_translation_common()
 	local plen=$1; shift
 	local proto=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local src=$1; shift
 	local mz=$1; shift
 
@@ -1882,20 +1908,20 @@ egress_vni_translation_common()
 	# Make sure that packets sent from the first VTEP over VLAN 10 are
 	# received by the SVI corresponding to the L3VNI (14000 / VLAN 4000) on
 	# the second VTEP, since it is configured as PVID.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev br0.4000 ingress" 101 1
 	log_test $? 0 "Egress VNI translation - PVID configured"
 
 	# Remove PVID flag from VLAN 4000 on the second VTEP and make sure
 	# packets are no longer received by the SVI interface.
 	run_cmd "bridge -n $ns2 vlan add vid 4000 dev vx0"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev br0.4000 ingress" 101 1
 	log_test $? 0 "Egress VNI translation - no PVID configured"
 
 	# Reconfigure the PVID and make sure packets are received again.
 	run_cmd "bridge -n $ns2 vlan add vid 4000 dev vx0 pvid"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev br0.4000 ingress" 101 2
 	log_test $? 0 "Egress VNI translation - PVID reconfigured"
 }
@@ -1908,6 +1934,7 @@ egress_vni_translation_ipv4_ipv4()
 	local plen=32
 	local proto="ipv4"
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local src=192.0.2.129
 
 	echo
@@ -1915,7 +1942,7 @@ egress_vni_translation_ipv4_ipv4()
 	echo "----------------------------------------------------------------"
 
 	egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \
-		$src "mausezahn"
+		$grp_dmac $src "mausezahn"
 }
 
 egress_vni_translation_ipv6_ipv4()
@@ -1926,6 +1953,7 @@ egress_vni_translation_ipv6_ipv4()
 	local plen=32
 	local proto="ipv6"
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local src=2001:db8:100::1
 
 	echo
@@ -1933,7 +1961,7 @@ egress_vni_translation_ipv6_ipv4()
 	echo "----------------------------------------------------------------"
 
 	egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \
-		$src "mausezahn -6"
+		$grp_dmac $src "mausezahn -6"
 }
 
 egress_vni_translation_ipv4_ipv6()
@@ -1944,6 +1972,7 @@ egress_vni_translation_ipv4_ipv6()
 	local plen=128
 	local proto="ipv4"
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local src=192.0.2.129
 
 	echo
@@ -1951,7 +1980,7 @@ egress_vni_translation_ipv4_ipv6()
 	echo "----------------------------------------------------------------"
 
 	egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \
-		$src "mausezahn"
+		$grp_dmac $src "mausezahn"
 }
 
 egress_vni_translation_ipv6_ipv6()
@@ -1962,6 +1991,7 @@ egress_vni_translation_ipv6_ipv6()
 	local plen=128
 	local proto="ipv6"
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local src=2001:db8:100::1
 
 	echo
@@ -1969,7 +1999,7 @@ egress_vni_translation_ipv6_ipv6()
 	echo "----------------------------------------------------------------"
 
 	egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \
-		$src "mausezahn -6"
+		$grp_dmac $src "mausezahn -6"
 }
 
 all_zeros_mdb_common()
@@ -1982,12 +2012,18 @@ all_zeros_mdb_common()
 	local vtep4_ip=$1; shift
 	local plen=$1; shift
 	local ipv4_grp=239.1.1.1
+	local ipv4_grp_dmac=01:00:5e:01:01:01
 	local ipv4_unreg_grp=239.2.2.2
+	local ipv4_unreg_grp_dmac=01:00:5e:02:02:02
 	local ipv4_ll_grp=224.0.0.100
+	local ipv4_ll_grp_dmac=01:00:5e:00:00:64
 	local ipv4_src=192.0.2.129
 	local ipv6_grp=ff0e::1
+	local ipv6_grp_dmac=33:33:00:00:00:01
 	local ipv6_unreg_grp=ff0e::2
+	local ipv6_unreg_grp_dmac=33:33:00:00:00:02
 	local ipv6_ll_grp=ff02::1
+	local ipv6_ll_grp_dmac=33:33:00:00:00:01
 	local ipv6_src=2001:db8:100::1
 
 	# Install all-zeros (catchall) MDB entries for IPv4 and IPv6 traffic
@@ -2023,7 +2059,7 @@ all_zeros_mdb_common()
 
 	# Send registered IPv4 multicast and make sure it only arrives to the
 	# first VTEP.
-	run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_grp_dmac -A $ipv4_src -B $ipv4_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "Registered IPv4 multicast - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 0
@@ -2031,7 +2067,7 @@ all_zeros_mdb_common()
 
 	# Send unregistered IPv4 multicast that is not link-local and make sure
 	# it arrives to the first and second VTEPs.
-	run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_unreg_grp_dmac -A $ipv4_src -B $ipv4_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 2
 	log_test $? 0 "Unregistered IPv4 multicast - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
@@ -2039,7 +2075,7 @@ all_zeros_mdb_common()
 
 	# Send IPv4 link-local multicast traffic and make sure it does not
 	# arrive to any VTEP.
-	run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_ll_grp_dmac -A $ipv4_src -B $ipv4_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 2
 	log_test $? 0 "Link-local IPv4 multicast - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 1
@@ -2074,7 +2110,7 @@ all_zeros_mdb_common()
 
 	# Send registered IPv6 multicast and make sure it only arrives to the
 	# third VTEP.
-	run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_grp_dmac -A $ipv6_src -B $ipv6_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 103 1
 	log_test $? 0 "Registered IPv6 multicast - third VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 104 0
@@ -2082,7 +2118,7 @@ all_zeros_mdb_common()
 
 	# Send unregistered IPv6 multicast that is not link-local and make sure
 	# it arrives to the third and fourth VTEPs.
-	run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_unreg_grp_dmac -A $ipv6_src -B $ipv6_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 103 2
 	log_test $? 0 "Unregistered IPv6 multicast - third VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 104 1
@@ -2090,7 +2126,7 @@ all_zeros_mdb_common()
 
 	# Send IPv6 link-local multicast traffic and make sure it does not
 	# arrive to any VTEP.
-	run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_ll_grp_dmac -A $ipv6_src -B $ipv6_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 103 2
 	log_test $? 0 "Link-local IPv6 multicast - third VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 104 1
@@ -2165,6 +2201,7 @@ mdb_fdb_common()
 	local plen=$1; shift
 	local proto=$1; shift
 	local grp=$1; shift
+	local grp_dmac=$1; shift
 	local src=$1; shift
 	local mz=$1; shift
 
@@ -2188,7 +2225,7 @@ mdb_fdb_common()
 
 	# Send IP multicast traffic and make sure it is forwarded by the MDB
 	# and only arrives to the first VTEP.
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "IP multicast - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 0
@@ -2205,7 +2242,7 @@ mdb_fdb_common()
 	# Remove the MDB entry and make sure that IP multicast is now forwarded
 	# by the FDB to the second VTEP.
 	run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep1_ip src_vni 10010"
-	run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
+	run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q"
 	tc_check_packets "$ns2" "dev vx0 ingress" 101 1
 	log_test $? 0 "IP multicast after removal - first VTEP"
 	tc_check_packets "$ns2" "dev vx0 ingress" 102 2
@@ -2221,14 +2258,15 @@ mdb_fdb_ipv4_ipv4()
 	local plen=32
 	local proto="ipv4"
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local src=192.0.2.129
 
 	echo
 	echo "Data path: MDB with FDB - IPv4 overlay / IPv4 underlay"
 	echo "------------------------------------------------------"
 
-	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \
-		"mausezahn"
+	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \
+		$grp_dmac $src "mausezahn"
 }
 
 mdb_fdb_ipv6_ipv4()
@@ -2240,14 +2278,15 @@ mdb_fdb_ipv6_ipv4()
 	local plen=32
 	local proto="ipv6"
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local src=2001:db8:100::1
 
 	echo
 	echo "Data path: MDB with FDB - IPv6 overlay / IPv4 underlay"
 	echo "------------------------------------------------------"
 
-	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \
-		"mausezahn -6"
+	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \
+		$grp_dmac $src "mausezahn -6"
 }
 
 mdb_fdb_ipv4_ipv6()
@@ -2259,14 +2298,15 @@ mdb_fdb_ipv4_ipv6()
 	local plen=128
 	local proto="ipv4"
 	local grp=239.1.1.1
+	local grp_dmac=01:00:5e:01:01:01
 	local src=192.0.2.129
 
 	echo
 	echo "Data path: MDB with FDB - IPv4 overlay / IPv6 underlay"
 	echo "------------------------------------------------------"
 
-	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \
-		"mausezahn"
+	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \
+		$grp_dmac $src "mausezahn"
 }
 
 mdb_fdb_ipv6_ipv6()
@@ -2278,14 +2318,15 @@ mdb_fdb_ipv6_ipv6()
 	local plen=128
 	local proto="ipv6"
 	local grp=ff0e::1
+	local grp_dmac=33:33:00:00:00:01
 	local src=2001:db8:100::1
 
 	echo
 	echo "Data path: MDB with FDB - IPv6 overlay / IPv6 underlay"
 	echo "------------------------------------------------------"
 
-	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \
-		"mausezahn -6"
+	mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \
+		$grp_dmac $src "mausezahn -6"
 }
 
 mdb_grp1_loop()
@@ -2320,7 +2361,9 @@ mdb_torture_common()
 	local vtep1_ip=$1; shift
 	local vtep2_ip=$1; shift
 	local grp1=$1; shift
+	local grp1_dmac=$1; shift
 	local grp2=$1; shift
+	local grp2_dmac=$1; shift
 	local src=$1; shift
 	local mz=$1; shift
 	local pid1
@@ -2345,9 +2388,9 @@ mdb_torture_common()
 	pid1=$!
 	mdb_grp2_loop $ns1 $vtep1_ip $vtep2_ip $grp2 &
 	pid2=$!
-	ip netns exec $ns1 $mz br0.10 -A $src -B $grp1 -t udp sp=12345,dp=54321 -p 100 -c 0 -q &
+	ip netns exec $ns1 $mz br0.10 -a own -b $grp1_dmac -A $src -B $grp1 -t udp sp=12345,dp=54321 -p 100 -c 0 -q &
 	pid3=$!
-	ip netns exec $ns1 $mz br0.10 -A $src -B $grp2 -t udp sp=12345,dp=54321 -p 100 -c 0 -q &
+	ip netns exec $ns1 $mz br0.10 -a own -b $grp2_dmac -A $src -B $grp2 -t udp sp=12345,dp=54321 -p 100 -c 0 -q &
 	pid4=$!
 
 	sleep 30
@@ -2363,15 +2406,17 @@ mdb_torture_ipv4_ipv4()
 	local vtep1_ip=198.51.100.100
 	local vtep2_ip=198.51.100.200
 	local grp1=239.1.1.1
+	local grp1_dmac=01:00:5e:01:01:01
 	local grp2=239.2.2.2
+	local grp2_dmac=01:00:5e:02:02:02
 	local src=192.0.2.129
 
 	echo
 	echo "Data path: MDB torture test - IPv4 overlay / IPv4 underlay"
 	echo "----------------------------------------------------------"
 
-	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \
-		"mausezahn"
+	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \
+		$grp2_dmac $src "mausezahn"
 }
 
 mdb_torture_ipv6_ipv4()
@@ -2380,15 +2425,17 @@ mdb_torture_ipv6_ipv4()
 	local vtep1_ip=198.51.100.100
 	local vtep2_ip=198.51.100.200
 	local grp1=ff0e::1
+	local grp1_dmac=33:33:00:00:00:01
 	local grp2=ff0e::2
+	local grp2_dmac=33:33:00:00:00:02
 	local src=2001:db8:100::1
 
 	echo
 	echo "Data path: MDB torture test - IPv6 overlay / IPv4 underlay"
 	echo "----------------------------------------------------------"
 
-	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \
-		"mausezahn -6"
+	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \
+		$grp2_dmac $src "mausezahn -6"
 }
 
 mdb_torture_ipv4_ipv6()
@@ -2397,15 +2444,17 @@ mdb_torture_ipv4_ipv6()
 	local vtep1_ip=2001:db8:1000::1
 	local vtep2_ip=2001:db8:2000::1
 	local grp1=239.1.1.1
+	local grp1_dmac=01:00:5e:01:01:01
 	local grp2=239.2.2.2
+	local grp2_dmac=01:00:5e:02:02:02
 	local src=192.0.2.129
 
 	echo
 	echo "Data path: MDB torture test - IPv4 overlay / IPv6 underlay"
 	echo "----------------------------------------------------------"
 
-	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \
-		"mausezahn"
+	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \
+		$grp2_dmac $src "mausezahn"
 }
 
 mdb_torture_ipv6_ipv6()
@@ -2414,15 +2463,17 @@ mdb_torture_ipv6_ipv6()
 	local vtep1_ip=2001:db8:1000::1
 	local vtep2_ip=2001:db8:2000::1
 	local grp1=ff0e::1
+	local grp1_dmac=33:33:00:00:00:01
 	local grp2=ff0e::2
+	local grp2_dmac=33:33:00:00:00:02
 	local src=2001:db8:100::1
 
 	echo
 	echo "Data path: MDB torture test - IPv6 overlay / IPv6 underlay"
 	echo "----------------------------------------------------------"
 
-	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \
-		"mausezahn -6"
+	mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \
+		$grp2_dmac $src "mausezahn -6"
 }
 
 ################################################################################
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index c6eda21cefb6..f27a12d2a2c9 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -1615,6 +1615,40 @@ TEST_F(tls, getsockopt)
 	EXPECT_EQ(errno, EINVAL);
 }
 
+TEST_F(tls, recv_efault)
+{
+	char *rec1 = "1111111111";
+	char *rec2 = "2222222222";
+	struct msghdr hdr = {};
+	struct iovec iov[2];
+	char recv_mem[12];
+	int ret;
+
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
+	EXPECT_EQ(send(self->fd, rec1, 10, 0), 10);
+	EXPECT_EQ(send(self->fd, rec2, 10, 0), 10);
+
+	iov[0].iov_base = recv_mem;
+	iov[0].iov_len = sizeof(recv_mem);
+	iov[1].iov_base = NULL; /* broken iov to make process_rx_list fail */
+	iov[1].iov_len = 1;
+
+	hdr.msg_iovlen = 2;
+	hdr.msg_iov = iov;
+
+	EXPECT_EQ(recv(self->cfd, recv_mem, 1, 0), 1);
+	EXPECT_EQ(recv_mem[0], rec1[0]);
+
+	ret = recvmsg(self->cfd, &hdr, 0);
+	EXPECT_LE(ret, sizeof(recv_mem));
+	EXPECT_GE(ret, 9);
+	EXPECT_EQ(memcmp(rec1, recv_mem, 9), 0);
+	if (ret > 9)
+		EXPECT_EQ(memcmp(rec2, recv_mem + 9, ret - 9), 0);
+}
+
 FIXTURE(tls_err)
 {
 	int fd, cfd;
diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
index 8802604148dd..11a1ebda564f 100755
--- a/tools/testing/selftests/net/udpgro.sh
+++ b/tools/testing/selftests/net/udpgro.sh
@@ -7,7 +7,7 @@ source net_helper.sh
 
 readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="xdp_dummy.bpf.o"
 
 # set global exit status, but never reset nonzero one.
 check_err()
diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
index 7080eae5312b..c51ea90a1395 100755
--- a/tools/testing/selftests/net/udpgro_bench.sh
+++ b/tools/testing/selftests/net/udpgro_bench.sh
@@ -7,7 +7,7 @@ source net_helper.sh
 
 readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="xdp_dummy.bpf.o"
 
 cleanup() {
 	local -r jobs="$(jobs -p)"
diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh
index e1ff645bd3d1..17404f49cdb6 100755
--- a/tools/testing/selftests/net/udpgro_frglist.sh
+++ b/tools/testing/selftests/net/udpgro_frglist.sh
@@ -7,7 +7,7 @@ source net_helper.sh
 
 readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="xdp_dummy.bpf.o"
 
 cleanup() {
 	local -r jobs="$(jobs -p)"
@@ -42,8 +42,8 @@ run_one() {
 
 	ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp
 	tc -n "${PEER_NS}" qdisc add dev veth1 clsact
-	tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.o section schedcls/ingress6/nat_6  direct-action
-	tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.o section schedcls/egress4/snat4 direct-action
+	tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file nat6to4.bpf.o section schedcls/ingress6/nat_6  direct-action
+	tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action
         echo ${rx_args}
 	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r &
 
@@ -89,7 +89,7 @@ if [ ! -f ${BPF_FILE} ]; then
 	exit -1
 fi
 
-if [ ! -f nat6to4.o ]; then
+if [ ! -f nat6to4.bpf.o ]; then
 	echo "Missing nat6to4 helper. Run 'make' first"
 	exit -1
 fi
diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index 9cd5e885e91f..550d8eb3e224 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -3,7 +3,7 @@
 
 source net_helper.sh
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="xdp_dummy.bpf.o"
 readonly BASE="ns-$(mktemp -u XXXXXX)"
 readonly SRC=2
 readonly DST=1
@@ -217,6 +217,7 @@ for family in 4 6; do
 	cleanup
 
 	create_ns
+	ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
 	ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
 	run_test "GRO frag list" $BM_NET$DST 1 0
 	cleanup
@@ -227,6 +228,7 @@ for family in 4 6; do
 	# use NAT to circumvent GRO FWD check
 	create_ns
 	ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX
+	ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
 	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
 	ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \
 					-j DNAT --to-destination $BM_NET$DST
@@ -240,13 +242,15 @@ for family in 4 6; do
 	cleanup
 
 	create_vxlan_pair
+	ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
 	ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
-	run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1
+	run_test "GRO frag list over UDP tunnel" $OL_NET$DST 10 10
 	cleanup
 
 	# use NAT to circumvent GRO FWD check
 	create_vxlan_pair
 	ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX
+	ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
 	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
 	ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \
 					-j DNAT --to-destination $OL_NET$DST
@@ -254,13 +258,7 @@ for family in 4 6; do
 	# load arp cache before running the test to reduce the amount of
 	# stray traffic on top of the UDP tunnel
 	ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null
-	run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST
-	cleanup
-
-	create_vxlan_pair
-	run_bench "UDP tunnel fwd perf" $OL_NET$DST
-	ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
-	run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST
+	run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST
 	cleanup
 done
 
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 1d975bf52af3..85b3baa3f7f3 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -34,7 +34,7 @@
 #endif
 
 #ifndef UDP_MAX_SEGMENTS
-#define UDP_MAX_SEGMENTS	(1 << 6UL)
+#define UDP_MAX_SEGMENTS	(1 << 7UL)
 #endif
 
 #define CONST_MTU_TEST	1500
diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh
index 5ae85def0739..4f1edbafb946 100755
--- a/tools/testing/selftests/net/veth.sh
+++ b/tools/testing/selftests/net/veth.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
-BPF_FILE="xdp_dummy.o"
+BPF_FILE="xdp_dummy.bpf.o"
 readonly STATS="$(mktemp -p /tmp ns-XXXXXX)"
 readonly BASE=`basename $STATS`
 readonly SRC=2
@@ -249,9 +249,9 @@ cleanup
 create_ns
 ip -n $NS_DST link set dev veth$DST up
 ip -n $NS_DST link set dev veth$DST xdp object ${BPF_FILE} section xdp
-chk_gro_flag "gro vs xdp while down - gro flag on" $DST on
+chk_gro_flag "gro vs xdp while down - gro flag off" $DST off
 ip -n $NS_DST link set dev veth$DST down
-chk_gro_flag "                      - after down" $DST on
+chk_gro_flag "                      - after down" $DST off
 ip -n $NS_DST link set dev veth$DST xdp off
 chk_gro_flag "                      - after xdp off" $DST off
 ip -n $NS_DST link set dev veth$DST up
@@ -261,6 +261,21 @@ chk_gro_flag "                      - after peer xdp" $DST off
 cleanup
 
 create_ns
+ip -n $NS_DST link set dev veth$DST up
+ip -n $NS_DST link set dev veth$DST xdp object ${BPF_FILE} section xdp
+ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
+chk_gro_flag "gro vs xdp while down - gro flag on" $DST on
+ip -n $NS_DST link set dev veth$DST down
+chk_gro_flag "                      - after down" $DST on
+ip -n $NS_DST link set dev veth$DST xdp off
+chk_gro_flag "                      - after xdp off" $DST on
+ip -n $NS_DST link set dev veth$DST up
+chk_gro_flag "                      - after up" $DST on
+ip -n $NS_SRC link set dev veth$SRC xdp object ${BPF_FILE} section xdp
+chk_gro_flag "                      - after peer xdp" $DST on
+cleanup
+
+create_ns
 chk_channels "default channels" $DST 1 1
 
 ip -n $NS_DST link set dev veth$DST down
@@ -327,11 +342,14 @@ if [ $CPUS -gt 2 ]; then
 fi
 
 ip -n $NS_DST link set dev veth$DST xdp object ${BPF_FILE} section xdp 2>/dev/null
-chk_gro_flag "with xdp attached - gro flag" $DST on
+chk_gro_flag "with xdp attached - gro flag" $DST off
 chk_gro_flag "        - peer gro flag" $SRC off
 chk_tso_flag "        - tso flag" $SRC off
 chk_tso_flag "        - peer tso flag" $DST on
 ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro "        - no aggregation" 10
+ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
+chk_gro_flag "        - gro flag with GRO on" $DST on
 chk_gro "        - aggregation" 1
 
 
diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.bpf.c
index d988b2e0cee8..d988b2e0cee8 100644
--- a/tools/testing/selftests/net/xdp_dummy.c
+++ b/tools/testing/selftests/net/xdp_dummy.bpf.c
diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
deleted file mode 100644
index 936c3085bb83..000000000000
--- a/tools/testing/selftests/netfilter/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for netfilter selftests
-
-TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
-	conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
-	nft_concat_range.sh nft_conntrack_helper.sh \
-	nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
-	ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh \
-	conntrack_vrf.sh nft_synproxy.sh rpath.sh nft_audit.sh \
-	conntrack_sctp_collision.sh xt_string.sh \
-	bridge_netfilter.sh
-
-HOSTPKG_CONFIG := pkg-config
-
-CFLAGS += $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null)
-LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
-
-TEST_GEN_FILES =  nf-queue connect_close audit_logread sctp_collision \
-	conntrack_dump_flush
-
-include ../lib.mk
diff --git a/tools/testing/selftests/netfilter/bridge_brouter.sh b/tools/testing/selftests/netfilter/bridge_brouter.sh
deleted file mode 100755
index 29f3955b9af7..000000000000
--- a/tools/testing/selftests/netfilter/bridge_brouter.sh
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/bin/bash
-#
-# This test is for bridge 'brouting', i.e. make some packets being routed
-# rather than getting bridged even though they arrive on interface that is
-# part of a bridge.
-
-#           eth0    br0     eth0
-# setup is: ns1 <-> ns0 <-> ns2
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-ebtables -V > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ebtables"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-ip netns add ns0
-ip netns add ns1
-ip netns add ns2
-
-ip link add veth0 netns ns0 type veth peer name eth0 netns ns1
-if [ $? -ne 0 ]; then
-	echo "SKIP: Can't create veth device"
-	exit $ksft_skip
-fi
-ip link add veth1 netns ns0 type veth peer name eth0 netns ns2
-
-ip -net ns0 link set lo up
-ip -net ns0 link set veth0 up
-ip -net ns0 link set veth1 up
-
-ip -net ns0 link add br0 type bridge
-if [ $? -ne 0 ]; then
-	echo "SKIP: Can't create bridge br0"
-	exit $ksft_skip
-fi
-
-ip -net ns0 link set veth0 master br0
-ip -net ns0 link set veth1 master br0
-ip -net ns0 link set br0 up
-ip -net ns0 addr add 10.0.0.1/24 dev br0
-
-# place both in same subnet, ns1 and ns2 connected via ns0:br0
-for i in 1 2; do
-  ip -net ns$i link set lo up
-  ip -net ns$i link set eth0 up
-  ip -net ns$i addr add 10.0.0.1$i/24 dev eth0
-done
-
-test_ebtables_broute()
-{
-	local cipt
-
-	# redirect is needed so the dstmac is rewritten to the bridge itself,
-	# ip stack won't process OTHERHOST (foreign unicast mac) packets.
-	ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP
-	if [ $? -ne 0 ]; then
-		echo "SKIP: Could not add ebtables broute redirect rule"
-		return $ksft_skip
-	fi
-
-	# ping netns1, expected to not work (ip forwarding is off)
-	ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1
-	if [ $? -eq 0 ]; then
-		echo "ERROR: ping works, should have failed" 1>&2
-		return 1
-	fi
-
-	# enable forwarding on both interfaces.
-	# neither needs an ip address, but at least the bridge needs
-	# an ip address in same network segment as ns1 and ns2 (ns0
-	# needs to be able to determine route for to-be-forwarded packet).
-	ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1
-	ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1
-
-	sleep 1
-
-	ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null
-	if [ $? -ne 0 ]; then
-		echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2
-		return 1
-	fi
-
-	echo "PASS: ns1/ns2 connectivity with active broute rule"
-	ip netns exec ns0 ebtables -t broute -F
-
-	# ping netns1, expected to work (frames are bridged)
-	ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null
-	if [ $? -ne 0 ]; then
-		echo "ERROR: ping did not work, but it should (bridged)" 1>&2
-		return 1
-	fi
-
-	ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP
-
-	# ping netns1, expected to not work (DROP in bridge forward)
-	ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1
-	if [ $? -eq 0 ]; then
-		echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2
-		return 1
-	fi
-
-	# re-activate brouter
-	ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP
-
-	ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null
-	if [ $? -ne 0 ]; then
-		echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2
-		return 1
-	fi
-
-	echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop"
-	return 0
-}
-
-# test basic connectivity
-ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null
-if [ $? -ne 0 ]; then
-    echo "ERROR: Could not reach ns2 from ns1" 1>&2
-    ret=1
-fi
-
-ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null
-if [ $? -ne 0 ]; then
-    echo "ERROR: Could not reach ns1 from ns2" 1>&2
-    ret=1
-fi
-
-if [ $ret -eq 0 ];then
-    echo "PASS: netns connectivity: ns1 and ns2 can reach each other"
-fi
-
-test_ebtables_broute
-ret=$?
-for i in 0 1 2; do ip netns del ns$i;done
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/bridge_netfilter.sh b/tools/testing/selftests/netfilter/bridge_netfilter.sh
deleted file mode 100644
index 659b3ab02c8b..000000000000
--- a/tools/testing/selftests/netfilter/bridge_netfilter.sh
+++ /dev/null
@@ -1,188 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Test bridge netfilter + conntrack, a combination that doesn't really work,
-# with multicast/broadcast packets racing for hash table insertion.
-
-#           eth0    br0     eth0
-# setup is: ns1 <->,ns0 <-> ns3
-#           ns2 <-'    `'-> ns4
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-sfx=$(mktemp -u "XXXXXXXX")
-ns0="ns0-$sfx"
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-ns3="ns3-$sfx"
-ns4="ns4-$sfx"
-
-ebtables -V > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ebtables"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-for i in $(seq 0 4); do
-  eval ip netns add \$ns$i
-done
-
-cleanup() {
-  for i in $(seq 0 4); do eval ip netns del \$ns$i;done
-}
-
-trap cleanup EXIT
-
-do_ping()
-{
-	fromns="$1"
-	dstip="$2"
-
-	ip netns exec $fromns ping -c 1 -q $dstip > /dev/null
-	if [ $? -ne 0 ]; then
-		echo "ERROR: ping from $fromns to $dstip"
-		ip netns exec ${ns0} nft list ruleset
-		ret=1
-	fi
-}
-
-bcast_ping()
-{
-	fromns="$1"
-	dstip="$2"
-
-	for i in $(seq 1 1000); do
-		ip netns exec $fromns ping -q -f -b -c 1 -q $dstip > /dev/null 2>&1
-		if [ $? -ne 0 ]; then
-			echo "ERROR: ping -b from $fromns to $dstip"
-			ip netns exec ${ns0} nft list ruleset
-			fi
-	done
-}
-
-ip link add veth1 netns ${ns0} type veth peer name eth0 netns ${ns1}
-if [ $? -ne 0 ]; then
-	echo "SKIP: Can't create veth device"
-	exit $ksft_skip
-fi
-
-ip link add veth2 netns ${ns0} type veth peer name eth0 netns $ns2
-ip link add veth3 netns ${ns0} type veth peer name eth0 netns $ns3
-ip link add veth4 netns ${ns0} type veth peer name eth0 netns $ns4
-
-ip -net ${ns0} link set lo up
-
-for i in $(seq 1 4); do
-  ip -net ${ns0} link set veth$i up
-done
-
-ip -net ${ns0} link add br0 type bridge stp_state 0 forward_delay 0 nf_call_iptables 1 nf_call_ip6tables 1 nf_call_arptables 1
-if [ $? -ne 0 ]; then
-	echo "SKIP: Can't create bridge br0"
-	exit $ksft_skip
-fi
-
-# make veth0,1,2 part of bridge.
-for i in $(seq 1 3); do
-  ip -net ${ns0} link set veth$i master br0
-done
-
-# add a macvlan on top of the bridge.
-MACVLAN_ADDR=ba:f3:13:37:42:23
-ip -net ${ns0} link add link br0 name macvlan0 type macvlan mode private
-ip -net ${ns0} link set macvlan0 address ${MACVLAN_ADDR}
-ip -net ${ns0} link set macvlan0 up
-ip -net ${ns0} addr add 10.23.0.1/24 dev macvlan0
-
-# add a macvlan on top of veth4.
-MACVLAN_ADDR=ba:f3:13:37:42:24
-ip -net ${ns0} link add link veth4 name macvlan4 type macvlan mode vepa
-ip -net ${ns0} link set macvlan4 address ${MACVLAN_ADDR}
-ip -net ${ns0} link set macvlan4 up
-
-# make the macvlan part of the bridge.
-# veth4 is not a bridge port, only the macvlan on top of it.
-ip -net ${ns0} link set macvlan4 master br0
-
-ip -net ${ns0} link set br0 up
-ip -net ${ns0} addr add 10.0.0.1/24 dev br0
-ip netns exec ${ns0} sysctl -q net.bridge.bridge-nf-call-iptables=1
-ret=$?
-if [ $ret -ne 0 ] ; then
-	echo "SKIP: bridge netfilter not available"
-	ret=$ksft_skip
-fi
-
-# for testing, so namespaces will reply to ping -b probes.
-ip netns exec ${ns0} sysctl -q net.ipv4.icmp_echo_ignore_broadcasts=0
-
-# enable conntrack in ns0 and drop broadcast packets in forward to
-# avoid them from getting confirmed in the postrouting hook before
-# the cloned skb is passed up the stack.
-ip netns exec ${ns0} nft -f - <<EOF
-table ip filter {
-	chain input {
-		type filter hook input priority 1; policy accept
-		iifname br0 counter
-		ct state new accept
-	}
-}
-
-table bridge filter {
-	chain forward {
-		type filter hook forward priority 0; policy accept
-		meta pkttype broadcast ip protocol icmp counter drop
-	}
-}
-EOF
-
-# place 1, 2 & 3 in same subnet, connected via ns0:br0.
-# ns4 is placed in same subnet as well, but its not
-# part of the bridge: the corresponding veth4 is not
-# part of the bridge, only its macvlan interface.
-for i in $(seq 1 4); do
-  eval ip -net \$ns$i link set lo up
-  eval ip -net \$ns$i link set eth0 up
-done
-for i in $(seq 1 2); do
-  eval ip -net \$ns$i addr add 10.0.0.1$i/24 dev eth0
-done
-
-ip -net ${ns3} addr add 10.23.0.13/24 dev eth0
-ip -net ${ns4} addr add 10.23.0.14/24 dev eth0
-
-# test basic connectivity
-do_ping ${ns1} 10.0.0.12
-do_ping ${ns3} 10.23.0.1
-do_ping ${ns4} 10.23.0.1
-
-if [ $ret -eq 0 ];then
-	echo "PASS: netns connectivity: ns1 can reach ns2, ns3 and ns4 can reach ns0"
-fi
-
-bcast_ping ${ns1} 10.0.0.255
-
-# This should deliver broadcast to macvlan0, which is on top of ns0:br0.
-bcast_ping ${ns3} 10.23.0.255
-
-# same, this time via veth4:macvlan4.
-bcast_ping ${ns4} 10.23.0.255
-
-read t < /proc/sys/kernel/tainted
-
-if [ $t -eq 0 ];then
-	echo PASS: kernel not tainted
-else
-	echo ERROR: kernel is tainted
-	ret=1
-fi
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config
deleted file mode 100644
index 7c42b1b2c69b..000000000000
--- a/tools/testing/selftests/netfilter/config
+++ /dev/null
@@ -1,9 +0,0 @@
-CONFIG_NET_NS=y
-CONFIG_NF_TABLES_INET=y
-CONFIG_NFT_QUEUE=m
-CONFIG_NFT_NAT=m
-CONFIG_NFT_REDIR=m
-CONFIG_NFT_MASQ=m
-CONFIG_NFT_FLOW_OFFLOAD=m
-CONFIG_NF_CT_NETLINK=m
-CONFIG_AUDIT=y
diff --git a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh
deleted file mode 100755
index a924e595cfd8..000000000000
--- a/tools/testing/selftests/netfilter/conntrack_sctp_collision.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Testing For SCTP COLLISION SCENARIO as Below:
-#
-#   14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359]
-#   14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187]
-#   14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359]
-#   14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO]
-#   14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK]
-#   14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970]
-#
-# TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS
-
-CLIENT_NS=$(mktemp -u client-XXXXXXXX)
-CLIENT_IP="198.51.200.1"
-CLIENT_PORT=1234
-
-SERVER_NS=$(mktemp -u server-XXXXXXXX)
-SERVER_IP="198.51.100.1"
-SERVER_PORT=1234
-
-ROUTER_NS=$(mktemp -u router-XXXXXXXX)
-CLIENT_GW="198.51.200.2"
-SERVER_GW="198.51.100.2"
-
-# setup the topo
-setup() {
-	ip net add $CLIENT_NS
-	ip net add $SERVER_NS
-	ip net add $ROUTER_NS
-	ip -n $SERVER_NS link add link0 type veth peer name link1 netns $ROUTER_NS
-	ip -n $CLIENT_NS link add link3 type veth peer name link2 netns $ROUTER_NS
-
-	ip -n $SERVER_NS link set link0 up
-	ip -n $SERVER_NS addr add $SERVER_IP/24 dev link0
-	ip -n $SERVER_NS route add $CLIENT_IP dev link0 via $SERVER_GW
-
-	ip -n $ROUTER_NS link set link1 up
-	ip -n $ROUTER_NS link set link2 up
-	ip -n $ROUTER_NS addr add $SERVER_GW/24 dev link1
-	ip -n $ROUTER_NS addr add $CLIENT_GW/24 dev link2
-	ip net exec $ROUTER_NS sysctl -wq net.ipv4.ip_forward=1
-
-	ip -n $CLIENT_NS link set link3 up
-	ip -n $CLIENT_NS addr add $CLIENT_IP/24 dev link3
-	ip -n $CLIENT_NS route add $SERVER_IP dev link3 via $CLIENT_GW
-
-	# simulate the delay on OVS upcall by setting up a delay for INIT_ACK with
-	# tc on $SERVER_NS side
-	tc -n $SERVER_NS qdisc add dev link0 root handle 1: htb
-	tc -n $SERVER_NS class add dev link0 parent 1: classid 1:1 htb rate 100mbit
-	tc -n $SERVER_NS filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \
-		0xff match u8 2 0xff at 32 flowid 1:1
-	tc -n $SERVER_NS qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms
-
-	# simulate the ctstate check on OVS nf_conntrack
-	ip net exec $ROUTER_NS iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP
-	ip net exec $ROUTER_NS iptables -A INPUT -p sctp -j DROP
-
-	# use a smaller number for assoc's max_retrans to reproduce the issue
-	modprobe sctp
-	ip net exec $CLIENT_NS sysctl -wq net.sctp.association_max_retrans=3
-}
-
-cleanup() {
-	ip net exec $CLIENT_NS pkill sctp_collision 2>&1 >/dev/null
-	ip net exec $SERVER_NS pkill sctp_collision 2>&1 >/dev/null
-	ip net del "$CLIENT_NS"
-	ip net del "$SERVER_NS"
-	ip net del "$ROUTER_NS"
-}
-
-do_test() {
-	ip net exec $SERVER_NS ./sctp_collision server \
-		$SERVER_IP $SERVER_PORT $CLIENT_IP $CLIENT_PORT &
-	ip net exec $CLIENT_NS ./sctp_collision client \
-		$CLIENT_IP $CLIENT_PORT $SERVER_IP $SERVER_PORT
-}
-
-# NOTE: one way to work around the issue is set a smaller hb_interval
-# ip net exec $CLIENT_NS sysctl -wq net.sctp.hb_interval=3500
-
-# run the test case
-trap cleanup EXIT
-setup && \
-echo "Test for SCTP Collision in nf_conntrack:" && \
-do_test && echo "PASS!"
-exit $?
diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
deleted file mode 100755
index e7d7bf13cff5..000000000000
--- a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
+++ /dev/null
@@ -1,167 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Check that UNREPLIED tcp conntrack will eventually timeout.
-#
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-waittime=20
-sfx=$(mktemp -u "XXXXXXXX")
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-cleanup() {
-	ip netns pids $ns1 | xargs kill 2>/dev/null
-	ip netns pids $ns2 | xargs kill 2>/dev/null
-
-	ip netns del $ns1
-	ip netns del $ns2
-}
-
-ipv4() {
-    echo -n 192.168.$1.2
-}
-
-check_counter()
-{
-	ns=$1
-	name=$2
-	expect=$3
-	local lret=0
-
-	cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect")
-	if [ $? -ne 0 ]; then
-		echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2
-		ip netns exec $ns2 nft list counter inet filter "$name" 1>&2
-		lret=1
-	fi
-
-	return $lret
-}
-
-# Create test namespaces
-ip netns add $ns1 || exit 1
-
-trap cleanup EXIT
-
-ip netns add $ns2 || exit 1
-
-# Connect the namespace to the host using a veth pair
-ip -net $ns1 link add name veth1 type veth peer name veth2
-ip -net $ns1 link set netns $ns2 dev veth2
-
-ip -net $ns1 link set up dev lo
-ip -net $ns2 link set up dev lo
-ip -net $ns1 link set up dev veth1
-ip -net $ns2 link set up dev veth2
-
-ip -net $ns2 addr add 10.11.11.2/24 dev veth2
-ip -net $ns2 route add default via 10.11.11.1
-
-ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1
-
-# add a rule inside NS so we enable conntrack
-ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT
-
-ip -net $ns1 addr add 10.11.11.1/24 dev veth1
-ip -net $ns1 route add 10.99.99.99 via 10.11.11.2
-
-# Check connectivity works
-ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1
-
-ip netns exec $ns2 nc -l -p 8080 < /dev/null &
-
-# however, conntrack entries are there
-
-ip netns exec $ns2 nft -f - <<EOF
-table inet filter {
-	counter connreq { }
-	counter redir { }
-	chain input {
-		type filter hook input priority 0; policy accept;
-		ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
-		ct state new ct status dnat tcp dport 8080 counter name "redir" accept
-	}
-}
-EOF
-if [ $? -ne 0 ]; then
-	echo "ERROR: Could not load nft rules"
-	exit 1
-fi
-
-ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
-
-echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
-ip netns exec $ns1 bash -c 'while true ; do
-	nc -p 60000 10.99.99.99 80
-	sleep 1
-	done' &
-
-sleep 1
-
-ip netns exec $ns2 nft -f - <<EOF
-table inet nat {
-	chain prerouting {
-		type nat hook prerouting priority 0; policy accept;
-		ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
-	}
-}
-EOF
-if [ $? -ne 0 ]; then
-	echo "ERROR: Could not load nat redirect"
-	exit 1
-fi
-
-count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
-if [ $count -eq 0 ]; then
-	echo "ERROR: $ns2 did not pick up tcp connection from peer"
-	exit 1
-fi
-
-echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect"
-for i in $(seq 1 $waittime); do
-	echo -n "."
-
-	sleep 1
-
-	count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
-	if [ $count -gt 0 ]; then
-		echo
-		echo "PASS: redirection took effect after $i seconds"
-		break
-	fi
-
-	m=$((i%20))
-	if [ $m -eq 0 ]; then
-		echo " waited for $i seconds"
-	fi
-done
-
-expect="packets 1 bytes 60"
-check_counter "$ns2" "redir" "$expect"
-if [ $? -ne 0 ]; then
-	ret=1
-fi
-
-if [ $ret -eq 0 ];then
-	echo "PASS: redirection counter has expected values"
-else
-	echo "ERROR: no tcp connection was redirected"
-fi
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/ipvs.sh b/tools/testing/selftests/netfilter/ipvs.sh
deleted file mode 100755
index c3b8f90c497e..000000000000
--- a/tools/testing/selftests/netfilter/ipvs.sh
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# End-to-end ipvs test suite
-# Topology:
-#--------------------------------------------------------------+
-#                      |                                       |
-#         ns0          |         ns1                           |
-#      -----------     |     -----------    -----------        |
-#      | veth01  | --------- | veth10  |    | veth12  |        |
-#      -----------    peer   -----------    -----------        |
-#           |          |                        |              |
-#      -----------     |                        |              |
-#      |  br0    |     |-----------------  peer |--------------|
-#      -----------     |                        |              |
-#           |          |                        |              |
-#      ----------     peer   ----------      -----------       |
-#      |  veth02 | --------- |  veth20 |     | veth21  |       |
-#      ----------      |     ----------      -----------       |
-#                      |         ns2                           |
-#                      |                                       |
-#--------------------------------------------------------------+
-#
-# We assume that all network driver are loaded
-#
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-GREEN='\033[0;92m'
-RED='\033[0;31m'
-NC='\033[0m' # No Color
-
-readonly port=8080
-
-readonly vip_v4=207.175.44.110
-readonly cip_v4=10.0.0.2
-readonly gip_v4=10.0.0.1
-readonly dip_v4=172.16.0.1
-readonly rip_v4=172.16.0.2
-readonly sip_v4=10.0.0.3
-
-readonly infile="$(mktemp)"
-readonly outfile="$(mktemp)"
-readonly datalen=32
-
-sysipvsnet="/proc/sys/net/ipv4/vs/"
-if [ ! -d $sysipvsnet ]; then
-	modprobe -q ip_vs
-	if [ $? -ne 0 ]; then
-		echo "skip: could not run test without ipvs module"
-		exit $ksft_skip
-	fi
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ]; then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-ipvsadm -v > /dev/null 2>&1
-if [ $? -ne 0 ]; then
-	echo "SKIP: Could not run test without ipvsadm"
-	exit $ksft_skip
-fi
-
-setup() {
-	ip netns add ns0
-	ip netns add ns1
-	ip netns add ns2
-
-	ip link add veth01 netns ns0 type veth peer name veth10 netns ns1
-	ip link add veth02 netns ns0 type veth peer name veth20 netns ns2
-	ip link add veth12 netns ns1 type veth peer name veth21 netns ns2
-
-	ip netns exec ns0 ip link set veth01 up
-	ip netns exec ns0 ip link set veth02 up
-	ip netns exec ns0 ip link add br0 type bridge
-	ip netns exec ns0 ip link set veth01 master br0
-	ip netns exec ns0 ip link set veth02 master br0
-	ip netns exec ns0 ip link set br0 up
-	ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0
-
-	ip netns exec ns1 ip link set lo up
-	ip netns exec ns1 ip link set veth10 up
-	ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10
-	ip netns exec ns1 ip link set veth12 up
-	ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12
-
-	ip netns exec ns2 ip link set lo up
-	ip netns exec ns2 ip link set veth21 up
-	ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21
-	ip netns exec ns2 ip link set veth20 up
-	ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20
-
-	sleep 1
-
-	dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
-}
-
-cleanup() {
-	for i in 0 1 2
-	do
-		ip netns del ns$i > /dev/null 2>&1
-	done
-
-	if [ -f "${outfile}" ]; then
-		rm "${outfile}"
-	fi
-	if [ -f "${infile}" ]; then
-		rm "${infile}"
-	fi
-}
-
-server_listen() {
-	ip netns exec ns2 nc -l -p 8080 > "${outfile}" &
-	server_pid=$!
-	sleep 0.2
-}
-
-client_connect() {
-	ip netns exec ns0 timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}"
-}
-
-verify_data() {
-	wait "${server_pid}"
-	cmp "$infile" "$outfile" 2>/dev/null
-}
-
-test_service() {
-	server_listen
-	client_connect
-	verify_data
-}
-
-
-test_dr() {
-	ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0
-
-	ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1
-	ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr
-	ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port}
-	ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1
-
-	# avoid incorrect arp response
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2
-	# avoid reverse route lookup
-	ip netns exec ns2 sysctl -qw  net.ipv4.conf.all.rp_filter=0
-	ip netns exec ns2 sysctl -qw  net.ipv4.conf.veth21.rp_filter=0
-	ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1
-
-	test_service
-}
-
-test_nat() {
-	ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0
-
-	ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1
-	ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr
-	ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port}
-	ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1
-
-	ip netns exec ns2 ip link del veth20
-	ip netns exec ns2 ip route add default via ${dip_v4} dev veth21
-
-	test_service
-}
-
-test_tun() {
-	ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0
-
-	ip netns exec ns1 modprobe ipip
-	ip netns exec ns1 ip link set tunl0 up
-	ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0
-	ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0
-	ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0
-	ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr
-	ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port}
-	ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1
-
-	ip netns exec ns2 modprobe ipip
-	ip netns exec ns2 ip link set tunl0 up
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0
-	ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0
-	ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1
-
-	test_service
-}
-
-run_tests() {
-	local errors=
-
-	echo "Testing DR mode..."
-	cleanup
-	setup
-	test_dr
-	errors=$(( $errors + $? ))
-
-	echo "Testing NAT mode..."
-	cleanup
-	setup
-	test_nat
-	errors=$(( $errors + $? ))
-
-	echo "Testing Tunnel mode..."
-	cleanup
-	setup
-	test_tun
-	errors=$(( $errors + $? ))
-
-	return $errors
-}
-
-trap cleanup EXIT
-
-run_tests
-
-if [ $? -ne 0 ]; then
-	echo -e "$(basename $0): ${RED}FAIL${NC}"
-	exit 1
-fi
-echo -e "$(basename $0): ${GREEN}PASS${NC}"
-exit 0
diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh
deleted file mode 100755
index a1aa8f4a5828..000000000000
--- a/tools/testing/selftests/netfilter/nf_nat_edemux.sh
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Test NAT source port clash resolution
-#
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-sfx=$(mktemp -u "XXXXXXXX")
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-socatpid=0
-
-cleanup()
-{
-	[ $socatpid -gt 0 ] && kill $socatpid
-	ip netns del $ns1
-	ip netns del $ns2
-}
-
-socat -h > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without socat"
-	exit $ksft_skip
-fi
-
-iptables --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without iptables"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-ip netns add "$ns1"
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace $ns1"
-	exit $ksft_skip
-fi
-
-trap cleanup EXIT
-
-ip netns add $ns2
-
-# Connect the namespaces using a veth pair
-ip link add name veth2 type veth peer name veth1
-ip link set netns $ns1 dev veth1
-ip link set netns $ns2 dev veth2
-
-ip netns exec $ns1 ip link set up dev lo
-ip netns exec $ns1 ip link set up dev veth1
-ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1
-
-ip netns exec $ns2 ip link set up dev lo
-ip netns exec $ns2 ip link set up dev veth2
-ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2
-
-# Create a server in one namespace
-ip netns exec $ns1 socat -u TCP-LISTEN:5201,fork OPEN:/dev/null,wronly=1 &
-socatpid=$!
-
-# Restrict source port to just one so we don't have to exhaust
-# all others.
-ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000"
-
-# add a virtual IP using DNAT
-ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201
-
-# ... and route it to the other namespace
-ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1
-
-sleep 1
-
-# add a persistent connection from the other namespace
-ip netns exec $ns2 socat -t 10 - TCP:192.168.1.1:5201 > /dev/null &
-
-sleep 1
-
-# ip daddr:dport will be rewritten to 192.168.1.1 5201
-# NAT must reallocate source port 10000 because
-# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use
-echo test | ip netns exec $ns2 socat -t 3 -u STDIN TCP:10.96.0.1:443,connect-timeout=3 >/dev/null
-ret=$?
-
-# Check socat can connect to 10.96.0.1:443 (aka 192.168.1.1:5201).
-if [ $ret -eq 0 ]; then
-	echo "PASS: socat can connect via NAT'd address"
-else
-	echo "FAIL: socat cannot connect via NAT'd address"
-fi
-
-# check sport clashres.
-ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5202 -j REDIRECT --to-ports 5201
-ip netns exec $ns1 iptables -t nat -A PREROUTING -p tcp --dport 5203 -j REDIRECT --to-ports 5201
-
-sleep 5 | ip netns exec $ns2 socat -t 5 -u STDIN TCP:192.168.1.1:5202,connect-timeout=5 >/dev/null &
-cpid1=$!
-sleep 1
-
-# if connect succeeds, client closes instantly due to EOF on stdin.
-# if connect hangs, it will time out after 5s.
-echo | ip netns exec $ns2 socat -t 3 -u STDIN TCP:192.168.1.1:5203,connect-timeout=5 >/dev/null &
-cpid2=$!
-
-time_then=$(date +%s)
-wait $cpid2
-rv=$?
-time_now=$(date +%s)
-
-# Check how much time has elapsed, expectation is for
-# 'cpid2' to connect and then exit (and no connect delay).
-delta=$((time_now - time_then))
-
-if [ $delta -lt 2 -a $rv -eq 0 ]; then
-	echo "PASS: could connect to service via redirected ports"
-else
-	echo "FAIL: socat cannot connect to service via redirect ($delta seconds elapsed, returned $rv)"
-	ret=1
-fi
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
deleted file mode 100755
index faa7778d7bd1..000000000000
--- a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/bin/bash
-#
-# This tests connection tracking helper assignment:
-# 1. can attach ftp helper to a connection from nft ruleset.
-# 2. auto-assign still works.
-#
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-sfx=$(mktemp -u "XXXXXXXX")
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-testipv6=1
-
-cleanup()
-{
-	ip netns del ${ns1}
-	ip netns del ${ns2}
-}
-
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-conntrack -V > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without conntrack tool"
-	exit $ksft_skip
-fi
-
-which nc >/dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without netcat tool"
-	exit $ksft_skip
-fi
-
-trap cleanup EXIT
-
-ip netns add ${ns1}
-ip netns add ${ns2}
-
-ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1
-if [ $? -ne 0 ];then
-    echo "SKIP: No virtual ethernet pair device support in kernel"
-    exit $ksft_skip
-fi
-
-ip -net ${ns1} link set lo up
-ip -net ${ns1} link set veth0 up
-
-ip -net ${ns2} link set lo up
-ip -net ${ns2} link set veth0 up
-
-ip -net ${ns1} addr add 10.0.1.1/24 dev veth0
-ip -net ${ns1} addr add dead:1::1/64 dev veth0
-
-ip -net ${ns2} addr add 10.0.1.2/24 dev veth0
-ip -net ${ns2} addr add dead:1::2/64 dev veth0
-
-load_ruleset_family() {
-	local family=$1
-	local ns=$2
-
-ip netns exec ${ns} nft -f - <<EOF
-table $family raw {
-	ct helper ftp {
-             type "ftp" protocol tcp
-        }
-	chain pre {
-		type filter hook prerouting priority 0; policy accept;
-		tcp dport 2121 ct helper set "ftp"
-	}
-	chain output {
-		type filter hook output priority 0; policy accept;
-		tcp dport 2121 ct helper set "ftp"
-	}
-}
-EOF
-	return $?
-}
-
-check_for_helper()
-{
-	local netns=$1
-	local message=$2
-	local port=$3
-
-	if echo $message |grep -q 'ipv6';then
-		local family="ipv6"
-	else
-		local family="ipv4"
-	fi
-
-	ip netns exec ${netns} conntrack -L -f $family -p tcp --dport $port 2> /dev/null |grep -q 'helper=ftp'
-	if [ $? -ne 0 ] ; then
-		if [ $autoassign -eq 0 ] ;then
-			echo "FAIL: ${netns} did not show attached helper $message" 1>&2
-			ret=1
-		else
-			echo "PASS: ${netns} did not show attached helper $message" 1>&2
-		fi
-	else
-		if [ $autoassign -eq 0 ] ;then
-			echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2
-		else
-			echo "FAIL: ${netns} connection on port $port has ftp helper attached" 1>&2
-			ret=1
-		fi
-	fi
-
-	return 0
-}
-
-test_helper()
-{
-	local port=$1
-	local autoassign=$2
-
-	if [ $autoassign -eq 0 ] ;then
-		msg="set via ruleset"
-	else
-		msg="auto-assign"
-	fi
-
-	sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null &
-
-	sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null &
-	sleep 1
-
-	check_for_helper "$ns1" "ip $msg" $port $autoassign
-	check_for_helper "$ns2" "ip $msg" $port $autoassign
-
-	wait
-
-	if [ $testipv6 -eq 0 ] ;then
-		return 0
-	fi
-
-	ip netns exec ${ns1} conntrack -F 2> /dev/null
-	ip netns exec ${ns2} conntrack -F 2> /dev/null
-
-	sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null &
-
-	sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null &
-	sleep 1
-
-	check_for_helper "$ns1" "ipv6 $msg" $port
-	check_for_helper "$ns2" "ipv6 $msg" $port
-
-	wait
-}
-
-load_ruleset_family ip ${ns1}
-if [ $? -ne 0 ];then
-	echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2
-	exit 1
-fi
-
-load_ruleset_family ip6 ${ns1}
-if [ $? -ne 0 ];then
-	echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2
-	testipv6=0
-fi
-
-load_ruleset_family inet ${ns2}
-if [ $? -ne 0 ];then
-	echo "SKIP: ${ns1} cannot load inet ruleset" 1>&2
-	load_ruleset_family ip ${ns2}
-	if [ $? -ne 0 ];then
-		echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2
-		exit 1
-	fi
-
-	if [ $testipv6 -eq 1 ] ;then
-		load_ruleset_family ip6 ${ns2}
-		if [ $? -ne 0 ];then
-			echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2
-			exit 1
-		fi
-	fi
-fi
-
-test_helper 2121 0
-ip netns exec ${ns1} sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
-ip netns exec ${ns2} sysctl -qe 'net.netfilter.nf_conntrack_helper=1'
-test_helper 21 1
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh
deleted file mode 100755
index dff476e45e77..000000000000
--- a/tools/testing/selftests/netfilter/nft_fib.sh
+++ /dev/null
@@ -1,273 +0,0 @@
-#!/bin/bash
-#
-# This tests the fib expression.
-#
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-sfx=$(mktemp -u "XXXXXXXX")
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-nsrouter="nsrouter-$sfx"
-timeout=4
-
-log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
-
-cleanup()
-{
-	ip netns del ${ns1}
-	ip netns del ${ns2}
-	ip netns del ${nsrouter}
-
-	[ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
-}
-
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-ip netns add ${nsrouter}
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace"
-	exit $ksft_skip
-fi
-
-trap cleanup EXIT
-
-dmesg | grep -q ' nft_rpfilter: '
-if [ $? -eq 0 ]; then
-	dmesg -c | grep ' nft_rpfilter: '
-	echo "WARN: a previous test run has failed" 1>&2
-fi
-
-sysctl -q net.netfilter.nf_log_all_netns=1
-ip netns add ${ns1}
-ip netns add ${ns2}
-
-load_ruleset() {
-	local netns=$1
-
-ip netns exec ${netns} nft -f /dev/stdin <<EOF
-table inet filter {
-	chain prerouting {
-		type filter hook prerouting priority 0; policy accept;
-	        fib saddr . iif oif missing counter log prefix "$netns nft_rpfilter: " drop
-	}
-}
-EOF
-}
-
-load_pbr_ruleset() {
-	local netns=$1
-
-ip netns exec ${netns} nft -f /dev/stdin <<EOF
-table inet filter {
-	chain forward {
-		type filter hook forward priority raw;
-		fib saddr . iif oif gt 0 accept
-		log drop
-	}
-}
-EOF
-}
-
-load_ruleset_count() {
-	local netns=$1
-
-ip netns exec ${netns} nft -f /dev/stdin <<EOF
-table inet filter {
-	chain prerouting {
-		type filter hook prerouting priority 0; policy accept;
-		ip daddr 1.1.1.1 fib saddr . iif oif missing counter drop
-		ip6 daddr 1c3::c01d fib saddr . iif oif missing counter drop
-	}
-}
-EOF
-}
-
-check_drops() {
-	dmesg | grep -q ' nft_rpfilter: '
-	if [ $? -eq 0 ]; then
-		dmesg | grep ' nft_rpfilter: '
-		echo "FAIL: rpfilter did drop packets"
-		return 1
-	fi
-
-	return 0
-}
-
-check_fib_counter() {
-	local want=$1
-	local ns=$2
-	local address=$3
-
-	line=$(ip netns exec ${ns} nft list table inet filter | grep 'fib saddr . iif' | grep $address | grep "packets $want" )
-	ret=$?
-
-	if [ $ret -ne 0 ];then
-		echo "Netns $ns fib counter doesn't match expected packet count of $want for $address" 1>&2
-		ip netns exec ${ns} nft list table inet filter
-		return 1
-	fi
-
-	if [ $want -gt 0 ]; then
-		echo "PASS: fib expression did drop packets for $address"
-	fi
-
-	return 0
-}
-
-load_ruleset ${nsrouter}
-load_ruleset ${ns1}
-load_ruleset ${ns2}
-
-ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1
-if [ $? -ne 0 ];then
-    echo "SKIP: No virtual ethernet pair device support in kernel"
-    exit $ksft_skip
-fi
-ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2}
-
-ip -net ${nsrouter} link set lo up
-ip -net ${nsrouter} link set veth0 up
-ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0
-ip -net ${nsrouter} addr add dead:1::1/64 dev veth0
-
-ip -net ${nsrouter} link set veth1 up
-ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1
-ip -net ${nsrouter} addr add dead:2::1/64 dev veth1
-
-ip -net ${ns1} link set lo up
-ip -net ${ns1} link set eth0 up
-
-ip -net ${ns2} link set lo up
-ip -net ${ns2} link set eth0 up
-
-ip -net ${ns1} addr add 10.0.1.99/24 dev eth0
-ip -net ${ns1} addr add dead:1::99/64 dev eth0
-ip -net ${ns1} route add default via 10.0.1.1
-ip -net ${ns1} route add default via dead:1::1
-
-ip -net ${ns2} addr add 10.0.2.99/24 dev eth0
-ip -net ${ns2} addr add dead:2::99/64 dev eth0
-ip -net ${ns2} route add default via 10.0.2.1
-ip -net ${ns2} route add default via dead:2::1
-
-test_ping() {
-  local daddr4=$1
-  local daddr6=$2
-
-  ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null
-  ret=$?
-  if [ $ret -ne 0 ];then
-	check_drops
-	echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2
-	return 1
-  fi
-
-  ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null
-  ret=$?
-  if [ $ret -ne 0 ];then
-	check_drops
-	echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2
-	return 1
-  fi
-
-  return 0
-}
-
-ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
-ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
-ip netns exec ${nsrouter} sysctl net.ipv4.conf.all.rp_filter=0 > /dev/null
-ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.rp_filter=0 > /dev/null
-
-sleep 3
-
-test_ping 10.0.2.1 dead:2::1 || exit 1
-check_drops || exit 1
-
-test_ping 10.0.2.99 dead:2::99 || exit 1
-check_drops || exit 1
-
-echo "PASS: fib expression did not cause unwanted packet drops"
-
-ip netns exec ${nsrouter} nft flush table inet filter
-
-ip -net ${ns1} route del default
-ip -net ${ns1} -6 route del default
-
-ip -net ${ns1} addr del 10.0.1.99/24 dev eth0
-ip -net ${ns1} addr del dead:1::99/64 dev eth0
-
-ip -net ${ns1} addr add 10.0.2.99/24 dev eth0
-ip -net ${ns1} addr add dead:2::99/64 dev eth0
-
-ip -net ${ns1} route add default via 10.0.2.1
-ip -net ${ns1} -6 route add default via dead:2::1
-
-ip -net ${nsrouter} addr add dead:2::1/64 dev veth0
-
-# switch to ruleset that doesn't log, this time
-# its expected that this does drop the packets.
-load_ruleset_count ${nsrouter}
-
-# ns1 has a default route, but nsrouter does not.
-# must not check return value, ping to 1.1.1.1 will
-# fail.
-check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1
-check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1
-
-ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null
-check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1
-
-sleep 2
-ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null
-check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1
-
-# delete all rules
-ip netns exec ${ns1} nft flush ruleset
-ip netns exec ${ns2} nft flush ruleset
-ip netns exec ${nsrouter} nft flush ruleset
-
-ip -net ${ns1} addr add 10.0.1.99/24 dev eth0
-ip -net ${ns1} addr add dead:1::99/64 dev eth0
-
-ip -net ${ns1} addr del 10.0.2.99/24 dev eth0
-ip -net ${ns1} addr del dead:2::99/64 dev eth0
-
-ip -net ${nsrouter} addr del dead:2::1/64 dev veth0
-
-# ... pbr ruleset for the router, check iif+oif.
-load_pbr_ruleset ${nsrouter}
-if [ $? -ne 0 ] ; then
-	echo "SKIP: Could not load fib forward ruleset"
-	exit $ksft_skip
-fi
-
-ip -net ${nsrouter} rule add from all table 128
-ip -net ${nsrouter} rule add from all iif veth0 table 129
-ip -net ${nsrouter} route add table 128 to 10.0.1.0/24 dev veth0
-ip -net ${nsrouter} route add table 129 to 10.0.2.0/24 dev veth1
-
-# drop main ipv4 table
-ip -net ${nsrouter} -4 rule delete table main
-
-test_ping 10.0.2.99 dead:2::99
-if [ $? -ne 0 ] ; then
-	ip -net ${nsrouter} nft list ruleset
-	echo "FAIL: fib mismatch in pbr setup"
-	exit 1
-fi
-
-echo "PASS: fib expression forward check with policy based routing"
-exit 0
diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh
deleted file mode 100755
index e12729753351..000000000000
--- a/tools/testing/selftests/netfilter/nft_queue.sh
+++ /dev/null
@@ -1,449 +0,0 @@
-#!/bin/bash
-#
-# This tests nf_queue:
-# 1. can process packets from all hooks
-# 2. support running nfqueue from more than one base chain
-#
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-sfx=$(mktemp -u "XXXXXXXX")
-ns1="ns1-$sfx"
-ns2="ns2-$sfx"
-nsrouter="nsrouter-$sfx"
-timeout=4
-
-cleanup()
-{
-	ip netns pids ${ns1} | xargs kill 2>/dev/null
-	ip netns pids ${ns2} | xargs kill 2>/dev/null
-	ip netns pids ${nsrouter} | xargs kill 2>/dev/null
-
-	ip netns del ${ns1}
-	ip netns del ${ns2}
-	ip netns del ${nsrouter}
-	rm -f "$TMPFILE0"
-	rm -f "$TMPFILE1"
-	rm -f "$TMPFILE2" "$TMPFILE3"
-}
-
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-ip netns add ${nsrouter}
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not create net namespace"
-	exit $ksft_skip
-fi
-
-TMPFILE0=$(mktemp)
-TMPFILE1=$(mktemp)
-TMPFILE2=$(mktemp)
-TMPFILE3=$(mktemp)
-trap cleanup EXIT
-
-ip netns add ${ns1}
-ip netns add ${ns2}
-
-ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1
-if [ $? -ne 0 ];then
-    echo "SKIP: No virtual ethernet pair device support in kernel"
-    exit $ksft_skip
-fi
-ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2}
-
-ip -net ${nsrouter} link set lo up
-ip -net ${nsrouter} link set veth0 up
-ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0
-ip -net ${nsrouter} addr add dead:1::1/64 dev veth0
-
-ip -net ${nsrouter} link set veth1 up
-ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1
-ip -net ${nsrouter} addr add dead:2::1/64 dev veth1
-
-ip -net ${ns1} link set lo up
-ip -net ${ns1} link set eth0 up
-
-ip -net ${ns2} link set lo up
-ip -net ${ns2} link set eth0 up
-
-ip -net ${ns1} addr add 10.0.1.99/24 dev eth0
-ip -net ${ns1} addr add dead:1::99/64 dev eth0
-ip -net ${ns1} route add default via 10.0.1.1
-ip -net ${ns1} route add default via dead:1::1
-
-ip -net ${ns2} addr add 10.0.2.99/24 dev eth0
-ip -net ${ns2} addr add dead:2::99/64 dev eth0
-ip -net ${ns2} route add default via 10.0.2.1
-ip -net ${ns2} route add default via dead:2::1
-
-load_ruleset() {
-	local name=$1
-	local prio=$2
-
-ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
-table inet $name {
-	chain nfq {
-		ip protocol icmp queue bypass
-		icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass
-	}
-	chain pre {
-		type filter hook prerouting priority $prio; policy accept;
-		jump nfq
-	}
-	chain input {
-		type filter hook input priority $prio; policy accept;
-		jump nfq
-	}
-	chain forward {
-		type filter hook forward priority $prio; policy accept;
-		tcp dport 12345 queue num 2
-		jump nfq
-	}
-	chain output {
-		type filter hook output priority $prio; policy accept;
-		tcp dport 12345 queue num 3
-		tcp sport 23456 queue num 3
-		jump nfq
-	}
-	chain post {
-		type filter hook postrouting priority $prio; policy accept;
-		jump nfq
-	}
-}
-EOF
-}
-
-load_counter_ruleset() {
-	local prio=$1
-
-ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
-table inet countrules {
-	chain pre {
-		type filter hook prerouting priority $prio; policy accept;
-		counter
-	}
-	chain input {
-		type filter hook input priority $prio; policy accept;
-		counter
-	}
-	chain forward {
-		type filter hook forward priority $prio; policy accept;
-		counter
-	}
-	chain output {
-		type filter hook output priority $prio; policy accept;
-		counter
-	}
-	chain post {
-		type filter hook postrouting priority $prio; policy accept;
-		counter
-	}
-}
-EOF
-}
-
-test_ping() {
-  ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null
-  if [ $? -ne 0 ];then
-	return 1
-  fi
-
-  ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null
-  if [ $? -ne 0 ];then
-	return 1
-  fi
-
-  return 0
-}
-
-test_ping_router() {
-  ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null
-  if [ $? -ne 0 ];then
-	return 1
-  fi
-
-  ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null
-  if [ $? -ne 0 ];then
-	return 1
-  fi
-
-  return 0
-}
-
-test_queue_blackhole() {
-	local proto=$1
-
-ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
-table $proto blackh {
-	chain forward {
-	type filter hook forward priority 0; policy accept;
-		queue num 600
-	}
-}
-EOF
-	if [ $proto = "ip" ] ;then
-		ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null
-		lret=$?
-	elif [ $proto = "ip6" ]; then
-		ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null
-		lret=$?
-	else
-		lret=111
-	fi
-
-	# queue without bypass keyword should drop traffic if no listener exists.
-	if [ $lret -eq 0 ];then
-		echo "FAIL: $proto expected failure, got $lret" 1>&2
-		exit 1
-	fi
-
-	ip netns exec ${nsrouter} nft delete table $proto blackh
-	if [ $? -ne 0 ] ;then
-	        echo "FAIL: $proto: Could not delete blackh table"
-	        exit 1
-	fi
-
-        echo "PASS: $proto: statement with no listener results in packet drop"
-}
-
-test_queue()
-{
-	local expected=$1
-	local last=""
-
-	# spawn nf-queue listeners
-	ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" &
-	ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" &
-	sleep 1
-	test_ping
-	ret=$?
-	if [ $ret -ne 0 ];then
-		echo "FAIL: netns routing/connectivity with active listener on queue $queue: $ret" 1>&2
-		exit $ret
-	fi
-
-	test_ping_router
-	ret=$?
-	if [ $ret -ne 0 ];then
-		echo "FAIL: netns router unreachable listener on queue $queue: $ret" 1>&2
-		exit $ret
-	fi
-
-	wait
-	ret=$?
-
-	for file in $TMPFILE0 $TMPFILE1; do
-		last=$(tail -n1 "$file")
-		if [ x"$last" != x"$expected packets total" ]; then
-			echo "FAIL: Expected $expected packets total, but got $last" 1>&2
-			cat "$file" 1>&2
-
-			ip netns exec ${nsrouter} nft list ruleset
-			exit 1
-		fi
-	done
-
-	echo "PASS: Expected and received $last"
-}
-
-test_tcp_forward()
-{
-	ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout &
-	local nfqpid=$!
-
-	tmpfile=$(mktemp) || exit 1
-	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
-	ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
-	local rpid=$!
-
-	sleep 1
-	ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null &
-
-	rm -f "$tmpfile"
-
-	wait $rpid
-	wait $lpid
-	[ $? -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain"
-}
-
-test_tcp_localhost()
-{
-	tmpfile=$(mktemp) || exit 1
-
-	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
-	ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
-	local rpid=$!
-
-	ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout &
-	local nfqpid=$!
-
-	sleep 1
-	ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null
-	rm -f "$tmpfile"
-
-	wait $rpid
-	[ $? -eq 0 ] && echo "PASS: tcp via loopback"
-	wait 2>/dev/null
-}
-
-test_tcp_localhost_connectclose()
-{
-	tmpfile=$(mktemp) || exit 1
-
-	ip netns exec ${nsrouter} ./connect_close -p 23456 -t $timeout &
-
-	ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout &
-	local nfqpid=$!
-
-	sleep 1
-	rm -f "$tmpfile"
-
-	wait $rpid
-	[ $? -eq 0 ] && echo "PASS: tcp via loopback with connect/close"
-	wait 2>/dev/null
-}
-
-test_tcp_localhost_requeue()
-{
-ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
-flush ruleset
-table inet filter {
-	chain output {
-		type filter hook output priority 0; policy accept;
-		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
-	}
-	chain post {
-		type filter hook postrouting priority 0; policy accept;
-		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
-	}
-}
-EOF
-	tmpfile=$(mktemp) || exit 1
-	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
-	ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
-	local rpid=$!
-
-	ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" &
-
-	# nfqueue 1 will be called via output hook.  But this time,
-        # re-queue the packet to nfqueue program on queue 2.
-	ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" &
-
-	sleep 1
-	ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null
-	rm -f "$tmpfile"
-
-	wait
-
-	if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then
-		echo "FAIL: lost packets during requeue?!" 1>&2
-		return
-	fi
-
-	echo "PASS: tcp via loopback and re-queueing"
-}
-
-test_icmp_vrf() {
-	ip -net $ns1 link add tvrf type vrf table 9876
-	if [ $? -ne 0 ];then
-		echo "SKIP: Could not add vrf device"
-		return
-	fi
-
-	ip -net $ns1 li set eth0 master tvrf
-	ip -net $ns1 li set tvrf up
-
-	ip -net $ns1 route add 10.0.2.0/24 via 10.0.1.1 dev eth0 table 9876
-ip netns exec ${ns1} nft -f /dev/stdin <<EOF
-flush ruleset
-table inet filter {
-	chain output {
-		type filter hook output priority 0; policy accept;
-		meta oifname "tvrf" icmp type echo-request counter queue num 1
-		meta oifname "eth0" icmp type echo-request counter queue num 1
-	}
-	chain post {
-		type filter hook postrouting priority 0; policy accept;
-		meta oifname "tvrf" icmp type echo-request counter queue num 1
-		meta oifname "eth0" icmp type echo-request counter queue num 1
-	}
-}
-EOF
-	ip netns exec ${ns1} ./nf-queue -q 1 -t $timeout &
-	local nfqpid=$!
-
-	sleep 1
-	ip netns exec ${ns1} ip vrf exec tvrf ping -c 1 10.0.2.99 > /dev/null
-
-	for n in output post; do
-		for d in tvrf eth0; do
-			ip netns exec ${ns1} nft list chain inet filter $n | grep -q "oifname \"$d\" icmp type echo-request counter packets 1"
-			if [ $? -ne 0 ] ; then
-				echo "FAIL: chain $n: icmp packet counter mismatch for device $d" 1>&2
-				ip netns exec ${ns1} nft list ruleset
-				ret=1
-				return
-			fi
-		done
-	done
-
-	wait $nfqpid
-	[ $? -eq 0 ] && echo "PASS: icmp+nfqueue via vrf"
-	wait 2>/dev/null
-}
-
-ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
-ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
-ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
-
-load_ruleset "filter" 0
-
-sleep 3
-
-test_ping
-ret=$?
-if [ $ret -eq 0 ];then
-	# queue bypass works (rules were skipped, no listener)
-	echo "PASS: ${ns1} can reach ${ns2}"
-else
-	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
-	exit $ret
-fi
-
-test_queue_blackhole ip
-test_queue_blackhole ip6
-
-# dummy ruleset to add base chains between the
-# queueing rules.  We don't want the second reinject
-# to re-execute the old hooks.
-load_counter_ruleset 10
-
-# we are hooking all: prerouting/input/forward/output/postrouting.
-# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so:
-# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply).
-# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply.
-# so we expect that userspace program receives 10 packets.
-test_queue 10
-
-# same.  We queue to a second program as well.
-load_ruleset "filter2" 20
-test_queue 20
-
-test_tcp_forward
-test_tcp_localhost
-test_tcp_localhost_connectclose
-test_tcp_localhost_requeue
-test_icmp_vrf
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_synproxy.sh b/tools/testing/selftests/netfilter/nft_synproxy.sh
deleted file mode 100755
index b62933b680d6..000000000000
--- a/tools/testing/selftests/netfilter/nft_synproxy.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-ret=0
-
-rnd=$(mktemp -u XXXXXXXX)
-nsr="nsr-$rnd"	# synproxy machine
-ns1="ns1-$rnd"  # iperf client
-ns2="ns2-$rnd"  # iperf server
-
-checktool (){
-	if ! $1 > /dev/null 2>&1; then
-		echo "SKIP: Could not $2"
-		exit $ksft_skip
-	fi
-}
-
-checktool "nft --version" "run test without nft tool"
-checktool "ip -Version" "run test without ip tool"
-checktool "iperf3 --version" "run test without iperf3"
-checktool "ip netns add $nsr" "create net namespace"
-
-modprobe -q nf_conntrack
-
-ip netns add $ns1
-ip netns add $ns2
-
-cleanup() {
-	ip netns pids $ns1 | xargs kill 2>/dev/null
-	ip netns pids $ns2 | xargs kill 2>/dev/null
-	ip netns del $ns1
-	ip netns del $ns2
-
-	ip netns del $nsr
-}
-
-trap cleanup EXIT
-
-ip link add veth0 netns $nsr type veth peer name eth0 netns $ns1
-ip link add veth1 netns $nsr type veth peer name eth0 netns $ns2
-
-for dev in lo veth0 veth1; do
-ip -net $nsr link set $dev up
-done
-
-ip -net $nsr addr add 10.0.1.1/24 dev veth0
-ip -net $nsr addr add 10.0.2.1/24 dev veth1
-
-ip netns exec $nsr sysctl -q net.ipv4.conf.veth0.forwarding=1
-ip netns exec $nsr sysctl -q net.ipv4.conf.veth1.forwarding=1
-ip netns exec $nsr sysctl -q net.netfilter.nf_conntrack_tcp_loose=0
-
-for n in $ns1 $ns2; do
-  ip -net $n link set lo up
-  ip -net $n link set eth0 up
-done
-ip -net $ns1 addr add 10.0.1.99/24 dev eth0
-ip -net $ns2 addr add 10.0.2.99/24 dev eth0
-ip -net $ns1 route add default via 10.0.1.1
-ip -net $ns2 route add default via 10.0.2.1
-
-# test basic connectivity
-if ! ip netns exec $ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
-  echo "ERROR: $ns1 cannot reach $ns2" 1>&2
-  exit 1
-fi
-
-if ! ip netns exec $ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
-  echo "ERROR: $ns2 cannot reach $ns1" 1>&2
-  exit 1
-fi
-
-ip netns exec $ns2 iperf3 -s > /dev/null 2>&1 &
-# ip netns exec $nsr tcpdump -vvv -n -i veth1 tcp | head -n 10 &
-
-sleep 1
-
-ip netns exec $nsr nft -f - <<EOF
-table inet filter {
-   chain prerouting {
-      type filter hook prerouting priority -300; policy accept;
-      meta iif veth0 tcp flags syn counter notrack
-   }
-
-  chain forward {
-      type filter hook forward priority 0; policy accept;
-
-      ct state new,established counter accept
-
-      meta iif veth0 meta l4proto tcp ct state untracked,invalid synproxy mss 1460 sack-perm timestamp
-
-      ct state invalid counter drop
-
-      # make ns2 unreachable w.o. tcp synproxy
-      tcp flags syn counter drop
-   }
-}
-EOF
-if [ $? -ne 0 ]; then
-	echo "SKIP: Cannot add nft synproxy"
-	exit $ksft_skip
-fi
-
-ip netns exec $ns1 timeout 5 iperf3 -c 10.0.2.99 -n $((1 * 1024 * 1024)) > /dev/null
-
-if [ $? -ne 0 ]; then
-	echo "FAIL: iperf3 returned an error" 1>&2
-	ret=$?
-	ip netns exec $nsr nft list ruleset
-else
-	echo "PASS: synproxy connection successful"
-fi
-
-exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh
deleted file mode 100755
index 2ffba45a78bf..000000000000
--- a/tools/testing/selftests/netfilter/nft_trans_stress.sh
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/bin/bash
-#
-# This test is for stress-testing the nf_tables config plane path vs.
-# packet path processing: Make sure we never release rules that are
-# still visible to other cpus.
-#
-# set -e
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-testns=testns-$(mktemp -u "XXXXXXXX")
-tmp=""
-
-tables="foo bar baz quux"
-global_ret=0
-eret=0
-lret=0
-
-cleanup() {
-	ip netns pids "$testns" | xargs kill 2>/dev/null
-	ip netns del "$testns"
-
-	rm -f "$tmp"
-}
-
-check_result()
-{
-	local r=$1
-	local OK="PASS"
-
-	if [ $r -ne 0 ] ;then
-		OK="FAIL"
-		global_ret=$r
-	fi
-
-	echo "$OK: nft $2 test returned $r"
-
-	eret=0
-}
-
-nft --version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without nft tool"
-	exit $ksft_skip
-fi
-
-ip -Version > /dev/null 2>&1
-if [ $? -ne 0 ];then
-	echo "SKIP: Could not run test without ip tool"
-	exit $ksft_skip
-fi
-
-trap cleanup EXIT
-tmp=$(mktemp)
-
-for table in $tables; do
-	echo add table inet "$table" >> "$tmp"
-	echo flush table inet "$table" >> "$tmp"
-
-	echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp"
-	echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp"
-	for c in $(seq 1 400); do
-		chain=$(printf "chain%03u" "$c")
-		echo "add chain inet $table $chain" >> "$tmp"
-	done
-
-	for c in $(seq 1 400); do
-		chain=$(printf "chain%03u" "$c")
-		for BASE in INPUT OUTPUT; do
-			echo "add rule inet $table $BASE counter jump $chain" >> "$tmp"
-		done
-		echo "add rule inet $table $chain counter return" >> "$tmp"
-	done
-done
-
-ip netns add "$testns"
-ip -netns "$testns" link set lo up
-
-lscpu | grep ^CPU\(s\): | ( read cpu cpunum ;
-cpunum=$((cpunum-1))
-for i in $(seq 0 $cpunum);do
-	mask=$(printf 0x%x $((1<<$i)))
-        ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null &
-        ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null &
-done)
-
-sleep 1
-
-ip netns exec "$testns" nft -f "$tmp"
-for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done
-
-for table in $tables;do
-	randsleep=$((RANDOM%2))
-	sleep $randsleep
-	ip netns exec "$testns" nft delete table inet $table
-	lret=$?
-	if [ $lret -ne 0 ]; then
-		eret=$lret
-	fi
-done
-
-check_result $eret "add/delete"
-
-for i in $(seq 1 10) ; do
-	(echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin
-
-	lret=$?
-	if [ $lret -ne 0 ]; then
-		eret=$lret
-	fi
-done
-
-check_result $eret "reload"
-
-for i in $(seq 1 10) ; do
-	(echo "flush ruleset"; cat "$tmp"
-	 echo "insert rule inet foo INPUT meta nftrace set 1"
-	 echo "insert rule inet foo OUTPUT meta nftrace set 1"
-	 ) | ip netns exec "$testns" nft -f /dev/stdin
-	lret=$?
-	if [ $lret -ne 0 ]; then
-		eret=$lret
-	fi
-
-	(echo "flush ruleset"; cat "$tmp"
-	 ) | ip netns exec "$testns" nft -f /dev/stdin
-
-	lret=$?
-	if [ $lret -ne 0 ]; then
-		eret=$lret
-	fi
-done
-
-check_result $eret "add/delete with nftrace enabled"
-
-echo "insert rule inet foo INPUT meta nftrace set 1" >> $tmp
-echo "insert rule inet foo OUTPUT meta nftrace set 1" >> $tmp
-
-for i in $(seq 1 10) ; do
-	(echo "flush ruleset"; cat "$tmp") | ip netns exec "$testns" nft -f /dev/stdin
-
-	lret=$?
-	if [ $lret -ne 0 ]; then
-		eret=1
-	fi
-done
-
-check_result $lret "add/delete with nftrace enabled"
-
-exit $global_ret
diff --git a/tools/testing/selftests/netfilter/settings b/tools/testing/selftests/netfilter/settings
deleted file mode 100644
index 6091b45d226b..000000000000
--- a/tools/testing/selftests/netfilter/settings
+++ /dev/null
@@ -1 +0,0 @@
-timeout=120
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 6ba4f8275ac4..94bb6e11c16f 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -27,6 +27,7 @@
 #include <sys/syscall.h>
 #include <sys/sysmacros.h>
 #include <sys/time.h>
+#include <sys/utsname.h>
 #include <sys/wait.h>
 #include <dirent.h>
 #include <errno.h>
@@ -600,6 +601,25 @@ int expect_strne(const char *expr, int llen, const char *cmp)
 	return ret;
 }
 
+#define EXPECT_STRBUFEQ(cond, expr, buf, val, cmp)				\
+	do { if (!(cond)) result(llen, SKIPPED); else ret += expect_str_buf_eq(expr, buf, val, llen, cmp); } while (0)
+
+static __attribute__((unused))
+int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const char *cmp)
+{
+	llen += printf(" = %lu <%s> ", expr, buf);
+	if (strcmp(buf, cmp) != 0) {
+		result(llen, FAIL);
+		return 1;
+	}
+	if (expr != val) {
+		result(llen, FAIL);
+		return 1;
+	}
+
+	result(llen, OK);
+	return 0;
+}
 
 /* declare tests based on line numbers. There must be exactly one test per line. */
 #define CASE_TEST(name) \
@@ -761,6 +781,45 @@ int test_stat_timestamps(void)
 	return 0;
 }
 
+int test_uname(void)
+{
+	struct utsname buf;
+	char osrelease[sizeof(buf.release)];
+	ssize_t r;
+	int fd;
+
+	memset(&buf.domainname, 'P', sizeof(buf.domainname));
+
+	if (uname(&buf))
+		return 1;
+
+	if (strncmp("Linux", buf.sysname, sizeof(buf.sysname)))
+		return 1;
+
+	fd = open("/proc/sys/kernel/osrelease", O_RDONLY);
+	if (fd == -1)
+		return 1;
+
+	r = read(fd, osrelease, sizeof(osrelease));
+	if (r == -1)
+		return 1;
+
+	close(fd);
+
+	if (osrelease[r - 1] == '\n')
+		r--;
+
+	/* Validate one of the later fields to ensure field sizes are correct */
+	if (strncmp(osrelease, buf.release, r))
+		return 1;
+
+	/* Ensure the field domainname is set, it is missing from struct old_utsname */
+	if (strnlen(buf.domainname, sizeof(buf.domainname)) == sizeof(buf.domainname))
+		return 1;
+
+	return 0;
+}
+
 int test_mmap_munmap(void)
 {
 	int ret, fd, i, page_size;
@@ -966,6 +1025,8 @@ int run_syscall(int min, int max)
 		CASE_TEST(stat_fault);        EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break;
 		CASE_TEST(stat_timestamps);   EXPECT_SYSZR(1, test_stat_timestamps()); break;
 		CASE_TEST(symlink_root);      EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break;
+		CASE_TEST(uname);             EXPECT_SYSZR(proc, test_uname()); break;
+		CASE_TEST(uname_fault);       EXPECT_SYSER(1, uname(NULL), -1, EFAULT); break;
 		CASE_TEST(unlink_root);       EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break;
 		CASE_TEST(unlink_blah);       EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break;
 		CASE_TEST(wait_child);        EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break;
@@ -991,6 +1052,14 @@ int run_stdlib(int min, int max)
 	for (test = min; test >= 0 && test <= max; test++) {
 		int llen = 0; /* line length */
 
+		/* For functions that take a long buffer, like strlcat()
+		 * Add some more chars after the \0, to test functions that overwrite the buffer set
+		 * the \0 at the exact right position.
+		 */
+		char buf[10] = "test123456";
+		buf[4] = '\0';
+
+
 		/* avoid leaving empty lines below, this will insert holes into
 		 * test numbers.
 		 */
@@ -1007,6 +1076,19 @@ int run_stdlib(int min, int max)
 		CASE_TEST(strchr_foobar_z);    EXPECT_STRZR(1, strchr("foobar", 'z')); break;
 		CASE_TEST(strrchr_foobar_o);   EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break;
 		CASE_TEST(strrchr_foobar_z);   EXPECT_STRZR(1, strrchr("foobar", 'z')); break;
+#ifdef NOLIBC
+		CASE_TEST(strlcat_0);          EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 0), buf, 3, "test"); break;
+		CASE_TEST(strlcat_1);          EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 1), buf, 4, "test"); break;
+		CASE_TEST(strlcat_5);          EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 5), buf, 7, "test"); break;
+		CASE_TEST(strlcat_6);          EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 6), buf, 7, "testb"); break;
+		CASE_TEST(strlcat_7);          EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 7), buf, 7, "testba"); break;
+		CASE_TEST(strlcat_8);          EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 8), buf, 7, "testbar"); break;
+		CASE_TEST(strlcpy_0);          EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 0), buf, 3, "test"); break;
+		CASE_TEST(strlcpy_1);          EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 1), buf, 3, ""); break;
+		CASE_TEST(strlcpy_2);          EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 2), buf, 3, "b"); break;
+		CASE_TEST(strlcpy_3);          EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 3), buf, 3, "ba"); break;
+		CASE_TEST(strlcpy_4);          EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 4), buf, 3, "bar"); break;
+#endif
 		CASE_TEST(memcmp_20_20);       EXPECT_EQ(1, memcmp("aaa\x20", "aaa\x20", 4), 0); break;
 		CASE_TEST(memcmp_20_60);       EXPECT_LT(1, memcmp("aaa\x20", "aaa\x60", 4), 0); break;
 		CASE_TEST(memcmp_60_20);       EXPECT_GT(1, memcmp("aaa\x60", "aaa\x20", 4), 0); break;
diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore
index 790c47001e77..ee93dc4969b8 100644
--- a/tools/testing/selftests/perf_events/.gitignore
+++ b/tools/testing/selftests/perf_events/.gitignore
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 sigtrap_threads
 remove_on_exec
+watermark_signal
diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile
index db93c4ff081a..70e3ff211278 100644
--- a/tools/testing/selftests/perf_events/Makefile
+++ b/tools/testing/selftests/perf_events/Makefile
@@ -2,5 +2,5 @@
 CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
 LDFLAGS += -lpthread
 
-TEST_GEN_PROGS := sigtrap_threads remove_on_exec
+TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
 include ../lib.mk
diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c
new file mode 100644
index 000000000000..49dc1e831174
--- /dev/null
+++ b/tools/testing/selftests/perf_events/watermark_signal.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <stddef.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#define __maybe_unused __attribute__((__unused__))
+
+static int sigio_count;
+
+static void handle_sigio(int signum __maybe_unused,
+			 siginfo_t *oh __maybe_unused,
+			 void *uc __maybe_unused)
+{
+	++sigio_count;
+}
+
+static void do_child(void)
+{
+	raise(SIGSTOP);
+
+	for (int i = 0; i < 20; ++i)
+		sleep(1);
+
+	raise(SIGSTOP);
+
+	exit(0);
+}
+
+TEST(watermark_signal)
+{
+	struct perf_event_attr attr;
+	struct perf_event_mmap_page *p = NULL;
+	struct sigaction previous_sigio, sigio = { 0 };
+	pid_t child = -1;
+	int child_status;
+	int fd = -1;
+	long page_size = sysconf(_SC_PAGE_SIZE);
+
+	sigio.sa_sigaction = handle_sigio;
+	EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.size = sizeof(attr);
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_DUMMY;
+	attr.sample_period = 1;
+	attr.disabled = 1;
+	attr.watermark = 1;
+	attr.context_switch = 1;
+	attr.wakeup_watermark = 1;
+
+	child = fork();
+	EXPECT_GE(child, 0);
+	if (child == 0)
+		do_child();
+	else if (child < 0) {
+		perror("fork()");
+		goto cleanup;
+	}
+
+	if (waitpid(child, &child_status, WSTOPPED) != child ||
+	    !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) {
+		fprintf(stderr,
+			"failed to sycnhronize with child errno=%d status=%x\n",
+			errno,
+			child_status);
+		goto cleanup;
+	}
+
+	fd = syscall(__NR_perf_event_open, &attr, child, -1, -1,
+		     PERF_FLAG_FD_CLOEXEC);
+	if (fd < 0) {
+		fprintf(stderr, "failed opening event %llx\n", attr.config);
+		goto cleanup;
+	}
+
+	if (fcntl(fd, F_SETFL, FASYNC)) {
+		perror("F_SETFL FASYNC");
+		goto cleanup;
+	}
+
+	if (fcntl(fd, F_SETOWN, getpid())) {
+		perror("F_SETOWN getpid()");
+		goto cleanup;
+	}
+
+	if (fcntl(fd, F_SETSIG, SIGIO)) {
+		perror("F_SETSIG SIGIO");
+		goto cleanup;
+	}
+
+	p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (p == NULL) {
+		perror("mmap");
+		goto cleanup;
+	}
+
+	if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) {
+		perror("PERF_EVENT_IOC_ENABLE");
+		goto cleanup;
+	}
+
+	if (kill(child, SIGCONT) < 0) {
+		perror("SIGCONT");
+		goto cleanup;
+	}
+
+	if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR)
+		fprintf(stderr,
+			"expected SIGIO to terminate wait errno=%d status=%x\n%d",
+			errno,
+			child_status,
+			sigio_count);
+
+	EXPECT_GE(sigio_count, 1);
+
+cleanup:
+	if (p != NULL)
+		munmap(p, 2 * page_size);
+
+	if (fd >= 0)
+		close(fd);
+
+	if (child > 0) {
+		kill(child, SIGKILL);
+		waitpid(child, NULL, 0);
+	}
+
+	sigaction(SIGIO, &previous_sigio, NULL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config
index f6f2965e17af..6133524710f7 100644
--- a/tools/testing/selftests/pidfd/config
+++ b/tools/testing/selftests/pidfd/config
@@ -3,5 +3,7 @@ CONFIG_IPC_NS=y
 CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_NET_NS=y
+CONFIG_TIME_NS=y
+CONFIG_GENERIC_VDSO_TIME_NS=y
 CONFIG_CGROUPS=y
 CONFIG_CHECKPOINT_RESTORE=y
diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
index 01cc37bf611c..f062a986e382 100644
--- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
@@ -307,5 +307,5 @@ int main(int argc, char **argv)
 	test_pidfd_fdinfo_nspid();
 	test_pidfd_dead_fdinfo();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
index 8a59438ccc78..c62564c264b1 100644
--- a/tools/testing/selftests/pidfd/pidfd_open_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -159,5 +159,7 @@ on_error:
 	if (pidfd >= 0)
 		close(pidfd);
 
-	return !ret ? ksft_exit_pass() : ksft_exit_fail();
+	if (ret)
+		ksft_exit_fail();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c
index 610811275357..55d74a50358f 100644
--- a/tools/testing/selftests/pidfd/pidfd_poll_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c
@@ -112,5 +112,5 @@ int main(int argc, char **argv)
 	}
 
 	ksft_test_result_pass("pidfd poll test: pass\n");
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
index 6e2f2cd400ca..47746b0c6acd 100644
--- a/tools/testing/selftests/pidfd/pidfd_setns_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -158,7 +158,7 @@ FIXTURE_SETUP(current_nsset)
 	/* Create task that exits right away. */
 	self->child_pid_exited = create_child(&self->child_pidfd_exited,
 					      CLONE_NEWUSER | CLONE_NEWNET);
-	EXPECT_GT(self->child_pid_exited, 0);
+	EXPECT_GE(self->child_pid_exited, 0);
 
 	if (self->child_pid_exited == 0)
 		_exit(EXIT_SUCCESS);
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c
index c081ae91313a..9faa686f90e4 100644
--- a/tools/testing/selftests/pidfd/pidfd_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_test.c
@@ -572,5 +572,5 @@ int main(int argc, char **argv)
 	test_pidfd_send_signal_exited_fail();
 	test_pidfd_send_signal_recycled_pid_fail();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/power_supply/test_power_supply_properties.sh b/tools/testing/selftests/power_supply/test_power_supply_properties.sh
index df272dfe1d2a..a66b1313ed88 100755
--- a/tools/testing/selftests/power_supply/test_power_supply_properties.sh
+++ b/tools/testing/selftests/power_supply/test_power_supply_properties.sh
@@ -23,7 +23,7 @@ count_tests() {
 	total_tests=0
 
 	for i in $SUPPLIES; do
-		total_tests=$(("$total_tests" + "$NUM_TESTS"))
+		total_tests=$((total_tests + NUM_TESTS))
 	done
 
 	echo "$total_tests"
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index a89f1fbf86ec..1d293ab77185 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -47,4 +47,16 @@
 /* Default to taking the first of any alternative feature sections */
 test_feature = 1
 
+#define DCBT_SETUP_STREAMS(from, from_parms, to, to_parms, scratch)	\
+	lis	scratch,0x8000;	/* GO=1 */				\
+	clrldi	scratch,scratch,32;					\
+	/* setup read stream 0 */					\
+	dcbt	0,from,0b01000;		/* addr from */			\
+	dcbt	0,from_parms,0b01010;	/* length and depth from */	\
+	/* setup write stream 1 */					\
+	dcbtst	0,to,0b01000;		/* addr to */			\
+	dcbtst	0,to_parms,0b01010;	/* length and depth to */	\
+	eieio;								\
+	dcbt	0,scratch,0b01010;	/* all streams GO */
+
 #endif /* __SELFTESTS_POWERPC_PPC_ASM_H */
diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
index 505294da1b9f..d6f99eb9be65 100644
--- a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
+++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
@@ -154,7 +154,7 @@ static int dev_papr_vpd_null_handle(void)
 static int papr_vpd_close_handle_without_reading(void)
 {
 	const int devfd = open(DEVPATH, O_RDONLY);
-	struct papr_location_code lc;
+	struct papr_location_code lc = { .str = "", };
 	int fd;
 
 	SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
index bbac5f4b03d0..990d24696fd3 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -391,7 +391,7 @@ __EOF__
 		forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`"
 		deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`"
 		echo " --- Running RCU Tasks Trace flavor $flavor `date`" >> $rtfdir/log
-		tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1
+		tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y CONFIG_KPROBES=n CONFIG_RCU_TRACE=n CONFIG_TRACING=n CONFIG_BLK_DEV_IO_TRACE=n CONFIG_UPROBE_EVENTS=n $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1
 		retcode=$?
 		if test "$retcode" -ne 0
 		then
@@ -425,7 +425,7 @@ fi
 if test "$do_scftorture" = "yes"
 then
 	# Scale memory based on the number of CPUs.
-	scfmem=$((2+HALF_ALLOTED_CPUS/16))
+	scfmem=$((3+HALF_ALLOTED_CPUS/16))
 	torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1"
 	torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make
 fi
@@ -559,7 +559,7 @@ do_kcsan="$do_kcsan_save"
 if test "$do_kvfree" = "yes"
 then
 	torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot"
-	torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make
+	torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make
 fi
 
 if test "$do_clocksourcewd" = "yes"
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
index fc45645bb5f4..9ecd1b4e653d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
@@ -10,8 +10,9 @@ CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_TRACE=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_BOOST=n
+CONFIG_RCU_BOOST=y
+CONFIG_RCU_BOOST_DELAY=100
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-#CHECK#CONFIG_RCU_EXPERT=n
+CONFIG_RCU_EXPERT=y
 CONFIG_KPROBES=n
 CONFIG_FTRACE=n
diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile
index 2deac2031de9..021863f86053 100644
--- a/tools/testing/selftests/resctrl/Makefile
+++ b/tools/testing/selftests/resctrl/Makefile
@@ -5,6 +5,8 @@ CFLAGS += $(KHDR_INCLUDES)
 
 TEST_GEN_PROGS := resctrl_tests
 
+LOCAL_HDRS += $(wildcard *.h)
+
 include ../lib.mk
 
-$(OUTPUT)/resctrl_tests: $(wildcard *.[ch])
+$(OUTPUT)/resctrl_tests: $(wildcard *.c)
diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
index 4cb991be8e31..c7686fb6641a 100644
--- a/tools/testing/selftests/resctrl/cat_test.c
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -128,7 +128,7 @@ static int check_results(struct resctrl_val_param *param, const char *cache_type
 	return fail;
 }
 
-void cat_test_cleanup(void)
+static void cat_test_cleanup(void)
 {
 	remove(RESULT_FILE_NAME);
 }
@@ -284,13 +284,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param
 
 	ret = cat_test(test, uparams, &param, span, start_mask);
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = check_results(&param, test->resource,
 			    cache_total_size, full_cache_mask, start_mask);
-out:
-	cat_test_cleanup();
-
 	return ret;
 }
 
@@ -373,6 +370,7 @@ struct resctrl_test l3_cat_test = {
 	.resource = "L3",
 	.feature_check = test_resource_feature_check,
 	.run_test = cat_run_test,
+	.cleanup = cat_test_cleanup,
 };
 
 struct resctrl_test l3_noncont_cat_test = {
diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c
index a81f91222a89..0105afec6188 100644
--- a/tools/testing/selftests/resctrl/cmt_test.c
+++ b/tools/testing/selftests/resctrl/cmt_test.c
@@ -40,11 +40,11 @@ static int show_results_info(unsigned long sum_llc_val, int no_of_bits,
 	int ret;
 
 	avg_llc_val = sum_llc_val / num_of_runs;
-	avg_diff = (long)abs(cache_span - avg_llc_val);
+	avg_diff = (long)(cache_span - avg_llc_val);
 	diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100;
 
 	ret = platform && abs((int)diff_percent) > max_diff_percent &&
-	      abs(avg_diff) > max_diff;
+	      labs(avg_diff) > max_diff;
 
 	ksft_print_msg("%s Check cache miss rate within %lu%%\n",
 		       ret ? "Fail:" : "Pass:", max_diff_percent);
@@ -91,7 +91,7 @@ static int check_results(struct resctrl_val_param *param, size_t span, int no_of
 				 MAX_DIFF, MAX_DIFF_PERCENT, runs - 1, true);
 }
 
-void cmt_test_cleanup(void)
+static void cmt_test_cleanup(void)
 {
 	remove(RESULT_FILE_NAME);
 }
@@ -161,7 +161,6 @@ static int cmt_run_test(const struct resctrl_test *test, const struct user_param
 		ksft_print_msg("Intel CMT may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n");
 
 out:
-	cmt_test_cleanup();
 	free(span_str);
 
 	return ret;
@@ -178,4 +177,5 @@ struct resctrl_test cmt_test = {
 	.resource = "L3",
 	.feature_check = cmt_feature_check,
 	.run_test = cmt_run_test,
+	.cleanup = cmt_test_cleanup,
 };
diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index 7946e32e85c8..a6ad39aae162 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -60,8 +60,8 @@ static bool show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
 	/* Memory bandwidth from 100% down to 10% */
 	for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP;
 	     allocation++) {
-		unsigned long avg_bw_imc, avg_bw_resc;
 		unsigned long sum_bw_imc = 0, sum_bw_resc = 0;
+		long avg_bw_imc, avg_bw_resc;
 		int avg_diff_per;
 		float avg_diff;
 
@@ -137,7 +137,7 @@ static int check_results(void)
 	return show_mba_info(bw_imc, bw_resc);
 }
 
-void mba_test_cleanup(void)
+static void mba_test_cleanup(void)
 {
 	remove(RESULT_FILE_NAME);
 }
@@ -158,13 +158,10 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param
 
 	ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param);
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = check_results();
 
-out:
-	mba_test_cleanup();
-
 	return ret;
 }
 
@@ -180,4 +177,5 @@ struct resctrl_test mba_test = {
 	.vendor_specific = ARCH_INTEL,
 	.feature_check = mba_feature_check,
 	.run_test = mba_run_test,
+	.cleanup = mba_test_cleanup,
 };
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
index d67ffa3ec63a..6fec51e1ff46 100644
--- a/tools/testing/selftests/resctrl/mbm_test.c
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -17,8 +17,8 @@
 static int
 show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, size_t span)
 {
-	unsigned long avg_bw_imc = 0, avg_bw_resc = 0;
 	unsigned long sum_bw_imc = 0, sum_bw_resc = 0;
+	long avg_bw_imc = 0, avg_bw_resc = 0;
 	int runs, ret, avg_diff_per;
 	float avg_diff = 0;
 
@@ -105,7 +105,7 @@ static int mbm_setup(const struct resctrl_test *test,
 	return ret;
 }
 
-void mbm_test_cleanup(void)
+static void mbm_test_cleanup(void)
 {
 	remove(RESULT_FILE_NAME);
 }
@@ -126,15 +126,12 @@ static int mbm_run_test(const struct resctrl_test *test, const struct user_param
 
 	ret = resctrl_val(test, uparams, uparams->benchmark_cmd, &param);
 	if (ret)
-		goto out;
+		return ret;
 
 	ret = check_results(DEFAULT_SPAN);
 	if (ret && (get_vendor() == ARCH_INTEL))
 		ksft_print_msg("Intel MBM may be inaccurate when Sub-NUMA Clustering is enabled. Check BIOS configuration.\n");
 
-out:
-	mbm_test_cleanup();
-
 	return ret;
 }
 
@@ -150,4 +147,5 @@ struct resctrl_test mbm_test = {
 	.vendor_specific = ARCH_INTEL,
 	.feature_check = mbm_feature_check,
 	.run_test = mbm_run_test,
+	.cleanup = mbm_test_cleanup,
 };
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 2051bd135e0d..00d51fa7531c 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -72,6 +72,7 @@ struct user_params {
  * @disabled:		Test is disabled
  * @feature_check:	Callback to check required resctrl features
  * @run_test:		Callback to run the test
+ * @cleanup:		Callback to cleanup after the test
  */
 struct resctrl_test {
 	const char	*name;
@@ -82,6 +83,7 @@ struct resctrl_test {
 	bool		(*feature_check)(const struct resctrl_test *test);
 	int		(*run_test)(const struct resctrl_test *test,
 				    const struct user_params *uparams);
+	void		(*cleanup)(void);
 };
 
 /*
@@ -156,9 +158,6 @@ int resctrl_val(const struct resctrl_test *test,
 		const struct user_params *uparams,
 		const char * const *benchmark_cmd,
 		struct resctrl_val_param *param);
-void tests_cleanup(void);
-void mbm_test_cleanup(void);
-void mba_test_cleanup(void);
 unsigned long create_bit_mask(unsigned int start, unsigned int len);
 unsigned int count_contiguous_bits(unsigned long val, unsigned int *start);
 int get_full_cbm(const char *cache_type, unsigned long *mask);
@@ -166,11 +165,9 @@ int get_mask_no_shareable(const char *cache_type, unsigned long *mask);
 int get_cache_size(int cpu_no, const char *cache_type, unsigned long *cache_size);
 int resource_info_unsigned_get(const char *resource, const char *filename, unsigned int *val);
 void ctrlc_handler(int signum, siginfo_t *info, void *ptr);
-int signal_handler_register(void);
+int signal_handler_register(const struct resctrl_test *test);
 void signal_handler_unregister(void);
-void cat_test_cleanup(void);
 unsigned int count_bits(unsigned long n);
-void cmt_test_cleanup(void);
 
 void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config);
 void perf_event_initialize_read_format(struct perf_event_read *pe_read);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index f3dc1b9696e7..ecbb7605a981 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -81,19 +81,11 @@ static void cmd_help(void)
 	printf("\t-h: help\n");
 }
 
-void tests_cleanup(void)
-{
-	mbm_test_cleanup();
-	mba_test_cleanup();
-	cmt_test_cleanup();
-	cat_test_cleanup();
-}
-
-static int test_prepare(void)
+static int test_prepare(const struct resctrl_test *test)
 {
 	int res;
 
-	res = signal_handler_register();
+	res = signal_handler_register(test);
 	if (res) {
 		ksft_print_msg("Failed to register signal handler\n");
 		return res;
@@ -108,8 +100,10 @@ static int test_prepare(void)
 	return 0;
 }
 
-static void test_cleanup(void)
+static void test_cleanup(const struct resctrl_test *test)
 {
+	if (test->cleanup)
+		test->cleanup();
 	umount_resctrlfs();
 	signal_handler_unregister();
 }
@@ -136,7 +130,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p
 
 	ksft_print_msg("Starting %s test ...\n", test->name);
 
-	if (test_prepare()) {
+	if (test_prepare(test)) {
 		ksft_exit_fail_msg("Abnormal failure when preparing for the test\n");
 		return;
 	}
@@ -151,7 +145,7 @@ static void run_single_test(const struct resctrl_test *test, const struct user_p
 	ksft_test_result(!ret, "%s: test\n", test->name);
 
 cleanup:
-	test_cleanup();
+	test_cleanup(test);
 }
 
 static void init_user_params(struct user_params *uparams)
@@ -253,13 +247,13 @@ last_arg:
 	 * 2. We execute perf commands
 	 */
 	if (geteuid() != 0)
-		return ksft_exit_skip("Not running as root. Skipping...\n");
+		ksft_exit_skip("Not running as root. Skipping...\n");
 
 	if (!check_resctrlfs_support())
-		return ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n");
+		ksft_exit_skip("resctrl FS does not exist. Enable X86_CPU_RESCTRL config option.\n");
 
 	if (umount_resctrlfs())
-		return ksft_exit_skip("resctrl FS unmount failed.\n");
+		ksft_exit_skip("resctrl FS unmount failed.\n");
 
 	filter_dmesg();
 
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 5a49f07a6c85..445f306d4c2f 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -62,6 +62,7 @@ struct imc_counter_config {
 static char mbm_total_path[1024];
 static int imcs;
 static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
+static const struct resctrl_test *current_test;
 
 void membw_initialize_perf_event_attr(int i, int j)
 {
@@ -472,7 +473,8 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
 	if (bm_pid)
 		kill(bm_pid, SIGKILL);
 	umount_resctrlfs();
-	tests_cleanup();
+	if (current_test && current_test->cleanup)
+		current_test->cleanup();
 	ksft_print_msg("Ending\n\n");
 
 	exit(EXIT_SUCCESS);
@@ -482,13 +484,14 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
  * Register CTRL-C handler for parent, as it has to kill
  * child process before exiting.
  */
-int signal_handler_register(void)
+int signal_handler_register(const struct resctrl_test *test)
 {
 	struct sigaction sigact = {};
 	int ret = 0;
 
 	bm_pid = 0;
 
+	current_test = test;
 	sigact.sa_sigaction = ctrlc_handler;
 	sigemptyset(&sigact.sa_mask);
 	sigact.sa_flags = SA_SIGINFO;
@@ -510,6 +513,7 @@ void signal_handler_unregister(void)
 {
 	struct sigaction sigact = {};
 
+	current_test = NULL;
 	sigact.sa_handler = SIG_DFL;
 	sigemptyset(&sigact.sa_mask);
 	if (sigaction(SIGINT, &sigact, NULL) ||
diff --git a/tools/testing/selftests/riscv/hwprobe/cbo.c b/tools/testing/selftests/riscv/hwprobe/cbo.c
index c537d52fafc5..a40541bb7c7d 100644
--- a/tools/testing/selftests/riscv/hwprobe/cbo.c
+++ b/tools/testing/selftests/riscv/hwprobe/cbo.c
@@ -19,7 +19,7 @@
 #include "hwprobe.h"
 #include "../../kselftest.h"
 
-#define MK_CBO(fn) cpu_to_le32((fn) << 20 | 10 << 15 | 2 << 12 | 0 << 7 | 15)
+#define MK_CBO(fn) le32_bswap((uint32_t)(fn) << 20 | 10 << 15 | 2 << 12 | 0 << 7 | 15)
 
 static char mem[4096] __aligned(4096) = { [0 ... 4095] = 0xa5 };
 
diff --git a/tools/testing/selftests/riscv/hwprobe/hwprobe.h b/tools/testing/selftests/riscv/hwprobe/hwprobe.h
index e3fccb390c4d..f3de970c3222 100644
--- a/tools/testing/selftests/riscv/hwprobe/hwprobe.h
+++ b/tools/testing/selftests/riscv/hwprobe/hwprobe.h
@@ -4,6 +4,16 @@
 #include <stddef.h>
 #include <asm/hwprobe.h>
 
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define le32_bswap(_x)				\
+	((((_x) & 0x000000ffU) << 24) |		\
+	 (((_x) & 0x0000ff00U) <<  8) |		\
+	 (((_x) & 0x00ff0000U) >>  8) |		\
+	 (((_x) & 0xff000000U) >> 24))
+#else
+# define le32_bswap(_x) (_x)
+#endif
+
 /*
  * Rather than relying on having a new enough libc to define this, just do it
  * ourselves.  This way we don't need to be coupled to a new-enough libc to
diff --git a/tools/testing/selftests/riscv/mm/mmap_bottomup.c b/tools/testing/selftests/riscv/mm/mmap_bottomup.c
index 1757d19ca89b..7f7d3eb8b9c9 100644
--- a/tools/testing/selftests/riscv/mm/mmap_bottomup.c
+++ b/tools/testing/selftests/riscv/mm/mmap_bottomup.c
@@ -6,30 +6,9 @@
 
 TEST(infinite_rlimit)
 {
-// Only works on 64 bit
-#if __riscv_xlen == 64
-	struct addresses mmap_addresses;
-
 	EXPECT_EQ(BOTTOM_UP, memory_layout());
 
-	do_mmaps(&mmap_addresses);
-
-	EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr);
-
-	EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint);
-	EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr);
-	EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr);
-	EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr);
-	EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr);
-	EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr);
-	EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr);
-#endif
+	TEST_MMAPS;
 }
 
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/riscv/mm/mmap_default.c b/tools/testing/selftests/riscv/mm/mmap_default.c
index c63c60b9397e..2ba3ec990006 100644
--- a/tools/testing/selftests/riscv/mm/mmap_default.c
+++ b/tools/testing/selftests/riscv/mm/mmap_default.c
@@ -6,30 +6,9 @@
 
 TEST(default_rlimit)
 {
-// Only works on 64 bit
-#if __riscv_xlen == 64
-	struct addresses mmap_addresses;
-
 	EXPECT_EQ(TOP_DOWN, memory_layout());
 
-	do_mmaps(&mmap_addresses);
-
-	EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr);
-	EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr);
-
-	EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint);
-	EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr);
-	EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr);
-	EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr);
-	EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr);
-	EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr);
-	EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr);
-#endif
+	TEST_MMAPS;
 }
 
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/riscv/mm/mmap_test.h b/tools/testing/selftests/riscv/mm/mmap_test.h
index 2e0db9c5be6c..3b29ca3bb3d4 100644
--- a/tools/testing/selftests/riscv/mm/mmap_test.h
+++ b/tools/testing/selftests/riscv/mm/mmap_test.h
@@ -4,63 +4,86 @@
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <stddef.h>
+#include <strings.h>
+#include "../../kselftest_harness.h"
 
 #define TOP_DOWN 0
 #define BOTTOM_UP 1
 
-struct addresses {
-	int *no_hint;
-	int *on_37_addr;
-	int *on_38_addr;
-	int *on_46_addr;
-	int *on_47_addr;
-	int *on_55_addr;
-	int *on_56_addr;
+#if __riscv_xlen == 64
+uint64_t random_addresses[] = {
+	0x19764f0d73b3a9f0, 0x016049584cecef59, 0x3580bdd3562f4acd,
+	0x1164219f20b17da0, 0x07d97fcb40ff2373, 0x76ec528921272ee7,
+	0x4dd48c38a3de3f70, 0x2e11415055f6997d, 0x14b43334ac476c02,
+	0x375a60795aff19f6, 0x47f3051725b8ee1a, 0x4e697cf240494a9f,
+	0x456b59b5c2f9e9d1, 0x101724379d63cb96, 0x7fe9ad31619528c1,
+	0x2f417247c495c2ea, 0x329a5a5b82943a5e, 0x06d7a9d6adcd3827,
+	0x327b0b9ee37f62d5, 0x17c7b1851dfd9b76, 0x006ebb6456ec2cd9,
+	0x00836cd14146a134, 0x00e5c4dcde7126db, 0x004c29feadf75753,
+	0x00d8b20149ed930c, 0x00d71574c269387a, 0x0006ebe4a82acb7a,
+	0x0016135df51f471b, 0x00758bdb55455160, 0x00d0bdd949b13b32,
+	0x00ecea01e7c5f54b, 0x00e37b071b9948b1, 0x0011fdd00ff57ab3,
+	0x00e407294b52f5ea, 0x00567748c200ed20, 0x000d073084651046,
+	0x00ac896f4365463c, 0x00eb0d49a0b26216, 0x0066a2564a982a31,
+	0x002e0d20237784ae, 0x0000554ff8a77a76, 0x00006ce07a54c012,
+	0x000009570516d799, 0x00000954ca15b84d, 0x0000684f0d453379,
+	0x00002ae5816302b5, 0x0000042403fb54bf, 0x00004bad7392bf30,
+	0x00003e73bfa4b5e3, 0x00005442c29978e0, 0x00002803f11286b6,
+	0x000073875d745fc6, 0x00007cede9cb8240, 0x000027df84cc6a4f,
+	0x00006d7e0e74242a, 0x00004afd0b836e02, 0x000047d0e837cd82,
+	0x00003b42405efeda, 0x00001531bafa4c95, 0x00007172cae34ac4,
+};
+#else
+uint32_t random_addresses[] = {
+	0x8dc302e0, 0x929ab1e0, 0xb47683ba, 0xea519c73, 0xa19f1c90, 0xc49ba213,
+	0x8f57c625, 0xadfe5137, 0x874d4d95, 0xaa20f09d, 0xcf21ebfc, 0xda7737f1,
+	0xcedf392a, 0x83026c14, 0xccedca52, 0xc6ccf826, 0xe0cd9415, 0x997472ca,
+	0xa21a44c1, 0xe82196f5, 0xa23fd66b, 0xc28d5590, 0xd009cdce, 0xcf0be646,
+	0x8fc8c7ff, 0xe2a85984, 0xa3d3236b, 0x89a0619d, 0xc03db924, 0xb5d4cc1b,
+	0xb96ee04c, 0xd191da48, 0xb432a000, 0xaa2bebbc, 0xa2fcb289, 0xb0cca89b,
+	0xb0c18d6a, 0x88f58deb, 0xa4d42d1c, 0xe4d74e86, 0x99902b09, 0x8f786d31,
+	0xbec5e381, 0x9a727e65, 0xa9a65040, 0xa880d789, 0x8f1b335e, 0xfc821c1e,
+	0x97e34be4, 0xbbef84ed, 0xf447d197, 0xfd7ceee2, 0xe632348d, 0xee4590f4,
+	0x958992a5, 0xd57e05d6, 0xfd240970, 0xc5b0dcff, 0xd96da2c2, 0xa7ae041d,
 };
+#endif
 
 // Only works on 64 bit
 #if __riscv_xlen == 64
-static inline void do_mmaps(struct addresses *mmap_addresses)
-{
-	/*
-	 * Place all of the hint addresses on the boundaries of mmap
-	 * sv39, sv48, sv57
-	 * User addresses end at 1<<38, 1<<47, 1<<56 respectively
-	 */
-	void *on_37_bits = (void *)(1UL << 37);
-	void *on_38_bits = (void *)(1UL << 38);
-	void *on_46_bits = (void *)(1UL << 46);
-	void *on_47_bits = (void *)(1UL << 47);
-	void *on_55_bits = (void *)(1UL << 55);
-	void *on_56_bits = (void *)(1UL << 56);
+#define PROT (PROT_READ | PROT_WRITE)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
 
-	int prot = PROT_READ | PROT_WRITE;
-	int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+/* mmap must return a value that doesn't use more bits than the hint address. */
+static inline unsigned long get_max_value(unsigned long input)
+{
+	unsigned long max_bit = (1UL << (((sizeof(unsigned long) * 8) - 1 -
+					  __builtin_clzl(input))));
 
-	mmap_addresses->no_hint =
-		mmap(NULL, 5 * sizeof(int), prot, flags, 0, 0);
-	mmap_addresses->on_37_addr =
-		mmap(on_37_bits, 5 * sizeof(int), prot, flags, 0, 0);
-	mmap_addresses->on_38_addr =
-		mmap(on_38_bits, 5 * sizeof(int), prot, flags, 0, 0);
-	mmap_addresses->on_46_addr =
-		mmap(on_46_bits, 5 * sizeof(int), prot, flags, 0, 0);
-	mmap_addresses->on_47_addr =
-		mmap(on_47_bits, 5 * sizeof(int), prot, flags, 0, 0);
-	mmap_addresses->on_55_addr =
-		mmap(on_55_bits, 5 * sizeof(int), prot, flags, 0, 0);
-	mmap_addresses->on_56_addr =
-		mmap(on_56_bits, 5 * sizeof(int), prot, flags, 0, 0);
+	return max_bit + (max_bit - 1);
 }
+
+#define TEST_MMAPS                                                            \
+	({                                                                    \
+		void *mmap_addr;                                              \
+		for (int i = 0; i < ARRAY_SIZE(random_addresses); i++) {      \
+			mmap_addr = mmap((void *)random_addresses[i],         \
+					 5 * sizeof(int), PROT, FLAGS, 0, 0); \
+			EXPECT_NE(MAP_FAILED, mmap_addr);                     \
+			EXPECT_GE((void *)get_max_value(random_addresses[i]), \
+				  mmap_addr);                                 \
+			mmap_addr = mmap((void *)random_addresses[i],         \
+					 5 * sizeof(int), PROT, FLAGS, 0, 0); \
+			EXPECT_NE(MAP_FAILED, mmap_addr);                     \
+			EXPECT_GE((void *)get_max_value(random_addresses[i]), \
+				  mmap_addr);                                 \
+		}                                                             \
+	})
 #endif /* __riscv_xlen == 64 */
 
 static inline int memory_layout(void)
 {
-	int prot = PROT_READ | PROT_WRITE;
-	int flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
-	void *value1 = mmap(NULL, sizeof(int), prot, flags, 0, 0);
-	void *value2 = mmap(NULL, sizeof(int), prot, flags, 0, 0);
+	void *value1 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0);
+	void *value2 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0);
 
 	return value2 > value1;
 }
diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings
index 6091b45d226b..a953c96aa16e 100644
--- a/tools/testing/selftests/seccomp/settings
+++ b/tools/testing/selftests/seccomp/settings
@@ -1 +1 @@
-timeout=120
+timeout=180
diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile
index 867f88ce2570..26ea30fae23c 100644
--- a/tools/testing/selftests/sgx/Makefile
+++ b/tools/testing/selftests/sgx/Makefile
@@ -12,7 +12,7 @@ OBJCOPY := $(CROSS_COMPILE)objcopy
 endif
 
 INCLUDES := -I$(top_srcdir)/tools/include
-HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC
+HOST_CFLAGS := -Wall -Werror $(KHDR_INCLUDES) -g $(INCLUDES) -fPIC
 HOST_LDFLAGS := -z noexecstack -lcrypto
 ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \
 	       -fno-stack-protector -mrdrnd $(INCLUDES)
diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c
index d73b29becf5b..200034a0fee5 100644
--- a/tools/testing/selftests/sgx/sigstruct.c
+++ b/tools/testing/selftests/sgx/sigstruct.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
-#define _GNU_SOURCE
 #include <assert.h>
 #include <getopt.h>
 #include <stdbool.h>
diff --git a/tools/testing/selftests/sync/sync_test.c b/tools/testing/selftests/sync/sync_test.c
index 414a617db993..93db5aa246a3 100644
--- a/tools/testing/selftests/sync/sync_test.c
+++ b/tools/testing/selftests/sync/sync_test.c
@@ -109,6 +109,5 @@ int main(void)
 		ksft_exit_fail_msg("%d out of %d sync tests failed\n",
 					err, ksft_test_num());
 
-	/* need this return to keep gcc happy */
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c
index b5d592d4099e..d975a6767329 100644
--- a/tools/testing/selftests/syscall_user_dispatch/sud_test.c
+++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c
@@ -158,6 +158,20 @@ static void handle_sigsys(int sig, siginfo_t *info, void *ucontext)
 
 	/* In preparation for sigreturn. */
 	SYSCALL_DISPATCH_OFF(glob_sel);
+
+	/*
+	 * The tests for argument handling assume that `syscall(x) == x`. This
+	 * is a NOP on x86 because the syscall number is passed in %rax, which
+	 * happens to also be the function ABI return register.  Other
+	 * architectures may need to swizzle the arguments around.
+	 */
+#if defined(__riscv)
+/* REG_A7 is not defined in libc headers */
+# define REG_A7 (REG_A0 + 7)
+
+	((ucontext_t *)ucontext)->uc_mcontext.__gregs[REG_A0] =
+			((ucontext_t *)ucontext)->uc_mcontext.__gregs[REG_A7];
+#endif
 }
 
 TEST(dispatch_and_return)
diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c
index 47e05fdc32c5..205b76a4abb4 100644
--- a/tools/testing/selftests/timers/adjtick.c
+++ b/tools/testing/selftests/timers/adjtick.c
@@ -205,7 +205,7 @@ int main(int argc, char **argv)
 	adjtimex(&tx1);
 
 	if (err)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
index 4332b494103d..ad52e608b88e 100644
--- a/tools/testing/selftests/timers/alarmtimer-suspend.c
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -173,6 +173,6 @@ int main(void)
 		timer_delete(tm1);
 	}
 	if (final_ret)
-		return ksft_exit_fail();
-	return ksft_exit_pass();
+		ksft_exit_fail();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c
index 992a77f2a74c..4421cd562c24 100644
--- a/tools/testing/selftests/timers/change_skew.c
+++ b/tools/testing/selftests/timers/change_skew.c
@@ -89,8 +89,8 @@ int main(int argc, char **argv)
 
 	if (ret) {
 		printf("[FAIL]");
-		return ksft_exit_fail();
+		ksft_exit_fail();
 	}
 	printf("[OK]");
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c
index 4b76450d78d1..73b636f89fdc 100644
--- a/tools/testing/selftests/timers/freq-step.c
+++ b/tools/testing/selftests/timers/freq-step.c
@@ -257,7 +257,7 @@ int main(int argc, char **argv)
 	set_frequency(0.0);
 
 	if (fails)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c
index 23eb398c8140..986abbdb1521 100644
--- a/tools/testing/selftests/timers/leap-a-day.c
+++ b/tools/testing/selftests/timers/leap-a-day.c
@@ -268,7 +268,7 @@ int main(int argc, char **argv)
 		if (ret < 0) {
 			printf("Error: Problem setting STA_INS/STA_DEL!: %s\n",
 							time_state_str(ret));
-			return ksft_exit_fail();
+			ksft_exit_fail();
 		}
 
 		/* Validate STA_INS was set */
@@ -277,7 +277,7 @@ int main(int argc, char **argv)
 		if (tx.status != STA_INS && tx.status != STA_DEL) {
 			printf("Error: STA_INS/STA_DEL not set!: %s\n",
 							time_state_str(ret));
-			return ksft_exit_fail();
+			ksft_exit_fail();
 		}
 
 		if (tai_time) {
@@ -295,7 +295,7 @@ int main(int argc, char **argv)
 		se.sigev_value.sival_int = 0;
 		if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) {
 			printf("Error: timer_create failed\n");
-			return ksft_exit_fail();
+			ksft_exit_fail();
 		}
 		its1.it_value.tv_sec = next_leap;
 		its1.it_value.tv_nsec = 0;
@@ -366,7 +366,7 @@ int main(int argc, char **argv)
 		if (error_found) {
 			printf("Errors observed\n");
 			clear_time_state();
-			return ksft_exit_fail();
+			ksft_exit_fail();
 		}
 		printf("\n");
 		if ((iterations != -1) && !(--iterations))
@@ -374,5 +374,5 @@ int main(int argc, char **argv)
 	}
 
 	clear_time_state();
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c
index f70802c5dd0d..8fd065eec904 100644
--- a/tools/testing/selftests/timers/leapcrash.c
+++ b/tools/testing/selftests/timers/leapcrash.c
@@ -87,7 +87,7 @@ int main(void)
 		tv.tv_usec = 0;
 		if (settimeofday(&tv, NULL)) {
 			printf("Error: You're likely not running with proper (ie: root) permissions\n");
-			return ksft_exit_fail();
+			ksft_exit_fail();
 		}
 		tx.modes = 0;
 		adjtimex(&tx);
@@ -104,5 +104,5 @@ int main(void)
 		fflush(stdout);
 	}
 	printf("[OK]\n");
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c
index 7916cf5cc6ff..f3179a605bba 100644
--- a/tools/testing/selftests/timers/mqueue-lat.c
+++ b/tools/testing/selftests/timers/mqueue-lat.c
@@ -107,8 +107,8 @@ int main(int argc, char **argv)
 	ret = mqueue_lat_test();
 	if (ret < 0) {
 		printf("[FAILED]\n");
-		return ksft_exit_fail();
+		ksft_exit_fail();
 	}
 	printf("[OK]\n");
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index d49dd3ffd0d9..07c81c0093c0 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -66,7 +66,7 @@ static int check_diff(struct timeval start, struct timeval end)
 	diff = end.tv_usec - start.tv_usec;
 	diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC;
 
-	if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) {
+	if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) {
 		printf("Diff too high: %lld..", diff);
 		return -1;
 	}
@@ -184,80 +184,71 @@ static int check_timer_create(int which)
 	return 0;
 }
 
-int remain;
-__thread int got_signal;
+static pthread_t ctd_thread;
+static volatile int ctd_count, ctd_failed;
 
-static void *distribution_thread(void *arg)
+static void ctd_sighandler(int sig)
 {
-	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
-	return NULL;
+	if (pthread_self() != ctd_thread)
+		ctd_failed = 1;
+	ctd_count--;
 }
 
-static void distribution_handler(int nr)
+static void *ctd_thread_func(void *arg)
 {
-	if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
-		__atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
-}
-
-/*
- * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID
- * timer signals. This primarily tests that the kernel does not favour any one.
- */
-static int check_timer_distribution(void)
-{
-	int err, i;
-	timer_t id;
-	const int nthreads = 10;
-	pthread_t threads[nthreads];
 	struct itimerspec val = {
 		.it_value.tv_sec = 0,
 		.it_value.tv_nsec = 1000 * 1000,
 		.it_interval.tv_sec = 0,
 		.it_interval.tv_nsec = 1000 * 1000,
 	};
+	timer_t id;
 
-	remain = nthreads + 1;  /* worker threads + this thread */
-	signal(SIGALRM, distribution_handler);
-	err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
-	if (err < 0) {
-		ksft_perror("Can't create timer");
-		return -1;
-	}
-	err = timer_settime(id, 0, &val, NULL);
-	if (err < 0) {
-		ksft_perror("Can't set timer");
-		return -1;
-	}
+	/* 1/10 seconds to ensure the leader sleeps */
+	usleep(10000);
 
-	for (i = 0; i < nthreads; i++) {
-		err = pthread_create(&threads[i], NULL, distribution_thread,
-				     NULL);
-		if (err) {
-			ksft_print_msg("Can't create thread: %s (%d)\n",
-				       strerror(errno), errno);
-			return -1;
-		}
-	}
+	ctd_count = 100;
+	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id))
+		return "Can't create timer\n";
+	if (timer_settime(id, 0, &val, NULL))
+		return "Can't set timer\n";
 
-	/* Wait for all threads to receive the signal. */
-	while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+	while (ctd_count > 0 && !ctd_failed)
+		;
 
-	for (i = 0; i < nthreads; i++) {
-		err = pthread_join(threads[i], NULL);
-		if (err) {
-			ksft_print_msg("Can't join thread: %s (%d)\n",
-				       strerror(errno), errno);
-			return -1;
-		}
-	}
+	if (timer_delete(id))
+		return "Can't delete timer\n";
 
-	if (timer_delete(id)) {
-		ksft_perror("Can't delete timer");
-		return -1;
-	}
+	return NULL;
+}
+
+/*
+ * Test that only the running thread receives the timer signal.
+ */
+static int check_timer_distribution(void)
+{
+	const char *errmsg;
 
-	ksft_test_result_pass("check_timer_distribution\n");
+	signal(SIGALRM, ctd_sighandler);
+
+	errmsg = "Can't create thread\n";
+	if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL))
+		goto err;
+
+	errmsg = "Can't join thread\n";
+	if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg)
+		goto err;
+
+	if (!ctd_failed)
+		ksft_test_result_pass("check signal distribution\n");
+	else if (ksft_min_kernel_version(6, 3))
+		ksft_test_result_fail("check signal distribution\n");
+	else
+		ksft_test_result_skip("check signal distribution (old kernel)\n");
 	return 0;
+err:
+	ksft_print_msg("%s", errmsg);
+	return -1;
 }
 
 int main(int argc, char **argv)
@@ -269,16 +260,16 @@ int main(int argc, char **argv)
 	ksft_print_msg("based timers if other threads run on the CPU...\n");
 
 	if (check_itimer(ITIMER_VIRTUAL) < 0)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	if (check_itimer(ITIMER_PROF) < 0)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	if (check_itimer(ITIMER_REAL) < 0)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	if (check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	/*
 	 * It's unfortunately hard to reliably test a timer expiration
@@ -290,10 +281,10 @@ int main(int argc, char **argv)
 	 * find a better solution.
 	 */
 	if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	if (check_timer_distribution() < 0)
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	ksft_finished();
 }
diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c
index 6eba203f9da7..030143eb09b4 100644
--- a/tools/testing/selftests/timers/raw_skew.c
+++ b/tools/testing/selftests/timers/raw_skew.c
@@ -137,11 +137,11 @@ int main(int argc, char **argv)
 		if (tx1.offset || tx2.offset ||
 		    tx1.freq != tx2.freq || tx1.tick != tx2.tick) {
 			printf("	[SKIP]\n");
-			return ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n");
+			ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n");
 		}
 		printf("	[FAILED]\n");
-		return ksft_exit_fail();
+		ksft_exit_fail();
 	}
 	printf("	[OK]\n");
-	return  ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c
index 688cfd81b531..f7d978721b9e 100644
--- a/tools/testing/selftests/timers/set-2038.c
+++ b/tools/testing/selftests/timers/set-2038.c
@@ -128,6 +128,6 @@ out:
 	/* restore clock */
 	settime(start);
 	if (ret)
-		return ksft_exit_fail();
-	return ksft_exit_pass();
+		ksft_exit_fail();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c
index 8c4179ee2ca2..5b67462efcd6 100644
--- a/tools/testing/selftests/timers/set-tai.c
+++ b/tools/testing/selftests/timers/set-tai.c
@@ -61,9 +61,9 @@ int main(int argc, char **argv)
 		ret = get_tai();
 		if (ret != i) {
 			printf("[FAILED] expected: %i got %i\n", i, ret);
-			return ksft_exit_fail();
+			ksft_exit_fail();
 		}
 	}
 	printf("[OK]\n");
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c
index 50da45437daa..7ce240c89b21 100644
--- a/tools/testing/selftests/timers/set-timer-lat.c
+++ b/tools/testing/selftests/timers/set-timer-lat.c
@@ -278,6 +278,6 @@ int main(void)
 		ret |= do_timer_oneshot(clock_id, 0);
 	}
 	if (ret)
-		return ksft_exit_fail();
-	return ksft_exit_pass();
+		ksft_exit_fail();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/set-tz.c b/tools/testing/selftests/timers/set-tz.c
index 62bd33eb16f0..20daaf1782b7 100644
--- a/tools/testing/selftests/timers/set-tz.c
+++ b/tools/testing/selftests/timers/set-tz.c
@@ -102,9 +102,9 @@ int main(int argc, char **argv)
 	printf("[OK]\n");
 
 	set_tz(min, dst);
-	return ksft_exit_pass();
+	ksft_exit_pass();
 
 err:
 	set_tz(min, dst);
-	return ksft_exit_fail();
+	ksft_exit_fail();
 }
diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c
index 63913f75b384..c8e6bffe4e0a 100644
--- a/tools/testing/selftests/timers/skew_consistency.c
+++ b/tools/testing/selftests/timers/skew_consistency.c
@@ -70,8 +70,8 @@ int main(int argc, char **argv)
 
 	if (ret) {
 		printf("[FAILED]\n");
-		return ksft_exit_fail();
+		ksft_exit_fail();
 	}
 	printf("[OK]\n");
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c
index 80aed4bf06fb..76b38e41d9c7 100644
--- a/tools/testing/selftests/timers/threadtest.c
+++ b/tools/testing/selftests/timers/threadtest.c
@@ -189,5 +189,5 @@ out:
 	/* die */
 	if (ret)
 		ksft_exit_fail();
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c
index 48b9a803235a..d500884801d8 100644
--- a/tools/testing/selftests/timers/valid-adjtimex.c
+++ b/tools/testing/selftests/timers/valid-adjtimex.c
@@ -21,9 +21,6 @@
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *   GNU General Public License for more details.
  */
-
-
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
@@ -62,45 +59,47 @@ int clear_time_state(void)
 #define NUM_FREQ_OUTOFRANGE 4
 #define NUM_FREQ_INVALID 2
 
+#define SHIFTED_PPM (1 << 16)
+
 long valid_freq[NUM_FREQ_VALID] = {
-	-499<<16,
-	-450<<16,
-	-400<<16,
-	-350<<16,
-	-300<<16,
-	-250<<16,
-	-200<<16,
-	-150<<16,
-	-100<<16,
-	-75<<16,
-	-50<<16,
-	-25<<16,
-	-10<<16,
-	-5<<16,
-	-1<<16,
+	 -499 * SHIFTED_PPM,
+	 -450 * SHIFTED_PPM,
+	 -400 * SHIFTED_PPM,
+	 -350 * SHIFTED_PPM,
+	 -300 * SHIFTED_PPM,
+	 -250 * SHIFTED_PPM,
+	 -200 * SHIFTED_PPM,
+	 -150 * SHIFTED_PPM,
+	 -100 * SHIFTED_PPM,
+	  -75 * SHIFTED_PPM,
+	  -50 * SHIFTED_PPM,
+	  -25 * SHIFTED_PPM,
+	  -10 * SHIFTED_PPM,
+	   -5 * SHIFTED_PPM,
+	   -1 * SHIFTED_PPM,
 	-1000,
-	1<<16,
-	5<<16,
-	10<<16,
-	25<<16,
-	50<<16,
-	75<<16,
-	100<<16,
-	150<<16,
-	200<<16,
-	250<<16,
-	300<<16,
-	350<<16,
-	400<<16,
-	450<<16,
-	499<<16,
+	    1 * SHIFTED_PPM,
+	    5 * SHIFTED_PPM,
+	   10 * SHIFTED_PPM,
+	   25 * SHIFTED_PPM,
+	   50 * SHIFTED_PPM,
+	   75 * SHIFTED_PPM,
+	  100 * SHIFTED_PPM,
+	  150 * SHIFTED_PPM,
+	  200 * SHIFTED_PPM,
+	  250 * SHIFTED_PPM,
+	  300 * SHIFTED_PPM,
+	  350 * SHIFTED_PPM,
+	  400 * SHIFTED_PPM,
+	  450 * SHIFTED_PPM,
+	  499 * SHIFTED_PPM,
 };
 
 long outofrange_freq[NUM_FREQ_OUTOFRANGE] = {
-	-1000<<16,
-	-550<<16,
-	550<<16,
-	1000<<16,
+	-1000 * SHIFTED_PPM,
+	 -550 * SHIFTED_PPM,
+	  550 * SHIFTED_PPM,
+	 1000 * SHIFTED_PPM,
 };
 
 #define LONG_MAX (~0UL>>1)
@@ -321,10 +320,10 @@ int validate_set_offset(void)
 int main(int argc, char **argv)
 {
 	if (validate_freq())
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
 	if (validate_set_offset())
-		return ksft_exit_fail();
+		ksft_exit_fail();
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/tty/tty_tstamp_update.c b/tools/testing/selftests/tty/tty_tstamp_update.c
index 0ee97943dccc..9e1a40f5db17 100644
--- a/tools/testing/selftests/tty/tty_tstamp_update.c
+++ b/tools/testing/selftests/tty/tty_tstamp_update.c
@@ -47,42 +47,60 @@ int main(int argc, char **argv)
 	int r;
 	char tty[PATH_MAX] = {};
 	struct stat st1, st2;
+	int result = KSFT_FAIL;
 
 	ksft_print_header();
 	ksft_set_plan(1);
 
 	r = readlink("/proc/self/fd/0", tty, PATH_MAX);
-	if (r < 0)
-		ksft_exit_fail_msg("readlink on /proc/self/fd/0 failed: %m\n");
+	if (r < 0) {
+		ksft_print_msg("readlink on /proc/self/fd/0 failed: %m\n");
+		goto out;
+	}
+
+	if (!tty_valid(tty)) {
+		ksft_print_msg("invalid tty path '%s'\n", tty);
+		result = KSFT_SKIP;
+		goto out;
 
-	if (!tty_valid(tty))
-		ksft_exit_skip("invalid tty path '%s'\n", tty);
+	}
 
 	r = stat(tty, &st1);
-	if (r < 0)
-		ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty);
+	if (r < 0) {
+		ksft_print_msg("stat failed on tty path '%s': %m\n", tty);
+		goto out;
+	}
 
 	/* We need to wait at least 8 seconds in order to observe timestamp change */
 	/* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fbf47635315ab308c9b58a1ea0906e711a9228de */
 	sleep(10);
 
 	r = write_dev_tty();
-	if (r < 0)
-		ksft_exit_fail_msg("failed to write to /dev/tty: %s\n",
-				   strerror(-r));
+	if (r < 0) {
+		ksft_print_msg("failed to write to /dev/tty: %s\n",
+			       strerror(-r));
+		goto out;
+	}
 
 	r = stat(tty, &st2);
-	if (r < 0)
-		ksft_exit_fail_msg("stat failed on tty path '%s': %m\n", tty);
+	if (r < 0) {
+		ksft_print_msg("stat failed on tty path '%s': %m\n", tty);
+		goto out;
+	}
 
 	/* We wrote to the terminal so timestamps should have been updated */
 	if (st1.st_atim.tv_sec == st2.st_atim.tv_sec &&
 	    st1.st_mtim.tv_sec == st2.st_mtim.tv_sec) {
-		ksft_test_result_fail("tty timestamps not updated\n");
-		ksft_exit_fail();
+		ksft_print_msg("tty timestamps not updated\n");
+		goto out;
 	}
 
-	ksft_test_result_pass(
+	ksft_print_msg(
 		"timestamps of terminal '%s' updated after write to /dev/tty\n", tty);
-	return EXIT_SUCCESS;
+	result = KSFT_PASS;
+
+out:
+	ksft_test_result_report(result, "tty_tstamp_update\n");
+
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py
new file mode 100755
index 000000000000..d9b042097da7
--- /dev/null
+++ b/tools/testing/selftests/turbostat/defcolumns.py
@@ -0,0 +1,60 @@
+#!/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+from shutil import which
+
+turbostat = which('turbostat')
+if turbostat is None:
+	print('Could not find turbostat binary')
+	exit(1)
+
+timeout = which('timeout')
+if timeout is None:
+	print('Could not find timeout binary')
+	exit(1)
+
+proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+
+#
+# By default --list reports also "usec" and "Time_Of_Day_Seconds" columns
+# which are only visible when running with --debug.
+#
+expected_columns_debug = proc_turbostat.stdout.replace(b',', b'\t').strip()
+expected_columns = expected_columns_debug.replace(b'usec\t', b'').replace(b'Time_Of_Day_Seconds\t', b'').replace(b'X2APIC\t', b'').replace(b'APIC\t', b'')
+
+#
+# Run turbostat with no options for 10 seconds and send SIGINT
+#
+timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '1s']
+turbostat_argv = [turbostat, '-i', '0.250']
+
+print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True)
+proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+actual_columns = proc_turbostat.stdout.split(b'\n')[0]
+if expected_columns != actual_columns:
+	print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}')
+	exit(1)
+print('OK')
+
+#
+# Same, but with --debug
+#
+turbostat_argv.append('--debug')
+
+print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True)
+proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+actual_columns = proc_turbostat.stdout.split(b'\n')[0]
+if expected_columns_debug != actual_columns:
+	print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}')
+	exit(1)
+print('OK')
diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c
index cef1ff1af223..7288a05136ba 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -16,6 +16,8 @@
 #include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <glob.h>
+#include <string.h>
 #include <asm/unistd.h>
 
 #include "../kselftest_harness.h"
@@ -23,6 +25,62 @@
 
 const char *data_file = "/sys/kernel/tracing/user_events_data";
 const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable";
+const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event.*";
+
+static int wait_for_delete(char *dir)
+{
+	struct stat buf;
+	int i;
+
+	for (i = 0; i < 10000; ++i) {
+		if (stat(dir, &buf) == -1 && errno == ENOENT)
+			return 0;
+
+		usleep(1000);
+	}
+
+	return -1;
+}
+
+static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len)
+{
+	char path[256];
+	glob_t buf;
+	int i, ret;
+
+	ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, &buf);
+
+	if (ret)
+		return -1;
+
+	ret = -1;
+
+	for (i = 0; i < buf.gl_pathc; ++i) {
+		FILE *fp;
+
+		snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]);
+		fp = fopen(path, "r");
+
+		if (!fp)
+			continue;
+
+		while (fgets(path, sizeof(path), fp) != NULL) {
+			if (strstr(path, unique_field)) {
+				fclose(fp);
+				/* strscpy is not available, use snprintf */
+				snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]);
+				ret = 0;
+				goto out;
+			}
+		}
+
+		fclose(fp);
+	}
+out:
+	globfree(&buf);
+
+	return ret;
+}
 
 static bool event_exists(void)
 {
@@ -74,6 +132,39 @@ static int event_delete(void)
 	return ret;
 }
 
+static int reg_enable_multi(void *enable, int size, int bit, int flags,
+			    char *args)
+{
+	struct user_reg reg = {0};
+	char full_args[512] = {0};
+	int fd = open(data_file, O_RDWR);
+	int len;
+	int ret;
+
+	if (fd < 0)
+		return -1;
+
+	len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args);
+
+	if (len > sizeof(full_args)) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	reg.size = sizeof(reg);
+	reg.name_args = (__u64)full_args;
+	reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags;
+	reg.enable_bit = bit;
+	reg.enable_addr = (__u64)enable;
+	reg.enable_size = size;
+
+	ret = ioctl(fd, DIAG_IOCSREG, &reg);
+out:
+	close(fd);
+
+	return ret;
+}
+
 static int reg_enable_flags(void *enable, int size, int bit, int flags)
 {
 	struct user_reg reg = {0};
@@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) {
 	ASSERT_NE(0, reg_enable(&self->check, 128, 0));
 }
 
+TEST_F(user, multi_format) {
+	char first_dir[256];
+	char second_dir[256];
+	struct stat buf;
+
+	/* Multiple formats for the same name should work */
+	ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 0,
+				      0, "u32 multi_first"));
+
+	ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 1,
+				      0, "u64 multi_second"));
+
+	/* Same name with same format should also work */
+	ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 2,
+				      0, "u64 multi_second"));
+
+	ASSERT_EQ(0, find_multi_event_dir("multi_first",
+					  first_dir, sizeof(first_dir)));
+
+	ASSERT_EQ(0, find_multi_event_dir("multi_second",
+					  second_dir, sizeof(second_dir)));
+
+	/* Should not be found in the same dir */
+	ASSERT_NE(0, strcmp(first_dir, second_dir));
+
+	/* First dir should still exist */
+	ASSERT_EQ(0, stat(first_dir, &buf));
+
+	/* Disabling first register should remove first dir */
+	ASSERT_EQ(0, reg_disable(&self->check, 0));
+	ASSERT_EQ(0, wait_for_delete(first_dir));
+
+	/* Second dir should still exist */
+	ASSERT_EQ(0, stat(second_dir, &buf));
+
+	/* Disabling second register should remove second dir */
+	ASSERT_EQ(0, reg_disable(&self->check, 1));
+	/* Ensure bit 1 and 2 are tied together, should not delete yet */
+	ASSERT_EQ(0, stat(second_dir, &buf));
+	ASSERT_EQ(0, reg_disable(&self->check, 2));
+	ASSERT_EQ(0, wait_for_delete(second_dir));
+}
+
 TEST_F(user, forks) {
 	int i;
 
diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h
index cdfed403ba13..7b543e7f04d7 100644
--- a/tools/testing/selftests/vDSO/vdso_config.h
+++ b/tools/testing/selftests/vDSO/vdso_config.h
@@ -53,15 +53,19 @@
 #if __riscv_xlen == 32
 #define VDSO_32BIT		1
 #endif
+#elif defined(__loongarch__)
+#define VDSO_VERSION		6
+#define VDSO_NAMES		1
 #endif
 
-static const char *versions[6] = {
+static const char *versions[7] = {
 	"LINUX_2.6",
 	"LINUX_2.6.15",
 	"LINUX_2.6.29",
 	"LINUX_2.6.39",
 	"LINUX_4",
 	"LINUX_4.15",
+	"LINUX_5.10"
 };
 
 static const char *names[2][6] = {
diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
index 1df5d057d79f..b758f68c6c9c 100644
--- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c
+++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
@@ -13,13 +13,7 @@
 
 #include "../kselftest.h"
 #include "parse_vdso.h"
-
-#if defined(__riscv)
-const char *version = "LINUX_4.15";
-#else
-const char *version = "LINUX_2.6";
-#endif
-const char *name = "__vdso_getcpu";
+#include "vdso_config.h"
 
 struct getcpu_cache;
 typedef long (*getcpu_t)(unsigned int *, unsigned int *,
@@ -27,6 +21,8 @@ typedef long (*getcpu_t)(unsigned int *, unsigned int *,
 
 int main(int argc, char **argv)
 {
+	const char *version = versions[VDSO_VERSION];
+	const char **name = (const char **)&names[VDSO_NAMES];
 	unsigned long sysinfo_ehdr;
 	unsigned int cpu, node;
 	getcpu_t get_cpu;
@@ -40,9 +36,9 @@ int main(int argc, char **argv)
 
 	vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR));
 
-	get_cpu = (getcpu_t)vdso_sym(version, name);
+	get_cpu = (getcpu_t)vdso_sym(version, name[4]);
 	if (!get_cpu) {
-		printf("Could not find %s\n", name);
+		printf("Could not find %s\n", name[4]);
 		return KSFT_SKIP;
 	}
 
@@ -50,7 +46,7 @@ int main(int argc, char **argv)
 	if (ret == 0) {
 		printf("Running on CPU %u node %u\n", cpu, node);
 	} else {
-		printf("%s failed\n", name);
+		printf("%s failed\n", name[4]);
 		return KSFT_FAIL;
 	}
 
diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
index e411f287a426..ee4f1ca56a71 100644
--- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
+++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
@@ -18,25 +18,13 @@
 
 #include "../kselftest.h"
 #include "parse_vdso.h"
-
-/*
- * ARM64's vDSO exports its gettimeofday() implementation with a different
- * name and version from other architectures, so we need to handle it as
- * a special case.
- */
-#if defined(__aarch64__)
-const char *version = "LINUX_2.6.39";
-const char *name = "__kernel_gettimeofday";
-#elif defined(__riscv)
-const char *version = "LINUX_4.15";
-const char *name = "__vdso_gettimeofday";
-#else
-const char *version = "LINUX_2.6";
-const char *name = "__vdso_gettimeofday";
-#endif
+#include "vdso_config.h"
 
 int main(int argc, char **argv)
 {
+	const char *version = versions[VDSO_VERSION];
+	const char **name = (const char **)&names[VDSO_NAMES];
+
 	unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR);
 	if (!sysinfo_ehdr) {
 		printf("AT_SYSINFO_EHDR is not present!\n");
@@ -47,10 +35,10 @@ int main(int argc, char **argv)
 
 	/* Find gettimeofday. */
 	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
-	gtod_t gtod = (gtod_t)vdso_sym(version, name);
+	gtod_t gtod = (gtod_t)vdso_sym(version, name[0]);
 
 	if (!gtod) {
-		printf("Could not find %s\n", name);
+		printf("Could not find %s\n", name[0]);
 		return KSFT_SKIP;
 	}
 
@@ -61,7 +49,7 @@ int main(int argc, char **argv)
 		printf("The time is %lld.%06lld\n",
 		       (long long)tv.tv_sec, (long long)tv.tv_usec);
 	} else {
-		printf("%s failed\n", name);
+		printf("%s failed\n", name[0]);
 		return KSFT_FAIL;
 	}
 
diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config
index 2fc36efb166d..66290cf289a9 100644
--- a/tools/testing/selftests/wireguard/qemu/arch/riscv32.config
+++ b/tools/testing/selftests/wireguard/qemu/arch/riscv32.config
@@ -2,7 +2,8 @@ CONFIG_NONPORTABLE=y
 CONFIG_ARCH_RV32I=y
 CONFIG_MMU=y
 CONFIG_FPU=y
-CONFIG_SOC_VIRT=y
+CONFIG_ARCH_VIRT=y
+CONFIG_RISCV_ISA_FALLBACK=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
diff --git a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config
index dc266f3b1915..db1aa9f388b9 100644
--- a/tools/testing/selftests/wireguard/qemu/arch/riscv64.config
+++ b/tools/testing/selftests/wireguard/qemu/arch/riscv64.config
@@ -1,7 +1,8 @@
 CONFIG_ARCH_RV64I=y
 CONFIG_MMU=y
 CONFIG_FPU=y
-CONFIG_SOC_VIRT=y
+CONFIG_ARCH_VIRT=y
+CONFIG_RISCV_ISA_FALLBACK=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c
index d884fd69dd51..95aad6d8849b 100644
--- a/tools/testing/selftests/x86/amx.c
+++ b/tools/testing/selftests/x86/amx.c
@@ -103,21 +103,6 @@ static void clearhandler(int sig)
 
 #define CPUID_LEAF1_ECX_XSAVE_MASK	(1 << 26)
 #define CPUID_LEAF1_ECX_OSXSAVE_MASK	(1 << 27)
-static inline void check_cpuid_xsave(void)
-{
-	uint32_t eax, ebx, ecx, edx;
-
-	/*
-	 * CPUID.1:ECX.XSAVE[bit 26] enumerates general
-	 * support for the XSAVE feature set, including
-	 * XGETBV.
-	 */
-	__cpuid_count(1, 0, eax, ebx, ecx, edx);
-	if (!(ecx & CPUID_LEAF1_ECX_XSAVE_MASK))
-		fatal_error("cpuid: no CPU xsave support");
-	if (!(ecx & CPUID_LEAF1_ECX_OSXSAVE_MASK))
-		fatal_error("cpuid: no OS xsave support");
-}
 
 static uint32_t xbuf_size;
 
@@ -350,6 +335,7 @@ enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED };
 
 /* arch_prctl() and sigaltstack() test */
 
+#define ARCH_GET_XCOMP_SUPP	0x1021
 #define ARCH_GET_XCOMP_PERM	0x1022
 #define ARCH_REQ_XCOMP_PERM	0x1023
 
@@ -928,8 +914,15 @@ static void test_ptrace(void)
 
 int main(void)
 {
-	/* Check hardware availability at first */
-	check_cpuid_xsave();
+	unsigned long features;
+	long rc;
+
+	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features);
+	if (rc || (features & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE) {
+		ksft_print_msg("no AMX support\n");
+		return KSFT_SKIP;
+	}
+
 	check_cpuid_xtiledata();
 
 	init_stashed_xsave();
diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c
index 215b8150b7cc..0ea4f6813930 100644
--- a/tools/testing/selftests/x86/lam.c
+++ b/tools/testing/selftests/x86/lam.c
@@ -1183,7 +1183,7 @@ int main(int argc, char **argv)
 
 	if (!cpu_has_lam()) {
 		ksft_print_msg("Unsupported LAM feature!\n");
-		return -1;
+		return KSFT_SKIP;
 	}
 
 	while ((c = getopt(argc, argv, "ht:")) != -1) {
@@ -1237,5 +1237,5 @@ int main(int argc, char **argv)
 
 	ksft_set_plan(tests_cnt);
 
-	return ksft_exit_pass();
+	ksft_exit_pass();
 }
diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c
index f0d876d48277..d53959e03593 100644
--- a/tools/testing/selftests/x86/test_mremap_vdso.c
+++ b/tools/testing/selftests/x86/test_mremap_vdso.c
@@ -19,6 +19,7 @@
 #include <sys/auxv.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>
+#include "../kselftest.h"
 
 #define PAGE_SIZE	4096
 
@@ -29,13 +30,13 @@ static int try_to_remap(void *vdso_addr, unsigned long size)
 	/* Searching for memory location where to remap */
 	dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 	if (dest_addr == MAP_FAILED) {
-		printf("[WARN]\tmmap failed (%d): %m\n", errno);
+		ksft_print_msg("WARN: mmap failed (%d): %m\n", errno);
 		return 0;
 	}
 
-	printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n",
-		vdso_addr, (unsigned long)vdso_addr + size,
-		dest_addr, (unsigned long)dest_addr + size);
+	ksft_print_msg("Moving vDSO: [%p, %#lx] -> [%p, %#lx]\n",
+		       vdso_addr, (unsigned long)vdso_addr + size,
+		       dest_addr, (unsigned long)dest_addr + size);
 	fflush(stdout);
 
 	new_addr = mremap(vdso_addr, size, size,
@@ -43,10 +44,10 @@ static int try_to_remap(void *vdso_addr, unsigned long size)
 	if ((unsigned long)new_addr == (unsigned long)-1) {
 		munmap(dest_addr, size);
 		if (errno == EINVAL) {
-			printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n");
+			ksft_print_msg("vDSO partial move failed, will try with bigger size\n");
 			return -1; /* Retry with larger */
 		}
-		printf("[FAIL]\tmremap failed (%d): %m\n", errno);
+		ksft_print_msg("[FAIL]\tmremap failed (%d): %m\n", errno);
 		return 1;
 	}
 
@@ -58,11 +59,12 @@ int main(int argc, char **argv, char **envp)
 {
 	pid_t child;
 
+	ksft_print_header();
+	ksft_set_plan(1);
+
 	child = fork();
-	if (child == -1) {
-		printf("[WARN]\tfailed to fork (%d): %m\n", errno);
-		return 1;
-	}
+	if (child == -1)
+		ksft_exit_fail_msg("failed to fork (%d): %m\n", errno);
 
 	if (child == 0) {
 		unsigned long vdso_size = PAGE_SIZE;
@@ -70,9 +72,9 @@ int main(int argc, char **argv, char **envp)
 		int ret = -1;
 
 		auxval = getauxval(AT_SYSINFO_EHDR);
-		printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval);
+		ksft_print_msg("AT_SYSINFO_EHDR is %#lx\n", auxval);
 		if (!auxval || auxval == -ENOENT) {
-			printf("[WARN]\tgetauxval failed\n");
+			ksft_print_msg("WARN: getauxval failed\n");
 			return 0;
 		}
 
@@ -92,16 +94,13 @@ int main(int argc, char **argv, char **envp)
 		int status;
 
 		if (waitpid(child, &status, 0) != child ||
-			!WIFEXITED(status)) {
-			printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n");
-			return 1;
-		} else if (WEXITSTATUS(status) != 0) {
-			printf("[FAIL]\tChild failed with %d\n",
-					WEXITSTATUS(status));
-			return 1;
-		}
-		printf("[OK]\n");
+			!WIFEXITED(status))
+			ksft_test_result_fail("mremap() of the vDSO does not work on this kernel!\n");
+		else if (WEXITSTATUS(status) != 0)
+			ksft_test_result_fail("Child failed with %d\n", WEXITSTATUS(status));
+		else
+			ksft_test_result_pass("%s\n", __func__);
 	}
 
-	return 0;
+	ksft_finished();
 }
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
index 47cab972807c..d4c8e8d79d38 100644
--- a/tools/testing/selftests/x86/test_vsyscall.c
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -21,6 +21,13 @@
 #include <sys/uio.h>
 
 #include "helpers.h"
+#include "../kselftest.h"
+
+#ifdef __x86_64__
+#define TOTAL_TESTS 13
+#else
+#define TOTAL_TESTS 8
+#endif
 
 #ifdef __x86_64__
 # define VSYS(x) (x)
@@ -39,18 +46,6 @@
 /* max length of lines in /proc/self/maps - anything longer is skipped here */
 #define MAPS_LINE_LEN 128
 
-static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
-		       int flags)
-{
-	struct sigaction sa;
-	memset(&sa, 0, sizeof(sa));
-	sa.sa_sigaction = handler;
-	sa.sa_flags = SA_SIGINFO | flags;
-	sigemptyset(&sa.sa_mask);
-	if (sigaction(sig, &sa, 0))
-		err(1, "sigaction");
-}
-
 /* vsyscalls and vDSO */
 bool vsyscall_map_r = false, vsyscall_map_x = false;
 
@@ -75,83 +70,25 @@ static void init_vdso(void)
 	if (!vdso)
 		vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
 	if (!vdso) {
-		printf("[WARN]\tfailed to find vDSO\n");
+		ksft_print_msg("[WARN] failed to find vDSO\n");
 		return;
 	}
 
 	vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
 	if (!vdso_gtod)
-		printf("[WARN]\tfailed to find gettimeofday in vDSO\n");
+		ksft_print_msg("[WARN] failed to find gettimeofday in vDSO\n");
 
 	vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
 	if (!vdso_gettime)
-		printf("[WARN]\tfailed to find clock_gettime in vDSO\n");
+		ksft_print_msg("[WARN] failed to find clock_gettime in vDSO\n");
 
 	vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
 	if (!vdso_time)
-		printf("[WARN]\tfailed to find time in vDSO\n");
+		ksft_print_msg("[WARN] failed to find time in vDSO\n");
 
 	vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
 	if (!vdso_getcpu)
-		printf("[WARN]\tfailed to find getcpu in vDSO\n");
-}
-
-static int init_vsys(void)
-{
-#ifdef __x86_64__
-	int nerrs = 0;
-	FILE *maps;
-	char line[MAPS_LINE_LEN];
-	bool found = false;
-
-	maps = fopen("/proc/self/maps", "r");
-	if (!maps) {
-		printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
-		vsyscall_map_r = true;
-		return 0;
-	}
-
-	while (fgets(line, MAPS_LINE_LEN, maps)) {
-		char r, x;
-		void *start, *end;
-		char name[MAPS_LINE_LEN];
-
-		/* sscanf() is safe here as strlen(name) >= strlen(line) */
-		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
-			   &start, &end, &r, &x, name) != 5)
-			continue;
-
-		if (strcmp(name, "[vsyscall]"))
-			continue;
-
-		printf("\tvsyscall map: %s", line);
-
-		if (start != (void *)0xffffffffff600000 ||
-		    end != (void *)0xffffffffff601000) {
-			printf("[FAIL]\taddress range is nonsense\n");
-			nerrs++;
-		}
-
-		printf("\tvsyscall permissions are %c-%c\n", r, x);
-		vsyscall_map_r = (r == 'r');
-		vsyscall_map_x = (x == 'x');
-
-		found = true;
-		break;
-	}
-
-	fclose(maps);
-
-	if (!found) {
-		printf("\tno vsyscall map in /proc/self/maps\n");
-		vsyscall_map_r = false;
-		vsyscall_map_x = false;
-	}
-
-	return nerrs;
-#else
-	return 0;
-#endif
+		ksft_print_msg("[WARN] failed to find getcpu in vDSO\n");
 }
 
 /* syscalls */
@@ -176,98 +113,76 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node,
 	return syscall(SYS_getcpu, cpu, node, cache);
 }
 
-static jmp_buf jmpbuf;
-static volatile unsigned long segv_err;
-
-static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
-{
-	ucontext_t *ctx = (ucontext_t *)ctx_void;
-
-	segv_err =  ctx->uc_mcontext.gregs[REG_ERR];
-	siglongjmp(jmpbuf, 1);
-}
-
 static double tv_diff(const struct timeval *a, const struct timeval *b)
 {
 	return (double)(a->tv_sec - b->tv_sec) +
 		(double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6;
 }
 
-static int check_gtod(const struct timeval *tv_sys1,
-		      const struct timeval *tv_sys2,
-		      const struct timezone *tz_sys,
-		      const char *which,
-		      const struct timeval *tv_other,
-		      const struct timezone *tz_other)
+static void check_gtod(const struct timeval *tv_sys1,
+		       const struct timeval *tv_sys2,
+		       const struct timezone *tz_sys,
+		       const char *which,
+		       const struct timeval *tv_other,
+		       const struct timezone *tz_other)
 {
-	int nerrs = 0;
 	double d1, d2;
 
-	if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
-		printf("[FAIL] %s tz mismatch\n", which);
-		nerrs++;
-	}
+	if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest ||
+			 tz_sys->tz_dsttime != tz_other->tz_dsttime))
+		ksft_print_msg("%s tz mismatch\n", which);
 
 	d1 = tv_diff(tv_other, tv_sys1);
 	d2 = tv_diff(tv_sys2, tv_other); 
-	printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
 
-	if (d1 < 0 || d2 < 0) {
-		printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
-		nerrs++;
-	} else {
-		printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
-	}
+	ksft_print_msg("%s time offsets: %lf %lf\n", which, d1, d2);
 
-	return nerrs;
+	ksft_test_result(!(d1 < 0 || d2 < 0), "%s gettimeofday()'s timeval\n", which);
 }
 
-static int test_gtod(void)
+static void test_gtod(void)
 {
 	struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys;
 	struct timezone tz_sys, tz_vdso, tz_vsys;
 	long ret_vdso = -1;
 	long ret_vsys = -1;
-	int nerrs = 0;
 
-	printf("[RUN]\ttest gettimeofday()\n");
+	ksft_print_msg("test gettimeofday()\n");
 
 	if (sys_gtod(&tv_sys1, &tz_sys) != 0)
-		err(1, "syscall gettimeofday");
+		ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno));
 	if (vdso_gtod)
 		ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
 	if (vsyscall_map_x)
 		ret_vsys = vgtod(&tv_vsys, &tz_vsys);
 	if (sys_gtod(&tv_sys2, &tz_sys) != 0)
-		err(1, "syscall gettimeofday");
+		ksft_exit_fail_msg("syscall gettimeofday: %s\n", strerror(errno));
 
 	if (vdso_gtod) {
-		if (ret_vdso == 0) {
-			nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
-		} else {
-			printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
-			nerrs++;
-		}
+		if (ret_vdso == 0)
+			check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
+		else
+			ksft_test_result_fail("vDSO gettimeofday() failed: %ld\n", ret_vdso);
+	} else {
+		ksft_test_result_skip("vdso_gtod isn't set\n");
 	}
 
 	if (vsyscall_map_x) {
-		if (ret_vsys == 0) {
-			nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
-		} else {
-			printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
-			nerrs++;
-		}
+		if (ret_vsys == 0)
+			check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
+		else
+			ksft_test_result_fail("vsys gettimeofday() failed: %ld\n", ret_vsys);
+	} else {
+		ksft_test_result_skip("vsyscall_map_x isn't set\n");
 	}
-
-	return nerrs;
 }
 
-static int test_time(void) {
-	int nerrs = 0;
-
-	printf("[RUN]\ttest time()\n");
+static void test_time(void)
+{
 	long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
 	long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
+
+	ksft_print_msg("test time()\n");
 	t_sys1 = sys_time(&t2_sys1);
 	if (vdso_time)
 		t_vdso = vdso_time(&t2_vdso);
@@ -275,56 +190,60 @@ static int test_time(void) {
 		t_vsys = vtime(&t2_vsys);
 	t_sys2 = sys_time(&t2_sys2);
 	if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
-		printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
-		nerrs++;
-		return nerrs;
+		ksft_print_msg("syscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n",
+			       t_sys1, t2_sys1, t_sys2, t2_sys2);
+		ksft_test_result_skip("vdso_time\n");
+		ksft_test_result_skip("vdso_time\n");
+		return;
 	}
 
 	if (vdso_time) {
-		if (t_vdso < 0 || t_vdso != t2_vdso) {
-			printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
-			nerrs++;
-		} else if (t_vdso < t_sys1 || t_vdso > t_sys2) {
-			printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
-			nerrs++;
-		} else {
-			printf("[OK]\tvDSO time() is okay\n");
-		}
+		if (t_vdso < 0 || t_vdso != t2_vdso)
+			ksft_test_result_fail("vDSO failed (ret:%ld output:%ld)\n",
+					      t_vdso, t2_vdso);
+		else if (t_vdso < t_sys1 || t_vdso > t_sys2)
+			ksft_test_result_fail("vDSO returned the wrong time (%ld %ld %ld)\n",
+					      t_sys1, t_vdso, t_sys2);
+		else
+			ksft_test_result_pass("vDSO time() is okay\n");
+	} else {
+		ksft_test_result_skip("vdso_time isn't set\n");
 	}
 
 	if (vsyscall_map_x) {
-		if (t_vsys < 0 || t_vsys != t2_vsys) {
-			printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
-			nerrs++;
-		} else if (t_vsys < t_sys1 || t_vsys > t_sys2) {
-			printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
-			nerrs++;
-		} else {
-			printf("[OK]\tvsyscall time() is okay\n");
-		}
+		if (t_vsys < 0 || t_vsys != t2_vsys)
+			ksft_test_result_fail("vsyscall failed (ret:%ld output:%ld)\n",
+					      t_vsys, t2_vsys);
+		else if (t_vsys < t_sys1 || t_vsys > t_sys2)
+			ksft_test_result_fail("vsyscall returned the wrong time (%ld %ld %ld)\n",
+					      t_sys1, t_vsys, t_sys2);
+		else
+			ksft_test_result_pass("vsyscall time() is okay\n");
+	} else {
+		ksft_test_result_skip("vsyscall_map_x isn't set\n");
 	}
-
-	return nerrs;
 }
 
-static int test_getcpu(int cpu)
+static void test_getcpu(int cpu)
 {
-	int nerrs = 0;
+	unsigned int cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
 	long ret_sys, ret_vdso = -1, ret_vsys = -1;
+	unsigned int node = 0;
+	bool have_node = false;
+	cpu_set_t cpuset;
 
-	printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
+	ksft_print_msg("getcpu() on CPU %d\n", cpu);
 
-	cpu_set_t cpuset;
 	CPU_ZERO(&cpuset);
 	CPU_SET(cpu, &cpuset);
 	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
-		printf("[SKIP]\tfailed to force CPU %d\n", cpu);
-		return nerrs;
+		ksft_print_msg("failed to force CPU %d\n", cpu);
+		ksft_test_result_skip("vdso_getcpu\n");
+		ksft_test_result_skip("vsyscall_map_x\n");
+
+		return;
 	}
 
-	unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
-	unsigned node = 0;
-	bool have_node = false;
 	ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
 	if (vdso_getcpu)
 		ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
@@ -332,10 +251,9 @@ static int test_getcpu(int cpu)
 		ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
 
 	if (ret_sys == 0) {
-		if (cpu_sys != cpu) {
-			printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu);
-			nerrs++;
-		}
+		if (cpu_sys != cpu)
+			ksft_print_msg("syscall reported CPU %hu but should be %d\n",
+				       cpu_sys, cpu);
 
 		have_node = true;
 		node = node_sys;
@@ -343,63 +261,84 @@ static int test_getcpu(int cpu)
 
 	if (vdso_getcpu) {
 		if (ret_vdso) {
-			printf("[FAIL]\tvDSO getcpu() failed\n");
-			nerrs++;
+			ksft_test_result_fail("vDSO getcpu() failed\n");
 		} else {
 			if (!have_node) {
 				have_node = true;
 				node = node_vdso;
 			}
 
-			if (cpu_vdso != cpu) {
-				printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu);
-				nerrs++;
-			} else {
-				printf("[OK]\tvDSO reported correct CPU\n");
-			}
-
-			if (node_vdso != node) {
-				printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node);
-				nerrs++;
+			if (cpu_vdso != cpu || node_vdso != node) {
+				if (cpu_vdso != cpu)
+					ksft_print_msg("vDSO reported CPU %hu but should be %d\n",
+						       cpu_vdso, cpu);
+				if (node_vdso != node)
+					ksft_print_msg("vDSO reported node %hu but should be %hu\n",
+						       node_vdso, node);
+				ksft_test_result_fail("Wrong values\n");
 			} else {
-				printf("[OK]\tvDSO reported correct node\n");
+				ksft_test_result_pass("vDSO reported correct CPU and node\n");
 			}
 		}
+	} else {
+		ksft_test_result_skip("vdso_getcpu isn't set\n");
 	}
 
 	if (vsyscall_map_x) {
 		if (ret_vsys) {
-			printf("[FAIL]\tvsyscall getcpu() failed\n");
-			nerrs++;
+			ksft_test_result_fail("vsyscall getcpu() failed\n");
 		} else {
 			if (!have_node) {
 				have_node = true;
 				node = node_vsys;
 			}
 
-			if (cpu_vsys != cpu) {
-				printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu);
-				nerrs++;
+			if (cpu_vsys != cpu || node_vsys != node) {
+				if (cpu_vsys != cpu)
+					ksft_print_msg("vsyscall reported CPU %hu but should be %d\n",
+						       cpu_vsys, cpu);
+				if (node_vsys != node)
+					ksft_print_msg("vsyscall reported node %hu but should be %hu\n",
+						       node_vsys, node);
+				ksft_test_result_fail("Wrong values\n");
 			} else {
-				printf("[OK]\tvsyscall reported correct CPU\n");
-			}
-
-			if (node_vsys != node) {
-				printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node);
-				nerrs++;
-			} else {
-				printf("[OK]\tvsyscall reported correct node\n");
+				ksft_test_result_pass("vsyscall reported correct CPU and node\n");
 			}
 		}
+	} else {
+		ksft_test_result_skip("vsyscall_map_x isn't set\n");
 	}
+}
+
+#ifdef __x86_64__
+
+static jmp_buf jmpbuf;
+static volatile unsigned long segv_err;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
 
-	return nerrs;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		ksft_exit_fail_msg("sigaction failed\n");
 }
 
-static int test_vsys_r(void)
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
 {
-#ifdef __x86_64__
-	printf("[RUN]\tChecking read access to the vsyscall page\n");
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+
+	segv_err =  ctx->uc_mcontext.gregs[REG_ERR];
+	siglongjmp(jmpbuf, 1);
+}
+
+static void test_vsys_r(void)
+{
+	ksft_print_msg("Checking read access to the vsyscall page\n");
 	bool can_read;
 	if (sigsetjmp(jmpbuf, 1) == 0) {
 		*(volatile int *)0xffffffffff600000;
@@ -408,32 +347,25 @@ static int test_vsys_r(void)
 		can_read = false;
 	}
 
-	if (can_read && !vsyscall_map_r) {
-		printf("[FAIL]\tWe have read access, but we shouldn't\n");
-		return 1;
-	} else if (!can_read && vsyscall_map_r) {
-		printf("[FAIL]\tWe don't have read access, but we should\n");
-		return 1;
-	} else if (can_read) {
-		printf("[OK]\tWe have read access\n");
-	} else {
-		printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
-		       segv_err);
-	}
-#endif
-
-	return 0;
+	if (can_read && !vsyscall_map_r)
+		ksft_test_result_fail("We have read access, but we shouldn't\n");
+	else if (!can_read && vsyscall_map_r)
+		ksft_test_result_fail("We don't have read access, but we should\n");
+	else if (can_read)
+		ksft_test_result_pass("We have read access\n");
+	else
+		ksft_test_result_pass("We do not have read access: #PF(0x%lx)\n", segv_err);
 }
 
-static int test_vsys_x(void)
+static void test_vsys_x(void)
 {
-#ifdef __x86_64__
 	if (vsyscall_map_x) {
 		/* We already tested this adequately. */
-		return 0;
+		ksft_test_result_pass("vsyscall_map_x is true\n");
+		return;
 	}
 
-	printf("[RUN]\tMake sure that vsyscalls really page fault\n");
+	ksft_print_msg("Make sure that vsyscalls really page fault\n");
 
 	bool can_exec;
 	if (sigsetjmp(jmpbuf, 1) == 0) {
@@ -443,20 +375,14 @@ static int test_vsys_x(void)
 		can_exec = false;
 	}
 
-	if (can_exec) {
-		printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
-		return 1;
-	} else if (segv_err & (1 << 4)) { /* INSTR */
-		printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
-		       segv_err);
-	} else {
-		printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n",
-		       segv_err);
-		return 1;
-	}
-#endif
-
-	return 0;
+	if (can_exec)
+		ksft_test_result_fail("Executing the vsyscall did not page fault\n");
+	else if (segv_err & (1 << 4)) /* INSTR */
+		ksft_test_result_pass("Executing the vsyscall page failed: #PF(0x%lx)\n",
+				      segv_err);
+	else
+		ksft_test_result_fail("Execution failed with the wrong error: #PF(0x%lx)\n",
+				      segv_err);
 }
 
 /*
@@ -470,14 +396,13 @@ static int test_vsys_x(void)
  * fact that ptrace() ever worked was a nice courtesy of old kernels,
  * but the code to support it is fairly gross.
  */
-static int test_process_vm_readv(void)
+static void test_process_vm_readv(void)
 {
-#ifdef __x86_64__
 	char buf[4096];
 	struct iovec local, remote;
 	int ret;
 
-	printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");
+	ksft_print_msg("process_vm_readv() from vsyscall page\n");
 
 	local.iov_base = buf;
 	local.iov_len = 4096;
@@ -489,27 +414,71 @@ static int test_process_vm_readv(void)
 		 * We expect process_vm_readv() to work if and only if the
 		 * vsyscall page is readable.
 		 */
-		printf("[%s]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", vsyscall_map_r ? "FAIL" : "OK", ret, errno);
-		return vsyscall_map_r ? 1 : 0;
+		ksft_test_result(!vsyscall_map_r,
+				 "process_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno);
+		return;
 	}
 
-	if (vsyscall_map_r) {
-		if (!memcmp(buf, remote.iov_base, sizeof(buf))) {
-			printf("[OK]\tIt worked and read correct data\n");
-		} else {
-			printf("[FAIL]\tIt worked but returned incorrect data\n");
-			return 1;
+	if (vsyscall_map_r)
+		ksft_test_result(!memcmp(buf, remote.iov_base, sizeof(buf)), "Read data\n");
+	else
+		ksft_test_result_fail("process_rm_readv() succeeded, but it should have failed in this configuration\n");
+}
+
+static void init_vsys(void)
+{
+	int nerrs = 0;
+	FILE *maps;
+	char line[MAPS_LINE_LEN];
+	bool found = false;
+
+	maps = fopen("/proc/self/maps", "r");
+	if (!maps) {
+		ksft_test_result_skip("Could not open /proc/self/maps -- assuming vsyscall is r-x\n");
+		vsyscall_map_r = true;
+		return;
+	}
+
+	while (fgets(line, MAPS_LINE_LEN, maps)) {
+		char r, x;
+		void *start, *end;
+		char name[MAPS_LINE_LEN];
+
+		/* sscanf() is safe here as strlen(name) >= strlen(line) */
+		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+			   &start, &end, &r, &x, name) != 5)
+			continue;
+
+		if (strcmp(name, "[vsyscall]"))
+			continue;
+
+		ksft_print_msg("vsyscall map: %s", line);
+
+		if (start != (void *)0xffffffffff600000 ||
+		    end != (void *)0xffffffffff601000) {
+			ksft_print_msg("address range is nonsense\n");
+			nerrs++;
 		}
-	} else {
-		printf("[FAIL]\tprocess_rm_readv() succeeded, but it should have failed in this configuration\n");
-		return 1;
+
+		ksft_print_msg("vsyscall permissions are %c-%c\n", r, x);
+		vsyscall_map_r = (r == 'r');
+		vsyscall_map_x = (x == 'x');
+
+		found = true;
+		break;
 	}
-#endif
 
-	return 0;
+	fclose(maps);
+
+	if (!found) {
+		ksft_print_msg("no vsyscall map in /proc/self/maps\n");
+		vsyscall_map_r = false;
+		vsyscall_map_x = false;
+	}
+
+	ksft_test_result(!nerrs, "vsyscall map\n");
 }
 
-#ifdef __x86_64__
 static volatile sig_atomic_t num_vsyscall_traps;
 
 static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
@@ -521,15 +490,17 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 		num_vsyscall_traps++;
 }
 
-static int test_emulation(void)
+static void test_emulation(void)
 {
 	time_t tmp;
 	bool is_native;
 
-	if (!vsyscall_map_x)
-		return 0;
+	if (!vsyscall_map_x) {
+		ksft_test_result_skip("vsyscall_map_x isn't set\n");
+		return;
+	}
 
-	printf("[RUN]\tchecking that vsyscalls are emulated\n");
+	ksft_print_msg("checking that vsyscalls are emulated\n");
 	sethandler(SIGTRAP, sigtrap, 0);
 	set_eflags(get_eflags() | X86_EFLAGS_TF);
 	vtime(&tmp);
@@ -545,36 +516,35 @@ static int test_emulation(void)
 	 */
 	is_native = (num_vsyscall_traps > 1);
 
-	printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n",
-	       (is_native ? "FAIL" : "OK"),
-	       (is_native ? "native" : "emulated"),
-	       (int)num_vsyscall_traps);
-
-	return is_native;
+	ksft_test_result(!is_native, "vsyscalls are %s (%d instructions in vsyscall page)\n",
+			 (is_native ? "native" : "emulated"), (int)num_vsyscall_traps);
 }
 #endif
 
 int main(int argc, char **argv)
 {
-	int nerrs = 0;
+	int total_tests = TOTAL_TESTS;
 
-	init_vdso();
-	nerrs += init_vsys();
+	ksft_print_header();
+	ksft_set_plan(total_tests);
 
-	nerrs += test_gtod();
-	nerrs += test_time();
-	nerrs += test_getcpu(0);
-	nerrs += test_getcpu(1);
-
-	sethandler(SIGSEGV, sigsegv, 0);
-	nerrs += test_vsys_r();
-	nerrs += test_vsys_x();
+	init_vdso();
+#ifdef __x86_64__
+	init_vsys();
+#endif
 
-	nerrs += test_process_vm_readv();
+	test_gtod();
+	test_time();
+	test_getcpu(0);
+	test_getcpu(1);
 
 #ifdef __x86_64__
-	nerrs += test_emulation();
+	sethandler(SIGSEGV, sigsegv, 0);
+	test_vsys_r();
+	test_vsys_x();
+	test_process_vm_readv();
+	test_emulation();
 #endif
 
-	return nerrs ? 1 : 0;
+	ksft_finished();
 }
diff --git a/tools/tracing/latency/.gitignore b/tools/tracing/latency/.gitignore
index 0863960761e7..2bb8e60f7fdd 100644
--- a/tools/tracing/latency/.gitignore
+++ b/tools/tracing/latency/.gitignore
@@ -1,2 +1,5 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0-only
 latency-collector
+fixdep
+feature
+FEATURE-DUMP
diff --git a/tools/tracing/latency/Build b/tools/tracing/latency/Build
new file mode 100644
index 000000000000..0ce65ea72bf9
--- /dev/null
+++ b/tools/tracing/latency/Build
@@ -0,0 +1 @@
+latency-collector-y += latency-collector.o
diff --git a/tools/tracing/latency/Makefile b/tools/tracing/latency/Makefile
index 40c4ddaf8be1..6518b03e05c7 100644
--- a/tools/tracing/latency/Makefile
+++ b/tools/tracing/latency/Makefile
@@ -1,24 +1,93 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for vm tools
-#
-VAR_CFLAGS := $(shell pkg-config --cflags libtracefs 2>/dev/null)
-VAR_LDLIBS := $(shell pkg-config --libs libtracefs 2>/dev/null)
+# SPDX-License-Identifier: GPL-2.0-only
 
-TARGETS = latency-collector
-CFLAGS = -Wall -Wextra -g -O2 $(VAR_CFLAGS)
-LDFLAGS = -lpthread $(VAR_LDLIBS)
+ifeq ($(srctree),)
+  srctree	:= $(patsubst %/,%,$(dir $(CURDIR)))
+  srctree	:= $(patsubst %/,%,$(dir $(srctree)))
+  srctree	:= $(patsubst %/,%,$(dir $(srctree)))
+endif
 
-all: $(TARGETS)
+include $(srctree)/tools/scripts/Makefile.include
 
-%: %.c
-	$(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+# O is an alias for OUTPUT
+OUTPUT		:= $(O)
 
-clean:
-	$(RM) latency-collector
+ifeq ($(OUTPUT),)
+  OUTPUT	:= $(CURDIR)
+else
+  # subdir is used by the ../Makefile in $(call descend,)
+  ifneq ($(subdir),)
+    OUTPUT	:= $(OUTPUT)/$(subdir)
+  endif
+endif
+
+ifneq ($(patsubst %/,,$(lastword $(OUTPUT))),)
+  OUTPUT	:= $(OUTPUT)/
+endif
+
+LATENCY-COLLECTOR	:= $(OUTPUT)latency-collector
+LATENCY-COLLECTOR_IN	:= $(LATENCY-COLLECTOR)-in.o
+
+export CC	:= gcc
+export LD	:= ld
+export AR	:= ar
+export PKG_CONFIG := pkg-config
+
+FEATURE_TESTS	:= libtraceevent
+FEATURE_TESTS	+= libtracefs
+FEATURE_DISPLAY	:= libtraceevent
+FEATURE_DISPLAY	+= libtracefs
+
+ifeq ($(V),1)
+  Q 		=
+else
+  Q 		= @
+endif
+
+all: $(LATENCY-COLLECTOR)
+
+include $(srctree)/tools/build/Makefile.include
+
+# check for dependencies only on required targets
+NON_CONFIG_TARGETS := clean install
 
-prefix ?= /usr/local
-sbindir ?= ${prefix}/sbin
+config		:= 1
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
+ config		:= 0
+endif
+endif
 
-install: all
-	install -d $(DESTDIR)$(sbindir)
-	install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir)
+ifeq ($(config),1)
+  include $(srctree)/tools/build/Makefile.feature
+  include Makefile.config
+endif
+
+CFLAGS		+= $(INCLUDES) $(LIB_INCLUDES)
+
+export CFLAGS OUTPUT srctree
+
+$(LATENCY-COLLECTOR): $(LATENCY-COLLECTOR_IN)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $(LATENCY-COLLECTOR) $(LATENCY-COLLECTOR_IN) $(EXTLIBS)
+
+latency-collector.%: fixdep FORCE
+	make -f $(srctree)/tools/build/Makefile.build dir=. $@
+
+$(LATENCY-COLLECTOR_IN): fixdep FORCE
+	make $(build)=latency-collector
+
+INSTALL		:= install
+MKDIR		:= mkdir
+STRIP		:= strip
+BINDIR		:= /usr/bin
+
+install:
+	@$(MKDIR) -p $(DESTDIR)$(BINDIR)
+	$(call QUIET_INSTALL,latency-collector)$(INSTALL) $(LATENCY-COLLECTOR) -m 755 $(DESTDIR)$(BINDIR)
+	@$(STRIP) $(DESTDIR)$(BINDIR)/latency-collector
+
+clean:
+	$(call QUIET_CLEAN, latency-collector)
+	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+	$(Q)@rm -f latency-collector fixdep FEATURE-DUMP
+	$(Q)rm -rf feature
+.PHONY: FORCE clean install
diff --git a/tools/tracing/latency/Makefile.config b/tools/tracing/latency/Makefile.config
new file mode 100644
index 000000000000..b25e531a1f95
--- /dev/null
+++ b/tools/tracing/latency/Makefile.config
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+STOP_ERROR :=
+
+define lib_setup
+  $(eval EXTLIBS += -l$(1))
+  $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
+endef
+
+$(call feature_check,libtraceevent)
+ifeq ($(feature-libtraceevent), 1)
+  $(call detected,CONFIG_LIBTRACEEVENT)
+  $(call lib_setup,traceevent)
+else
+  STOP_ERROR := 1
+  $(info libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel)
+endif
+
+$(call feature_check,libtracefs)
+ifeq ($(feature-libtracefs), 1)
+  $(call detected,CONFIG_LIBTRACEFS)
+  $(call lib_setup,tracefs)
+else
+  STOP_ERROR := 1
+  $(info libtracefs is missing. Please install libtracefs-dev/libtracefs-devel)
+endif
+
+ifeq ($(STOP_ERROR),1)
+  $(error Please, check the errors above.)
+endif
diff --git a/tools/tracing/rtla/.gitignore b/tools/tracing/rtla/.gitignore
index e9df32419b2b..293f0dbb0ca2 100644
--- a/tools/tracing/rtla/.gitignore
+++ b/tools/tracing/rtla/.gitignore
@@ -1 +1,6 @@
-/rtla
+# SPDX-License-Identifier: GPL-2.0-only
+rtla
+rtla-static
+fixdep
+feature
+FEATURE-DUMP
diff --git a/tools/tracing/rtla/Build b/tools/tracing/rtla/Build
new file mode 100644
index 000000000000..6c9d5b36a315
--- /dev/null
+++ b/tools/tracing/rtla/Build
@@ -0,0 +1 @@
+rtla-y += src/
diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index afd18c678ff5..b5878be36125 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -1,157 +1,86 @@
-NAME	:=	rtla
-# Follow the kernel version
-VERSION :=	$(shell cat VERSION 2> /dev/null || make -sC ../../.. kernelversion | grep -v make)
-
-# From libtracefs:
-# Makefiles suck: This macro sets a default value of $(2) for the
-# variable named by $(1), unless the variable has been set by
-# environment or command line. This is necessary for CC and AR
-# because make sets default values, so the simpler ?= approach
-# won't work as expected.
-define allow-override
-  $(if $(or $(findstring environment,$(origin $(1))),\
-            $(findstring command line,$(origin $(1)))),,\
-    $(eval $(1) = $(2)))
-endef
-
-# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
-$(call allow-override,CC,$(CROSS_COMPILE)gcc)
-$(call allow-override,AR,$(CROSS_COMPILE)ar)
-$(call allow-override,STRIP,$(CROSS_COMPILE)strip)
-$(call allow-override,PKG_CONFIG,pkg-config)
-$(call allow-override,LD_SO_CONF_PATH,/etc/ld.so.conf.d/)
-$(call allow-override,LDCONFIG,ldconfig)
-
-INSTALL	=	install
-MKDIR	=	mkdir
-FOPTS	:=	-flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong \
-		-fasynchronous-unwind-tables -fstack-clash-protection
-WOPTS	:= 	-Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized
-
-ifeq ($(CC),clang)
-  FOPTS := $(filter-out -ffat-lto-objects, $(FOPTS))
-  WOPTS := $(filter-out -Wno-maybe-uninitialized, $(WOPTS))
+# SPDX-License-Identifier: GPL-2.0-only
+
+ifeq ($(srctree),)
+  srctree	:= $(patsubst %/,%,$(dir $(CURDIR)))
+  srctree	:= $(patsubst %/,%,$(dir $(srctree)))
+  srctree	:= $(patsubst %/,%,$(dir $(srctree)))
 endif
 
-TRACEFS_HEADERS	:= $$($(PKG_CONFIG) --cflags libtracefs)
-
-CFLAGS	:=	-O -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(MOPTS) $(WOPTS) $(TRACEFS_HEADERS) $(EXTRA_CFLAGS)
-LDFLAGS	:=	-flto=auto -ggdb $(EXTRA_LDFLAGS)
-LIBS	:=	$$($(PKG_CONFIG) --libs libtracefs)
-
-SRC	:=	$(wildcard src/*.c)
-HDR	:=	$(wildcard src/*.h)
-OBJ	:=	$(SRC:.c=.o)
-DIRS	:=	src
-FILES	:=	Makefile README.txt
-CEXT	:=	bz2
-TARBALL	:=	$(NAME)-$(VERSION).tar.$(CEXT)
-TAROPTS	:=	-cvjf $(TARBALL)
-BINDIR	:=	/usr/bin
-DATADIR	:=	/usr/share
-DOCDIR	:=	$(DATADIR)/doc
-MANDIR	:=	$(DATADIR)/man
-LICDIR	:=	$(DATADIR)/licenses
-SRCTREE	:=	$(or $(BUILD_SRC),$(CURDIR))
-
-# If running from the tarball, man pages are stored in the Documentation
-# dir. If running from the kernel source, man pages are stored in
-# Documentation/tools/rtla/.
-ifneq ($(wildcard Documentation/.*),)
-DOCSRC	=	Documentation/
+include $(srctree)/tools/scripts/Makefile.include
+
+# O is an alias for OUTPUT
+OUTPUT          := $(O)
+
+ifeq ($(OUTPUT),)
+  OUTPUT        := $(CURDIR)
 else
-DOCSRC	=	$(SRCTREE)/../../../Documentation/tools/rtla/
+  # subdir is used by the ../Makefile in $(call descend,)
+  ifneq ($(subdir),)
+    OUTPUT        := $(OUTPUT)/$(subdir)
+  endif
+endif
+
+ifneq ($(patsubst %/,,$(lastword $(OUTPUT))),)
+  OUTPUT        := $(OUTPUT)/
 endif
 
-LIBTRACEEVENT_MIN_VERSION = 1.5
-LIBTRACEFS_MIN_VERSION = 1.3
+RTLA		:= $(OUTPUT)rtla
+RTLA_IN		:= $(RTLA)-in.o
 
-.PHONY:	all warnings show_warnings
-all:	warnings rtla
+VERSION		:= $(shell sh -c "make -sC ../../.. kernelversion | grep -v make")
+DOCSRC		:= ../../../Documentation/tools/rtla/
 
-TEST_LIBTRACEEVENT = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEEVENT_MIN_VERSION) libtraceevent > /dev/null 2>&1 || echo n")
-ifeq ("$(TEST_LIBTRACEEVENT)", "n")
-WARNINGS = show_warnings
-MISSING_LIBS += echo "**   libtraceevent version $(LIBTRACEEVENT_MIN_VERSION) or higher";
-MISSING_PACKAGES += "libtraceevent-devel"
-MISSING_SOURCE += echo "**  https://git.kernel.org/pub/scm/libs/libtrace/libtraceevent.git/ ";
+FEATURE_TESTS	:= libtraceevent
+FEATURE_TESTS	+= libtracefs
+FEATURE_DISPLAY	:= libtraceevent
+FEATURE_DISPLAY	+= libtracefs
+
+ifeq ($(V),1)
+  Q		=
+else
+  Q		= @
 endif
 
-TEST_LIBTRACEFS = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEFS_MIN_VERSION) libtracefs > /dev/null 2>&1 || echo n")
-ifeq ("$(TEST_LIBTRACEFS)", "n")
-WARNINGS = show_warnings
-MISSING_LIBS += echo "**   libtracefs version $(LIBTRACEFS_MIN_VERSION) or higher";
-MISSING_PACKAGES += "libtracefs-devel"
-MISSING_SOURCE += echo "**  https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/ ";
+all: $(RTLA)
+
+include $(srctree)/tools/build/Makefile.include
+include Makefile.rtla
+
+# check for dependencies only on required targets
+NON_CONFIG_TARGETS := clean install tarball doc doc_clean doc_install
+
+config		:= 1
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
+ config		:= 0
+endif
 endif
 
-define show_dependencies
-	@echo "********************************************";				\
-	echo "** NOTICE: Failed build dependencies";					\
-	echo "**";									\
-	echo "** Required Libraries:";							\
-	$(MISSING_LIBS)									\
-	echo "**";									\
-	echo "** Consider installing the latest libtracefs from your";			\
-	echo "** distribution, e.g., 'dnf install $(MISSING_PACKAGES)' on Fedora,";	\
-	echo "** or from source:";							\
-	echo "**";									\
-	$(MISSING_SOURCE)								\
-	echo "**";									\
-	echo "********************************************"
-endef
-
-show_warnings:
-	$(call show_dependencies);
-
-ifneq ("$(WARNINGS)", "")
-ERROR_OUT = $(error Please add the necessary dependencies)
-
-warnings: $(WARNINGS)
-	$(ERROR_OUT)
+ifeq ($(config),1)
+  include $(srctree)/tools/build/Makefile.feature
+  include Makefile.config
 endif
 
-rtla: $(OBJ)
-	$(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS)
-
-static: $(OBJ)
-	$(CC) -o rtla-static $(LDFLAGS) --static $(OBJ) $(LIBS) -lpthread -ldl
-
-.PHONY: install
-install: doc_install
-	$(MKDIR) -p $(DESTDIR)$(BINDIR)
-	$(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR)
-	$(STRIP) $(DESTDIR)$(BINDIR)/rtla
-	@test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise
-	ln -s rtla $(DESTDIR)$(BINDIR)/osnoise
-	@test ! -f $(DESTDIR)$(BINDIR)/hwnoise || rm $(DESTDIR)$(BINDIR)/hwnoise
-	ln -s rtla $(DESTDIR)$(BINDIR)/hwnoise
-	@test ! -f $(DESTDIR)$(BINDIR)/timerlat || rm $(DESTDIR)$(BINDIR)/timerlat
-	ln -s rtla $(DESTDIR)$(BINDIR)/timerlat
-
-.PHONY: clean tarball
-clean: doc_clean
-	@test ! -f rtla || rm rtla
-	@test ! -f rtla-static || rm rtla-static
-	@test ! -f src/rtla.o || rm src/rtla.o
-	@test ! -f $(TARBALL) || rm -f $(TARBALL)
-	@rm -rf *~ $(OBJ) *.tar.$(CEXT)
-
-tarball: clean
-	rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION)
-	echo $(VERSION) > $(NAME)-$(VERSION)/VERSION
-	cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION)
-	mkdir $(NAME)-$(VERSION)/Documentation/
-	cp -rp $(SRCTREE)/../../../Documentation/tools/rtla/* $(NAME)-$(VERSION)/Documentation/
-	tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION)
-	rm -rf $(NAME)-$(VERSION)
-
-.PHONY: doc doc_clean doc_install
-doc:
-	$(MAKE) -C $(DOCSRC)
-
-doc_clean:
-	$(MAKE) -C $(DOCSRC) clean
-
-doc_install:
-	$(MAKE) -C $(DOCSRC) install
+CFLAGS		+= $(INCLUDES) $(LIB_INCLUDES)
+
+export CFLAGS OUTPUT srctree
+
+$(RTLA): $(RTLA_IN)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RTLA) $(RTLA_IN) $(EXTLIBS)
+
+static: $(RTLA_IN)
+	$(eval LDFLAGS += -static)
+	$(QUIET_LINK)$(CC) -static $(LDFLAGS) -o $(RTLA)-static $(RTLA_IN)  $(EXTLIBS)
+
+rtla.%: fixdep FORCE
+	make -f $(srctree)/tools/build/Makefile.build dir=. $@
+
+$(RTLA_IN): fixdep FORCE
+	make $(build)=rtla
+
+clean: doc_clean fixdep-clean
+	$(call QUIET_CLEAN, rtla)
+	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+	$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
+	$(Q)rm -rf feature
+.PHONY: FORCE clean
diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config
new file mode 100644
index 000000000000..6d4ba77847b6
--- /dev/null
+++ b/tools/tracing/rtla/Makefile.config
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+STOP_ERROR :=
+
+LIBTRACEEVENT_MIN_VERSION = 1.5
+LIBTRACEFS_MIN_VERSION = 1.3
+
+define lib_setup
+  $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
+  $(eval EXTLIBS += -l$(1))
+endef
+
+$(call feature_check,libtraceevent)
+ifeq ($(feature-libtraceevent), 1)
+  $(call detected,CONFIG_LIBTRACEEVENT)
+
+  TEST = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEEVENT_MIN_VERSION) libtraceevent > /dev/null 2>&1 && echo y || echo n")
+  ifeq ($(TEST),n)
+    $(info libtraceevent version is too low, it must be at least $(LIBTRACEEVENT_MIN_VERSION))
+    STOP_ERROR := 1
+  endif
+
+  $(call lib_setup,traceevent)
+else
+  STOP_ERROR := 1
+  $(info libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel)
+endif
+
+$(call feature_check,libtracefs)
+ifeq ($(feature-libtracefs), 1)
+  $(call detected,CONFIG_LIBTRACEFS)
+
+  TEST = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEFS_MIN_VERSION) libtracefs > /dev/null 2>&1 && echo y || echo n")
+  ifeq ($(TEST),n)
+    $(info libtracefs version is too low, it must be at least $(LIBTRACEFS_MIN_VERSION))
+    STOP_ERROR := 1
+  endif
+
+  $(call lib_setup,tracefs)
+else
+  STOP_ERROR := 1
+  $(info libtracefs is missing. Please install libtracefs-dev/libtracefs-devel)
+endif
+
+ifeq ($(STOP_ERROR),1)
+  $(error Please, check the errors above.)
+endif
diff --git a/tools/tracing/rtla/Makefile.rtla b/tools/tracing/rtla/Makefile.rtla
new file mode 100644
index 000000000000..3ff0b8970896
--- /dev/null
+++ b/tools/tracing/rtla/Makefile.rtla
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
+$(call allow-override,CC,$(CROSS_COMPILE)gcc)
+$(call allow-override,AR,$(CROSS_COMPILE)ar)
+$(call allow-override,STRIP,$(CROSS_COMPILE)strip)
+$(call allow-override,PKG_CONFIG,pkg-config)
+$(call allow-override,LD_SO_CONF_PATH,/etc/ld.so.conf.d/)
+$(call allow-override,LDCONFIG,ldconfig)
+export CC AR STRIP PKG_CONFIG LD_SO_CONF_PATH LDCONFIG
+
+FOPTS		:= -flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong	\
+		-fasynchronous-unwind-tables -fstack-clash-protection
+WOPTS		:= -O -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2		\
+		-Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized
+
+ifeq ($(CC),clang)
+  FOPTS		:= $(filter-out -flto=auto -ffat-lto-objects, $(FOPTS))
+  WOPTS		:= $(filter-out -Wno-maybe-uninitialized, $(WOPTS))
+endif
+
+CFLAGS		:= -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(WOPTS) $(CFLAGS)
+LDFLAGS		:= -ggdb $(LDFLAGS)
+
+RM		:= rm -rf
+LN		:= ln -s
+INSTALL		:= install
+MKDIR		:= mkdir
+STRIP		:= strip
+BINDIR		:= /usr/bin
+
+.PHONY: install
+install: doc_install
+	@$(MKDIR) -p $(DESTDIR)$(BINDIR)
+	$(call QUIET_INSTALL,rtla)$(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR)
+	@$(STRIP) $(DESTDIR)$(BINDIR)/rtla
+	@test ! -f $(DESTDIR)$(BINDIR)/osnoise || $(RM) $(DESTDIR)$(BINDIR)/osnoise
+	@$(LN) rtla $(DESTDIR)$(BINDIR)/osnoise
+	@test ! -f $(DESTDIR)$(BINDIR)/hwnoise || $(RM) $(DESTDIR)$(BINDIR)/hwnoise
+	@$(LN) -s rtla $(DESTDIR)$(BINDIR)/hwnoise
+	@test ! -f $(DESTDIR)$(BINDIR)/timerlat || $(RM) $(DESTDIR)$(BINDIR)/timerlat
+	@$(LN) -s rtla $(DESTDIR)$(BINDIR)/timerlat
+
+.PHONY: doc doc_clean doc_install
+doc:
+	$(MAKE) -C $(DOCSRC)
+
+doc_clean:
+	$(MAKE) -C $(DOCSRC) clean
+
+doc_install:
+	$(MAKE) -C $(DOCSRC) install
+
+# This section is neesary for the tarball, when the tarball
+# support is removed, we can delete these entries.
+NAME		:= rtla
+DIRS		:= src
+FILES		:= Makefile README.txt
+CEXT		:= bz2
+TARBALL		:= $(NAME)-$(VERSION).tar.$(CEXT)
+TAROPTS		:= -cvjf $(TARBALL)
+SRCTREE		:=	$(or $(BUILD_SRC),$(CURDIR))
+
+tarball: clean
+	$(RM) $(NAME)-$(VERSION) && $(MKDIR) $(NAME)-$(VERSION)
+	echo $(VERSION) > $(NAME)-$(VERSION)/VERSION
+	cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION)
+	$(MKDIR) $(NAME)-$(VERSION)/Documentation/
+	cp -rp $(SRCTREE)/../../../Documentation/tools/$(NAME)/* $(NAME)-$(VERSION)/Documentation/
+	cp Makefile.standalone $(NAME)-$(VERSION)/Makefile
+	cp Makefile.$(NAME) $(NAME)-$(VERSION)/
+	tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION)
+	$(RM) $(NAME)-$(VERSION)
+.PHONY: tarball
diff --git a/tools/tracing/rtla/Makefile.standalone b/tools/tracing/rtla/Makefile.standalone
new file mode 100644
index 000000000000..86d07cb52fa5
--- /dev/null
+++ b/tools/tracing/rtla/Makefile.standalone
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+VERSION		:= $(shell cat VERSION)
+CFLAGS		+= $$($(PKG_CONFIG) --cflags libtracefs)
+EXTLIBS		+= $$($(PKG_CONFIG) --libs libtracefs)
+
+rtla:
+
+include Makefile.rtla
+
+SRC		:= $(wildcard src/*.c)
+HDR		:= $(wildcard src/*.h)
+OBJ		:= $(SRC:.c=.o)
+DOCSRC		:= Documentation/
+
+rtla: $(OBJ)
+	$(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS) $(EXTLIBS)
+	$(info This is a deprecated method to compile RTLA, please compile from Linux kernel source)
+
+.PHONY: clean tarball
+clean: doc_clean
+	@test ! -f rtla || rm rtla
+	@test ! -f rtla-static || rm rtla-static
+	@test ! -f src/rtla.o || rm src/rtla.o
+	@test ! -f $(TARBALL) || rm -f $(TARBALL)
+	@rm -rf *~ $(OBJ) *.tar.$(CEXT)
diff --git a/tools/tracing/rtla/sample/timerlat_load.py b/tools/tracing/rtla/sample/timerlat_load.py
new file mode 100644
index 000000000000..8cc5eb2d2e69
--- /dev/null
+++ b/tools/tracing/rtla/sample/timerlat_load.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (C) 2024 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+#
+# This is a sample code about how to use timerlat's timer by any workload
+# so rtla can measure and provide auto-analysis for the overall latency (IOW
+# the response time) for a task.
+#
+# Before running it, you need to dispatch timerlat with -U option in a terminal.
+# Then # run this script pinned to a CPU on another terminal. For example:
+#
+# timerlat_load.py 1 -p 95
+#
+# The "Timerlat IRQ" is the IRQ latency, The thread latency is the latency
+# for the python process to get the CPU. The Ret from user Timer Latency is
+# the overall latency. In other words, it is the response time for that
+# activation.
+#
+# This is just an example, the load is reading 20MB of data from /dev/full
+# It is in python because it is easy to read :-)
+
+import argparse
+import sys
+import os
+
+parser = argparse.ArgumentParser(description='user-space timerlat thread in Python')
+parser.add_argument("cpu", help='CPU to run timerlat thread')
+parser.add_argument("-p", "--prio", help='FIFO priority')
+
+args = parser.parse_args()
+
+try:
+    affinity_mask = { int(args.cpu) }
+except:
+    print("Invalid cpu: " + args.cpu)
+    exit(1)
+
+try:
+    os.sched_setaffinity(0, affinity_mask);
+except:
+    print("Error setting affinity")
+    exit(1)
+
+if (args.prio):
+    try:
+        param = os.sched_param(int(args.prio))
+        os.sched_setscheduler(0, os.SCHED_FIFO, param)
+    except:
+        print("Error setting priority")
+        exit(1)
+
+try:
+    timerlat_path = "/sys/kernel/tracing/osnoise/per_cpu/cpu" + args.cpu + "/timerlat_fd"
+    timerlat_fd = open(timerlat_path, 'r')
+except:
+    print("Error opening timerlat fd, did you run timerlat -U?")
+    exit(1)
+
+try:
+    data_fd = open("/dev/full", 'r');
+except:
+    print("Error opening data fd")
+
+while True:
+    try:
+        timerlat_fd.read(1)
+        data_fd.read(20*1024*1024)
+    except:
+        print("Leaving")
+        break
+
+timerlat_fd.close()
+data_fd.close()
diff --git a/tools/tracing/rtla/src/Build b/tools/tracing/rtla/src/Build
new file mode 100644
index 000000000000..dbed9e31829b
--- /dev/null
+++ b/tools/tracing/rtla/src/Build
@@ -0,0 +1,11 @@
+rtla-y += trace.o
+rtla-y += utils.o
+rtla-y += osnoise.o
+rtla-y += osnoise_top.o
+rtla-y += osnoise_hist.o
+rtla-y += timerlat.o
+rtla-y += timerlat_top.o
+rtla-y += timerlat_hist.o
+rtla-y += timerlat_u.o
+rtla-y += timerlat_aa.o
+rtla-y += rtla.o
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index dbf154082f95..8bd51aab6513 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -39,6 +39,7 @@ struct timerlat_hist_params {
 	int			hk_cpus;
 	int			no_aa;
 	int			dump_tasks;
+	int			user_workload;
 	int			user_hist;
 	cpu_set_t		hk_cpu_set;
 	struct sched_attr	sched_param;
@@ -534,6 +535,7 @@ static void timerlat_hist_usage(char *usage)
 		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
 		"						       in nanoseconds",
 		"	  -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads",
+		"	  -U/--user-load: enable timerlat for user-defined user-space workload",
 		NULL,
 	};
 
@@ -595,6 +597,7 @@ static struct timerlat_hist_params
 			{"thread",		required_argument,	0, 'T'},
 			{"trace",		optional_argument,	0, 't'},
 			{"user-threads",	no_argument,		0, 'u'},
+			{"user-load",		no_argument,		0, 'U'},
 			{"event",		required_argument,	0, 'e'},
 			{"no-irq",		no_argument,		0, '0'},
 			{"no-thread",		no_argument,		0, '1'},
@@ -613,7 +616,7 @@ static struct timerlat_hist_params
 		/* getopt_long stores the option index here. */
 		int option_index = 0;
 
-		c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:u0123456:7:8:9\1",
+		c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:np:P:s:t::T:uU0123456:7:8:9\1",
 				 long_options, &option_index);
 
 		/* detect the end of the options. */
@@ -724,6 +727,9 @@ static struct timerlat_hist_params
 				params->trace_output = "timerlat_trace.txt";
 			break;
 		case 'u':
+			params->user_workload = 1;
+			/* fallback: -u implies in -U */
+		case 'U':
 			params->user_hist = 1;
 			break;
 		case '0': /* no irq */
@@ -985,7 +991,7 @@ int timerlat_hist_main(int argc, char *argv[])
 		}
 	}
 
-	if (params->cgroup && !params->user_hist) {
+	if (params->cgroup && !params->user_workload) {
 		retval = set_comm_cgroup("timerlat/", params->cgroup_name);
 		if (!retval) {
 			err_msg("Failed to move threads to cgroup\n");
@@ -1049,7 +1055,7 @@ int timerlat_hist_main(int argc, char *argv[])
 	tool->start_time = time(NULL);
 	timerlat_hist_set_signals(params);
 
-	if (params->user_hist) {
+	if (params->user_workload) {
 		/* rtla asked to stop */
 		params_u.should_run = 1;
 		/* all threads left */
@@ -1086,14 +1092,14 @@ int timerlat_hist_main(int argc, char *argv[])
 			break;
 
 		/* is there still any user-threads ? */
-		if (params->user_hist) {
+		if (params->user_workload) {
 			if (params_u.stopped_running) {
 				debug_msg("timerlat user-space threads stopped!\n");
 				break;
 			}
 		}
 	}
-	if (params->user_hist && !params_u.stopped_running) {
+	if (params->user_workload && !params_u.stopped_running) {
 		params_u.should_run = 0;
 		sleep(1);
 	}
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 3e9af2c38688..8a3fa64319c6 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -43,6 +43,7 @@ struct timerlat_top_params {
 	int			cgroup;
 	int			hk_cpus;
 	int			user_top;
+	int			user_workload;
 	cpu_set_t		hk_cpu_set;
 	struct sched_attr	sched_param;
 	struct trace_events	*events;
@@ -364,6 +365,7 @@ static void timerlat_top_usage(char *usage)
 		"		d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period",
 		"						       in nanoseconds",
 		"	  -u/--user-threads: use rtla user-space threads instead of in-kernel timerlat threads",
+		"	  -U/--user-load: enable timerlat for user-defined user-space workload",
 		NULL,
 	};
 
@@ -423,6 +425,7 @@ static struct timerlat_top_params
 			{"thread",		required_argument,	0, 'T'},
 			{"trace",		optional_argument,	0, 't'},
 			{"user-threads",	no_argument,		0, 'u'},
+			{"user-load",		no_argument,		0, 'U'},
 			{"trigger",		required_argument,	0, '0'},
 			{"filter",		required_argument,	0, '1'},
 			{"dma-latency",		required_argument,	0, '2'},
@@ -435,7 +438,7 @@ static struct timerlat_top_params
 		/* getopt_long stores the option index here. */
 		int option_index = 0;
 
-		c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:u0:1:2:345:",
+		c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:np:P:qs:t::T:uU0:1:2:345:",
 				 long_options, &option_index);
 
 		/* detect the end of the options. */
@@ -552,6 +555,9 @@ static struct timerlat_top_params
 
 			break;
 		case 'u':
+			params->user_workload = true;
+			/* fallback: -u implies -U */
+		case 'U':
 			params->user_top = true;
 			break;
 		case '0': /* trigger */
@@ -869,7 +875,7 @@ int timerlat_top_main(int argc, char *argv[])
 	top->start_time = time(NULL);
 	timerlat_top_set_signals(params);
 
-	if (params->user_top) {
+	if (params->user_workload) {
 		/* rtla asked to stop */
 		params_u.should_run = 1;
 		/* all threads left */
@@ -912,7 +918,7 @@ int timerlat_top_main(int argc, char *argv[])
 			break;
 
 		/* is there still any user-threads ? */
-		if (params->user_top) {
+		if (params->user_workload) {
 			if (params_u.stopped_running) {
 				debug_msg("timerlat user space threads stopped!\n");
 				break;
@@ -920,7 +926,7 @@ int timerlat_top_main(int argc, char *argv[])
 		}
 	}
 
-	if (params->user_top && !params_u.stopped_running) {
+	if (params->user_workload && !params_u.stopped_running) {
 		params_u.should_run = 0;
 		sleep(1);
 	}
diff --git a/tools/verification/rv/.gitignore b/tools/verification/rv/.gitignore
new file mode 100644
index 000000000000..34a486585a34
--- /dev/null
+++ b/tools/verification/rv/.gitignore
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+rv
+rv-static
+fixdep
+feature
+FEATURE-DUMP
diff --git a/tools/verification/rv/Build b/tools/verification/rv/Build
new file mode 100644
index 000000000000..a44c22349d4b
--- /dev/null
+++ b/tools/verification/rv/Build
@@ -0,0 +1 @@
+rv-y += src/
diff --git a/tools/verification/rv/Makefile b/tools/verification/rv/Makefile
index 485f8aeddbe0..411d62b3d8eb 100644
--- a/tools/verification/rv/Makefile
+++ b/tools/verification/rv/Makefile
@@ -1,146 +1,87 @@
-NAME	:=	rv
-# Follow the kernel version
-VERSION :=	$(shell cat VERSION 2> /dev/null || make -sC ../../.. kernelversion | grep -v make)
-
-# From libtracefs:
-# Makefiles suck: This macro sets a default value of $(2) for the
-# variable named by $(1), unless the variable has been set by
-# environment or command line. This is necessary for CC and AR
-# because make sets default values, so the simpler ?= approach
-# won't work as expected.
-define allow-override
-  $(if $(or $(findstring environment,$(origin $(1))),\
-            $(findstring command line,$(origin $(1)))),,\
-    $(eval $(1) = $(2)))
-endef
-
-# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
-$(call allow-override,CC,$(CROSS_COMPILE)gcc)
-$(call allow-override,AR,$(CROSS_COMPILE)ar)
-$(call allow-override,STRIP,$(CROSS_COMPILE)strip)
-$(call allow-override,PKG_CONFIG,pkg-config)
-$(call allow-override,LD_SO_CONF_PATH,/etc/ld.so.conf.d/)
-$(call allow-override,LDCONFIG,ldconfig)
-
-INSTALL	=	install
-MKDIR	=	mkdir
-FOPTS	:=	-flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong \
-		-fasynchronous-unwind-tables -fstack-clash-protection
-WOPTS	:= 	-Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized
-
-ifeq ($(CC),clang)
-  FOPTS := $(filter-out -ffat-lto-objects, $(FOPTS))
-  WOPTS := $(filter-out -Wno-maybe-uninitialized, $(WOPTS))
+# SPDX-License-Identifier: GPL-2.0-only
+
+ifeq ($(srctree),)
+  srctree	:= $(patsubst %/,%,$(dir $(CURDIR)))
+  srctree	:= $(patsubst %/,%,$(dir $(srctree)))
+  srctree	:= $(patsubst %/,%,$(dir $(srctree)))
 endif
 
-TRACEFS_HEADERS	:= $$($(PKG_CONFIG) --cflags libtracefs)
-
-CFLAGS	:=	-O -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(MOPTS) $(WOPTS) $(TRACEFS_HEADERS) $(EXTRA_CFLAGS) -I include
-LDFLAGS	:=	-flto=auto -ggdb $(EXTRA_LDFLAGS)
-LIBS	:=	$$($(PKG_CONFIG) --libs libtracefs)
-
-SRC	:=	$(wildcard src/*.c)
-HDR	:=	$(wildcard src/*.h)
-OBJ	:=	$(SRC:.c=.o)
-DIRS	:=	src
-FILES	:=	Makefile README.txt
-CEXT	:=	bz2
-TARBALL	:=	$(NAME)-$(VERSION).tar.$(CEXT)
-TAROPTS	:=	-cvjf $(TARBALL)
-BINDIR	:=	/usr/bin
-DATADIR	:=	/usr/share
-DOCDIR	:=	$(DATADIR)/doc
-MANDIR	:=	$(DATADIR)/man
-LICDIR	:=	$(DATADIR)/licenses
-SRCTREE	:=	$(or $(BUILD_SRC),$(CURDIR))
-
-# If running from the tarball, man pages are stored in the Documentation
-# dir. If running from the kernel source, man pages are stored in
-# Documentation/tools/rv/.
-ifneq ($(wildcard Documentation/.*),)
-DOCSRC	=	Documentation/
+include $(srctree)/tools/scripts/Makefile.include
+
+# O is an alias for OUTPUT
+OUTPUT          := $(O)
+
+ifeq ($(OUTPUT),)
+  OUTPUT       := $(CURDIR)/
 else
-DOCSRC	=	$(SRCTREE)/../../../Documentation/tools/rv/
+  # subdir is used by the ../Makefile in $(call descend,)
+  ifneq ($(subdir),)
+    OUTPUT        := $(OUTPUT)/$(subdir)
+  endif
+endif
+
+ifneq ($(patsubst %/,,$(lastword $(OUTPUT))),)
+  OUTPUT := $(OUTPUT)/
 endif
 
-LIBTRACEEVENT_MIN_VERSION = 1.5
-LIBTRACEFS_MIN_VERSION = 1.3
+RV		:= $(OUTPUT)rv
+RV_IN		:= $(RV)-in.o
 
-.PHONY:	all warnings show_warnings
-all:	warnings rv
+VERSION		:= $(shell sh -c "make -sC ../../.. kernelversion | grep -v make")
+DOCSRC		:= ../../../Documentation/tools/rv/
 
-TEST_LIBTRACEEVENT = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEEVENT_MIN_VERSION) libtraceevent > /dev/null 2>&1 || echo n")
-ifeq ("$(TEST_LIBTRACEEVENT)", "n")
-WARNINGS = show_warnings
-MISSING_LIBS += echo "**   libtraceevent version $(LIBTRACEEVENT_MIN_VERSION) or higher";
-MISSING_PACKAGES += "libtraceevent-devel"
-MISSING_SOURCE += echo "**  https://git.kernel.org/pub/scm/libs/libtrace/libtraceevent.git/ ";
+FEATURE_TESTS	:= libtraceevent
+FEATURE_TESTS	+= libtracefs
+FEATURE_DISPLAY	:= libtraceevent
+FEATURE_DISPLAY	+= libtracefs
+
+ifeq ($(V),1)
+  Q		=
+else
+  Q		= @
 endif
 
-TEST_LIBTRACEFS = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEFS_MIN_VERSION) libtracefs > /dev/null 2>&1 || echo n")
-ifeq ("$(TEST_LIBTRACEFS)", "n")
-WARNINGS = show_warnings
-MISSING_LIBS += echo "**   libtracefs version $(LIBTRACEFS_MIN_VERSION) or higher";
-MISSING_PACKAGES += "libtracefs-devel"
-MISSING_SOURCE += echo "**  https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/ ";
+all: $(RV)
+
+include $(srctree)/tools/build/Makefile.include
+include Makefile.rv
+
+# check for dependencies only on required targets
+NON_CONFIG_TARGETS := clean install doc doc_clean doc_install
+
+config		:= 1
+ifdef MAKECMDGOALS
+  ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
+  config	:= 0
+endif
 endif
 
-define show_dependencies
-	@echo "********************************************";				\
-	echo "** NOTICE: Failed build dependencies";					\
-	echo "**";									\
-	echo "** Required Libraries:";							\
-	$(MISSING_LIBS)									\
-	echo "**";									\
-	echo "** Consider installing the latest libtracefs from your";			\
-	echo "** distribution, e.g., 'dnf install $(MISSING_PACKAGES)' on Fedora,";	\
-	echo "** or from source:";							\
-	echo "**";									\
-	$(MISSING_SOURCE)								\
-	echo "**";									\
-	echo "********************************************"
-endef
-
-show_warnings:
-	$(call show_dependencies);
-
-ifneq ("$(WARNINGS)", "")
-ERROR_OUT = $(error Please add the necessary dependencies)
-
-warnings: $(WARNINGS)
-	$(ERROR_OUT)
+ifeq ($(config),1)
+  include $(srctree)/tools/build/Makefile.feature
+  include Makefile.config
 endif
 
-rv: $(OBJ)
-	$(CC) -o rv $(LDFLAGS) $(OBJ) $(LIBS)
-
-.PHONY: install
-install: doc_install
-	$(MKDIR) -p $(DESTDIR)$(BINDIR)
-	$(INSTALL) rv -m 755 $(DESTDIR)$(BINDIR)
-	$(STRIP) $(DESTDIR)$(BINDIR)/rv
-
-.PHONY: clean tarball
-clean: doc_clean
-	@test ! -f rv || rm rv
-	@test ! -f $(TARBALL) || rm -f $(TARBALL)
-	@rm -rf *~ $(OBJ) *.tar.$(CEXT)
-
-tarball: clean
-	rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION)
-	echo $(VERSION) > $(NAME)-$(VERSION)/VERSION
-	cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION)
-	mkdir $(NAME)-$(VERSION)/Documentation/
-	cp -rp $(SRCTREE)/../../../Documentation/tools/rv/* $(NAME)-$(VERSION)/Documentation/
-	tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION)
-	rm -rf $(NAME)-$(VERSION)
-
-.PHONY: doc doc_clean doc_install
-doc:
-	$(MAKE) -C $(DOCSRC)
-
-doc_clean:
-	$(MAKE) -C $(DOCSRC) clean
-
-doc_install:
-	$(MAKE) -C $(DOCSRC) install
+CFLAGS		+= $(INCLUDES) $(LIB_INCLUDES)
+
+export CFLAGS OUTPUT srctree
+
+$(RV): $(RV_IN)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RV) $(RV_IN) $(EXTLIBS)
+
+static: $(RV_IN)
+	$(eval LDFLAGS += -static)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $(RV)-static $(RV_IN) $(EXTLIBS)
+
+rv.%: fixdep FORCE
+	make -f $(srctree)/tools/build/Makefile.build dir=. $@
+
+$(RV_IN): fixdep FORCE
+	make $(build)=rv
+
+clean: doc_clean fixdep-clean
+	$(call QUIET_CLEAN, rv)
+	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+	$(Q)rm -f rv rv-static fixdep FEATURE-DUMP rv-*
+	$(Q)rm -rf feature
+
+.PHONY: FORCE clean
diff --git a/tools/verification/rv/Makefile.config b/tools/verification/rv/Makefile.config
new file mode 100644
index 000000000000..6d4ba77847b6
--- /dev/null
+++ b/tools/verification/rv/Makefile.config
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+STOP_ERROR :=
+
+LIBTRACEEVENT_MIN_VERSION = 1.5
+LIBTRACEFS_MIN_VERSION = 1.3
+
+define lib_setup
+  $(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
+  $(eval EXTLIBS += -l$(1))
+endef
+
+$(call feature_check,libtraceevent)
+ifeq ($(feature-libtraceevent), 1)
+  $(call detected,CONFIG_LIBTRACEEVENT)
+
+  TEST = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEEVENT_MIN_VERSION) libtraceevent > /dev/null 2>&1 && echo y || echo n")
+  ifeq ($(TEST),n)
+    $(info libtraceevent version is too low, it must be at least $(LIBTRACEEVENT_MIN_VERSION))
+    STOP_ERROR := 1
+  endif
+
+  $(call lib_setup,traceevent)
+else
+  STOP_ERROR := 1
+  $(info libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel)
+endif
+
+$(call feature_check,libtracefs)
+ifeq ($(feature-libtracefs), 1)
+  $(call detected,CONFIG_LIBTRACEFS)
+
+  TEST = $(shell sh -c "$(PKG_CONFIG) --atleast-version $(LIBTRACEFS_MIN_VERSION) libtracefs > /dev/null 2>&1 && echo y || echo n")
+  ifeq ($(TEST),n)
+    $(info libtracefs version is too low, it must be at least $(LIBTRACEFS_MIN_VERSION))
+    STOP_ERROR := 1
+  endif
+
+  $(call lib_setup,tracefs)
+else
+  STOP_ERROR := 1
+  $(info libtracefs is missing. Please install libtracefs-dev/libtracefs-devel)
+endif
+
+ifeq ($(STOP_ERROR),1)
+  $(error Please, check the errors above.)
+endif
diff --git a/tools/verification/rv/Makefile.rv b/tools/verification/rv/Makefile.rv
new file mode 100644
index 000000000000..161baa29eb86
--- /dev/null
+++ b/tools/verification/rv/Makefile.rv
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
+$(call allow-override,CC,$(CROSS_COMPILE)gcc)
+$(call allow-override,AR,$(CROSS_COMPILE)ar)
+$(call allow-override,STRIP,$(CROSS_COMPILE)strip)
+$(call allow-override,PKG_CONFIG,pkg-config)
+$(call allow-override,LD_SO_CONF_PATH,/etc/ld.so.conf.d/)
+$(call allow-override,LDCONFIG,ldconfig)
+export CC AR STRIP PKG_CONFIG LD_SO_CONF_PATH LDCONFIG
+
+FOPTS		:=-flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong	\
+		-fasynchronous-unwind-tables -fstack-clash-protection
+WOPTS		:=	-O -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2	\
+		-Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized
+
+ifeq ($(CC),clang)
+  FOPTS		:= $(filter-out -flto=auto -ffat-lto-objects, $(FOPTS))
+  WOPTS		:= $(filter-out -Wno-maybe-uninitialized, $(WOPTS))
+endif
+
+INCLUDE		:= -Iinclude/
+CFLAGS		:= -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(WOPTS) $(EXTRA_CFLAGS) $(INCLUDE)
+LDFLAGS		:= -ggdb $(EXTRA_LDFLAGS)
+
+INSTALL		:= install
+MKDIR		:= mkdir
+STRIP		:= strip
+BINDIR		:= /usr/bin
+
+.PHONY: install
+install: doc_install
+	$(Q)$(MKDIR) -p $(DESTDIR)$(BINDIR)
+	$(call QUIET_INSTALL,rv)$(INSTALL) $(OUTPUT)rv -m 755 $(DESTDIR)$(BINDIR)
+	$(Q)@$(STRIP) $(DESTDIR)$(BINDIR)/rv
+
+.PHONY: doc doc_clean doc_install
+doc:
+	$(MAKE) -C $(DOCSRC)
+
+doc_clean:
+	$(MAKE) -C $(DOCSRC) clean
+
+doc_install:
+	$(MAKE) -C $(DOCSRC) install
diff --git a/tools/verification/rv/src/Build b/tools/verification/rv/src/Build
new file mode 100644
index 000000000000..d781983c1a79
--- /dev/null
+++ b/tools/verification/rv/src/Build
@@ -0,0 +1,4 @@
+rv-y += trace.o
+rv-y += utils.o
+rv-y += in_kernel.o
+rv-y += rv.o