Re-apply r306516 (by cem):

Reduce the cost of TLB invalidation on x86 by using per-CPU completion flags Reduce contention during TLB invalidation operations by using a per-CPU completion flag, rather than a single atomically-updated variable. On a Westmere system (2 sockets x 4 cores x 1 threads), dtrace measurements show that smp_tlb_shootdown is about 50% faster with this patch; observations with VTune show that the percentage of time spent in invlrng_single_page on an interrupt (actually doing invalidation, rather than synchronization) increases from 31% with the old mechanism to 71% with the new one. (Running a basic file server workload.) Submitted by: Anton Rang <rang at acm.org> Reviewed by: cem (earlier version) Sponsored by: Dell EMC Isilon Differential Revision: https://reviews.freebsd.org/D8041
svn path=/head/; revision=306680
2016-10-04 17:01:24 +00:00 · 2016-10-04 17:01:24 +00:00 · 83c001d3c2 · 2020-12-20 02:59:44 +00:00
commit 83c001d3c2
parent 37d0ac15e6
5 changed files with 58 additions and 22 deletions
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@ -409,6 +409,7 @@ void
 invltlb_invpcid_handler(void)
 {
 	struct invpcid_descr d;
+	uint32_t generation;

 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
@ -417,17 +418,20 @@ invltlb_invpcid_handler(void)
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */

+	generation = smp_tlb_generation;
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
 	invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
 	    INVPCID_CTX);
-	atomic_add_int(&smp_tlb_wait, 1);
+	PCPU_SET(smp_tlb_done, generation);
 }

 void
 invltlb_pcid_handler(void)
 {
+	uint32_t generation;
+  
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
@ -435,6 +439,7 @@ invltlb_pcid_handler(void)
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */

+	generation = smp_tlb_generation;	/* Overlap with serialization */
 	if (smp_tlb_pmap == kernel_pmap) {
 		invltlb_glob();
 	} else {
@ -450,5 +455,5 @@ invltlb_pcid_handler(void)
 			    smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
 		}
 	}
-	atomic_add_int(&smp_tlb_wait, 1);
+	PCPU_SET(smp_tlb_done, generation);
 }
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@ -65,7 +65,8 @@
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	uint32_t pc_pcid_next;						\
 	uint32_t pc_pcid_gen;						\
-	char	__pad[149]		/* be divisor of PAGE_SIZE	\
+	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
+	char	__pad[145]		/* be divisor of PAGE_SIZE	\
 					   after cache alignment */

 #define	PC_DBREG_CMD_NONE	0
--- a/sys/i386/include/pcpu.h
+++ b/sys/i386/include/pcpu.h
@ -59,7 +59,8 @@
 	u_int	pc_cmci_mask;		/* MCx banks for CMCI */	\
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	vm_offset_t pc_qmap_addr;	/* KVA for temporary mappings */\
-	char	__pad[229]
+	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
+	char	__pad[225]

 #ifdef _KERNEL

--- a/sys/x86/include/x86_smp.h
+++ b/sys/x86/include/x86_smp.h
@ -35,7 +35,7 @@ extern volatile int aps_ready;
 extern struct mtx ap_boot_mtx;
 extern int cpu_logical;
 extern int cpu_cores;
-extern volatile int smp_tlb_wait;
+extern volatile uint32_t smp_tlb_generation;
 extern struct pmap *smp_tlb_pmap;
 extern u_int xhits_gbl[];
 extern u_int xhits_pg[];
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@ -1304,12 +1304,22 @@ cpususpend_handler(void)
 void
 invlcache_handler(void)
 {
+	uint32_t generation;
+
 #ifdef COUNT_IPIS
 	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */

+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since wbinvd is a serializing instruction.  Without the
+	 * temporary, we'd wait for wbinvd to complete, then the read
+	 * would execute, then the dependent write, whuch must then
+	 * complete before return from interrupt.
+	 */
+	generation = smp_tlb_generation;
 	wbinvd();
-	atomic_add_int(&smp_tlb_wait, 1);
+	PCPU_SET(smp_tlb_done, generation);
 }

 /*
@ -1367,7 +1377,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
 /* Variables needed for SMP tlb shootdown. */
 static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
-volatile int smp_tlb_wait;
+volatile uint32_t smp_tlb_generation;

 #ifdef __amd64__
 #define	read_eflags() read_rflags()
@ -1377,15 +1387,16 @@ static void
 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
    vm_offset_t addr1, vm_offset_t addr2)
 {
-	int cpu, ncpu, othercpus;
-
-	othercpus = mp_ncpus - 1;	/* does not shootdown self */
+	cpuset_t other_cpus;
+	volatile uint32_t *p_cpudone;
+	uint32_t generation;
+	int cpu;

 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	if (CPU_ISFULLSET(&mask)) {
-		if (othercpus < 1)
+		if (mp_ncpus <= 1)
 			return;
 	} else {
 		CPU_CLR(PCPU_GET(cpuid), &mask);
@ -1399,23 +1410,28 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	smp_tlb_pmap = pmap;
-	smp_tlb_wait =  0;
+	generation = ++smp_tlb_generation;
 	if (CPU_ISFULLSET(&mask)) {
-		ncpu = othercpus;
 		ipi_all_but_self(vector);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 	} else {
-		ncpu = 0;
+		other_cpus = mask;
 		while ((cpu = CPU_FFS(&mask)) != 0) {
 			cpu--;
 			CPU_CLR(cpu, &mask);
 			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
 			    cpu, vector);
 			ipi_send_cpu(cpu, vector);
-			ncpu++;
 		}
 	}
-	while (smp_tlb_wait < ncpu)
-		ia32_pause();
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+		while (*p_cpudone != generation)
+			ia32_pause();
+	}
 	mtx_unlock_spin(&smp_ipi_mtx);
 }

@ -1473,6 +1489,8 @@ smp_cache_flush(void)
 void
 invltlb_handler(void)
 {
+	uint32_t generation;
+  
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
@ -1480,16 +1498,23 @@ invltlb_handler(void)
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */

+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since invalidating the TLB is a serializing operation.
+	 */
+	generation = smp_tlb_generation;
 	if (smp_tlb_pmap == kernel_pmap)
 		invltlb_glob();
 	else
 		invltlb();
-	atomic_add_int(&smp_tlb_wait, 1);
+	PCPU_SET(smp_tlb_done, generation);
 }

 void
 invlpg_handler(void)
 {
+	uint32_t generation;
+
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
@ -1497,14 +1522,16 @@ invlpg_handler(void)
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */

+	generation = smp_tlb_generation;	/* Overlap with serialization */
 	invlpg(smp_tlb_addr1);
-	atomic_add_int(&smp_tlb_wait, 1);
+	PCPU_SET(smp_tlb_done, generation);
 }

 void
 invlrng_handler(void)
 {
-	vm_offset_t addr;
+	vm_offset_t addr, addr2;
+	uint32_t generation;

 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
@ -1514,10 +1541,12 @@ invlrng_handler(void)
 #endif /* COUNT_IPIS */

 	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
-	} while (addr < smp_tlb_addr2);
+	} while (addr < addr2);

-	atomic_add_int(&smp_tlb_wait, 1);
+	PCPU_SET(smp_tlb_done, generation);
 }