
x86: Make the divisor in setting non_temporal_threshold cpu specific

Different systems prefer different divisors.

From benchmarks[1] so far the following divisors have been found:
    ICX     : 2
    SKX     : 2
    BWD     : 8

For Intel, we are generalizing that BWD and older prefer 8 as a
divisor, and SKL and newer prefer 2. These numbers can be further tuned
as benchmarks are run.

[1]: https://github.com/goldsteinn/memcpy-nt-benchmarks
Reviewed-by: DJ Delorie <dj@redhat.com>
Author: Noah Goldstein
Date:   2023-06-07 13:18:03 -05:00
parent f193ea20ed
commit 180897c161
4 changed files with 51 additions and 26 deletions
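
To make the numbers above concrete: the divisor is applied to a thread's
share of the L3 cache to produce the default non_temporal_threshold (see
the dl-cacheinfo.h hunk below). A minimal standalone sketch, assuming a
hypothetical 32MB per-thread L3 share (not a value from the patch):

#include <stdio.h>

int
main (void)
{
  /* Assumed per-thread share of L3; the real value comes from
     dl_init_cacheinfo.  */
  unsigned long int shared = 32UL << 20;
  /* Divisors from the table above: SKL+ -> 2, default -> 4, BWD- -> 8.  */
  const unsigned long int divisors[] = { 2, 4, 8 };
  for (int i = 0; i < 3; ++i)
    printf ("divisor %lu -> non_temporal_threshold = %lu MB\n",
            divisors[i], (shared / divisors[i]) >> 20);
  return 0;
}

For a 32MB share this yields 16MB, 8MB, and 4MB thresholds respectively.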

sysdeps/x86/cpu-features.c

@@ -636,6 +636,7 @@ init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
+  cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
     {
@@ -716,13 +717,13 @@ init_cpu_features (struct cpu_features *cpu_features)
 
             /* Bigcore/Default Tuning.  */
         default:
+        default_tuning:
           /* Unknown family 0x06 processors.  Assuming this is one
              of Core i3/i5/i7 processors if AVX is available.  */
           if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
             break;
-          /* Fall through.  */
-        case INTEL_BIGCORE_NEHALEM:
-        case INTEL_BIGCORE_WESTMERE:
+
+        enable_modern_features:
           /* Rep string instructions, unaligned load, unaligned copy,
              and pminub are fast on Intel Core i3, i5 and i7.  */
           cpu_features->preferred[index_arch_Fast_Rep_String]
@@ -732,12 +733,23 @@ init_cpu_features (struct cpu_features *cpu_features)
                 | bit_arch_Prefer_PMINUB_for_stringop);
           break;
 
-        /*
-         Default tuned Bigcore microarch.
+        case INTEL_BIGCORE_NEHALEM:
+        case INTEL_BIGCORE_WESTMERE:
+          /* Older CPUs prefer non-temporal stores at a lower threshold.  */
+          cpu_features->cachesize_non_temporal_divisor = 8;
+          goto enable_modern_features;
+
+          /* Older Bigcore microarch (smaller non-temporal store
+             threshold).  */
        case INTEL_BIGCORE_SANDYBRIDGE:
        case INTEL_BIGCORE_IVYBRIDGE:
        case INTEL_BIGCORE_HASWELL:
        case INTEL_BIGCORE_BROADWELL:
+          cpu_features->cachesize_non_temporal_divisor = 8;
+          goto default_tuning;
+
+          /* Newer Bigcore microarch (larger non-temporal store
+             threshold).  */
        case INTEL_BIGCORE_SKYLAKE:
        case INTEL_BIGCORE_KABYLAKE:
        case INTEL_BIGCORE_COMETLAKE:
@@ -753,13 +765,14 @@ init_cpu_features (struct cpu_features *cpu_features)
        case INTEL_BIGCORE_SAPPHIRERAPIDS:
        case INTEL_BIGCORE_EMERALDRAPIDS:
        case INTEL_BIGCORE_GRANITERAPIDS:
-        */
+          cpu_features->cachesize_non_temporal_divisor = 2;
+          goto default_tuning;
 
-        /*
-         Default tuned Mixed (bigcore + atom SOC).
+          /* Default tuned Mixed (bigcore + atom SOC).  */
        case INTEL_MIXED_LAKEFIELD:
        case INTEL_MIXED_ALDERLAKE:
-        */
+          cpu_features->cachesize_non_temporal_divisor = 2;
+          goto default_tuning;
        }
 
     /* Disable TSX on some processors to avoid TSX on kernels that
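
The default_tuning and enable_modern_features labels turn the switch into a
small dispatch: each microarch case sets its divisor first, then jumps to
the shared tuning path it wants. A compilable model of that control flow,
with names and cases simplified (a sketch, not the glibc source):

#include <stdio.h>

enum uarch { UARCH_NEHALEM, UARCH_BROADWELL, UARCH_SKYLAKE, UARCH_OTHER };

static unsigned long int
pick_divisor (enum uarch u, int has_avx)
{
  unsigned long int divisor = 4;        /* Baseline, set before dispatch.  */
  switch (u)
    {
    default:
    default_tuning:
      /* Unknown microarch: only continue tuning if AVX is present.  */
      if (!has_avx)
        break;
    enable_modern_features:
      /* Shared path: rep-string/unaligned-load style preferences.  */
      break;

    case UARCH_NEHALEM:
      divisor = 8;      /* Older core: lower non-temporal threshold.  */
      goto enable_modern_features;

    case UARCH_BROADWELL:
      divisor = 8;
      goto default_tuning;

    case UARCH_SKYLAKE:
      divisor = 2;      /* Newer core: higher non-temporal threshold.  */
      goto default_tuning;
    }
  return divisor;
}

int
main (void)
{
  printf ("SKL: %lu, BDW: %lu, other: %lu\n",
          pick_divisor (UARCH_SKYLAKE, 1), pick_divisor (UARCH_BROADWELL, 1),
          pick_divisor (UARCH_OTHER, 1));
  return 0;
}

The benefit over plain fall-through is that every case can contribute its
own divisor before converging on one of the two common tuning paths.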

sysdeps/x86/dl-cacheinfo.h

@@ -738,19 +738,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   cpu_features->level3_cache_linesize = level3_cache_linesize;
   cpu_features->level4_cache_size = level4_cache_size;
 
-  /* The default setting for the non_temporal threshold is 1/4 of size
-     of the chip's cache. For most Intel and AMD processors with an
-     initial release date between 2017 and 2023, a thread's typical
-     share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
-     estimate the point where non-temporal stores begin out-competing
-     REP MOVSB. As well the point where the fact that non-temporal
-     stores are forced back to main memory would already occurred to the
-     majority of the lines in the copy. Note, concerns about the
-     entire L3 cache being evicted by the copy are mostly alleviated
-     by the fact that modern HW detects streaming patterns and
-     provides proper LRU hints so that the maximum thrashing
-     capped at 1/associativity. */
-  unsigned long int non_temporal_threshold = shared / 4;
+  unsigned long int cachesize_non_temporal_divisor
+      = cpu_features->cachesize_non_temporal_divisor;
+  if (cachesize_non_temporal_divisor <= 0)
+    cachesize_non_temporal_divisor = 4;
+
+  /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
+     size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
+     which is microarch specific; the default is 1/4).  For most Intel and AMD
+     processors with an initial release date between 2017 and 2023, a thread's
+     typical share of the cache is from 18-64MB.  Using a reasonable fraction
+     of L3 is meant to estimate the point where non-temporal stores begin
+     out-competing REP MOVSB, as well as the point where write-back of the
+     non-temporal stores to main memory would have already occurred for the
+     majority of the lines in the copy.  Note, concerns about the entire L3
+     cache being evicted by the copy are mostly alleviated by the fact that
+     modern HW detects streaming patterns and provides proper LRU hints so
+     that the maximum thrashing is capped at 1/associativity.  */
+  unsigned long int non_temporal_threshold
+      = shared / cachesize_non_temporal_divisor;
 
   /* If no ERMS, we use the per-thread L3 chunking.  Normal cacheable stores run
      a higher risk of actually thrashing the cache as they don't have a HW LRU
     hint.  As well, their performance in highly parallel situations is
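
Two details worth noting here: cachesize_non_temporal_divisor is an
unsigned long int, so the <= 0 guard effectively checks for 0, i.e. a
field that was never initialized; and the divisor only shapes the default,
since a user-specified non_temporal_threshold takes precedence (see the
struct comment in the last hunk). A condensed, compilable model of the
resulting logic (assumed struct and names, not the glibc sources):

struct model_features
{
  unsigned long int cachesize_non_temporal_divisor;
};

static unsigned long int
model_nt_threshold (const struct model_features *cf,
                    unsigned long int shared)
{
  unsigned long int divisor = cf->cachesize_non_temporal_divisor;
  if (divisor == 0)     /* Field never set: keep the historical 1/4.  */
    divisor = 4;
  return shared / divisor;
}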

sysdeps/x86/dl-diagnostics-cpu.c

@@ -113,8 +113,11 @@ _dl_diagnostics_cpu (void)
                             cpu_features->level3_cache_linesize);
   print_cpu_features_value ("level4_cache_size",
                             cpu_features->level4_cache_size);
-  _Static_assert (offsetof (struct cpu_features, level4_cache_size)
-                  + sizeof (cpu_features->level4_cache_size)
+  print_cpu_features_value ("cachesize_non_temporal_divisor",
+                            cpu_features->cachesize_non_temporal_divisor);
+  _Static_assert (
+      offsetof (struct cpu_features, cachesize_non_temporal_divisor)
+          + sizeof (cpu_features->cachesize_non_temporal_divisor)
                   == sizeof (*cpu_features),
                   "last cpu_features field has been printed");
 }
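
The _Static_assert is a compile-time tripwire: it only holds while
cachesize_non_temporal_divisor is the final member of struct cpu_features,
so appending a new field without extending this printer breaks the build.
The idiom in isolation (illustrative two-member struct, not glibc's real
layout):

#include <stddef.h>

struct features
{
  unsigned long int level4_cache_size;
  unsigned long int cachesize_non_temporal_divisor;     /* Keep last.  */
};

/* Fails to compile if a member is appended without updating the
   diagnostics printer.  */
_Static_assert (offsetof (struct features, cachesize_non_temporal_divisor)
                + sizeof (unsigned long int) == sizeof (struct features),
                "last features field has been printed");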

sysdeps/x86/include/cpu-features.h

@@ -945,6 +945,9 @@ struct cpu_features
   unsigned long int level3_cache_linesize;
   /* /_SC_LEVEL4_CACHE_SIZE.  */
   unsigned long int level4_cache_size;
+  /* When no user non_temporal_threshold is specified, we default to
+     cachesize / cachesize_non_temporal_divisor.  */
+  unsigned long int cachesize_non_temporal_divisor;
 };
 
 /* Get a pointer to the CPU features structure.  */