mirror of
https://sourceware.org/git/glibc.git
synced 2025-08-05 19:35:52 +03:00
x86: Make the divisor in setting non_temporal_threshold
cpu specific
Different systems prefer different divisors. From benchmarks[1] so far the following divisors have been found: ICX : 2 SKX : 2 BWD : 8 For Intel, we are generalizing that BWD and older prefers 8 as a divisor, and SKL and newer prefers 2. This number can be further tuned as benchmarks are run. [1]: https://github.com/goldsteinn/memcpy-nt-benchmarks Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
@@ -636,6 +636,7 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|||||||
unsigned int stepping = 0;
|
unsigned int stepping = 0;
|
||||||
enum cpu_features_kind kind;
|
enum cpu_features_kind kind;
|
||||||
|
|
||||||
|
cpu_features->cachesize_non_temporal_divisor = 4;
|
||||||
#if !HAS_CPUID
|
#if !HAS_CPUID
|
||||||
if (__get_cpuid_max (0, 0) == 0)
|
if (__get_cpuid_max (0, 0) == 0)
|
||||||
{
|
{
|
||||||
@@ -716,13 +717,13 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|||||||
|
|
||||||
/* Bigcore/Default Tuning. */
|
/* Bigcore/Default Tuning. */
|
||||||
default:
|
default:
|
||||||
|
default_tuning:
|
||||||
/* Unknown family 0x06 processors. Assuming this is one
|
/* Unknown family 0x06 processors. Assuming this is one
|
||||||
of Core i3/i5/i7 processors if AVX is available. */
|
of Core i3/i5/i7 processors if AVX is available. */
|
||||||
if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
|
if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
|
||||||
break;
|
break;
|
||||||
/* Fall through. */
|
|
||||||
case INTEL_BIGCORE_NEHALEM:
|
enable_modern_features:
|
||||||
case INTEL_BIGCORE_WESTMERE:
|
|
||||||
/* Rep string instructions, unaligned load, unaligned copy,
|
/* Rep string instructions, unaligned load, unaligned copy,
|
||||||
and pminub are fast on Intel Core i3, i5 and i7. */
|
and pminub are fast on Intel Core i3, i5 and i7. */
|
||||||
cpu_features->preferred[index_arch_Fast_Rep_String]
|
cpu_features->preferred[index_arch_Fast_Rep_String]
|
||||||
@@ -732,12 +733,23 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|||||||
| bit_arch_Prefer_PMINUB_for_stringop);
|
| bit_arch_Prefer_PMINUB_for_stringop);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/*
|
case INTEL_BIGCORE_NEHALEM:
|
||||||
Default tuned Bigcore microarch.
|
case INTEL_BIGCORE_WESTMERE:
|
||||||
|
/* Older CPUs prefer non-temporal stores at lower threshold. */
|
||||||
|
cpu_features->cachesize_non_temporal_divisor = 8;
|
||||||
|
goto enable_modern_features;
|
||||||
|
|
||||||
|
/* Older Bigcore microarch (smaller non-temporal store
|
||||||
|
threshold). */
|
||||||
case INTEL_BIGCORE_SANDYBRIDGE:
|
case INTEL_BIGCORE_SANDYBRIDGE:
|
||||||
case INTEL_BIGCORE_IVYBRIDGE:
|
case INTEL_BIGCORE_IVYBRIDGE:
|
||||||
case INTEL_BIGCORE_HASWELL:
|
case INTEL_BIGCORE_HASWELL:
|
||||||
case INTEL_BIGCORE_BROADWELL:
|
case INTEL_BIGCORE_BROADWELL:
|
||||||
|
cpu_features->cachesize_non_temporal_divisor = 8;
|
||||||
|
goto default_tuning;
|
||||||
|
|
||||||
|
/* Newer Bigcore microarch (larger non-temporal store
|
||||||
|
threshold). */
|
||||||
case INTEL_BIGCORE_SKYLAKE:
|
case INTEL_BIGCORE_SKYLAKE:
|
||||||
case INTEL_BIGCORE_KABYLAKE:
|
case INTEL_BIGCORE_KABYLAKE:
|
||||||
case INTEL_BIGCORE_COMETLAKE:
|
case INTEL_BIGCORE_COMETLAKE:
|
||||||
@@ -753,13 +765,14 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|||||||
case INTEL_BIGCORE_SAPPHIRERAPIDS:
|
case INTEL_BIGCORE_SAPPHIRERAPIDS:
|
||||||
case INTEL_BIGCORE_EMERALDRAPIDS:
|
case INTEL_BIGCORE_EMERALDRAPIDS:
|
||||||
case INTEL_BIGCORE_GRANITERAPIDS:
|
case INTEL_BIGCORE_GRANITERAPIDS:
|
||||||
*/
|
cpu_features->cachesize_non_temporal_divisor = 2;
|
||||||
|
goto default_tuning;
|
||||||
|
|
||||||
/*
|
/* Default tuned Mixed (bigcore + atom SOC). */
|
||||||
Default tuned Mixed (bigcore + atom SOC).
|
|
||||||
case INTEL_MIXED_LAKEFIELD:
|
case INTEL_MIXED_LAKEFIELD:
|
||||||
case INTEL_MIXED_ALDERLAKE:
|
case INTEL_MIXED_ALDERLAKE:
|
||||||
*/
|
cpu_features->cachesize_non_temporal_divisor = 2;
|
||||||
|
goto default_tuning;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Disable TSX on some processors to avoid TSX on kernels that
|
/* Disable TSX on some processors to avoid TSX on kernels that
|
||||||
|
@@ -738,19 +738,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
|||||||
cpu_features->level3_cache_linesize = level3_cache_linesize;
|
cpu_features->level3_cache_linesize = level3_cache_linesize;
|
||||||
cpu_features->level4_cache_size = level4_cache_size;
|
cpu_features->level4_cache_size = level4_cache_size;
|
||||||
|
|
||||||
/* The default setting for the non_temporal threshold is 1/4 of size
|
unsigned long int cachesize_non_temporal_divisor
|
||||||
of the chip's cache. For most Intel and AMD processors with an
|
= cpu_features->cachesize_non_temporal_divisor;
|
||||||
initial release date between 2017 and 2023, a thread's typical
|
if (cachesize_non_temporal_divisor <= 0)
|
||||||
share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
|
cachesize_non_temporal_divisor = 4;
|
||||||
estimate the point where non-temporal stores begin out-competing
|
|
||||||
REP MOVSB. As well the point where the fact that non-temporal
|
/* The default setting for the non_temporal threshold is [1/8, 1/2] of size
|
||||||
stores are forced back to main memory would already occurred to the
|
of the chip's cache (depending on `cachesize_non_temporal_divisor` which
|
||||||
majority of the lines in the copy. Note, concerns about the
|
is microarch specific. The default is 1/4). For most Intel and AMD
|
||||||
entire L3 cache being evicted by the copy are mostly alleviated
|
processors with an initial release date between 2017 and 2023, a thread's
|
||||||
by the fact that modern HW detects streaming patterns and
|
typical share of the cache is from 18-64MB. Using a reasonable size
|
||||||
provides proper LRU hints so that the maximum thrashing
|
fraction of L3 is meant to estimate the point where non-temporal stores
|
||||||
capped at 1/associativity. */
|
begin out-competing REP MOVSB. As well the point where the fact that
|
||||||
unsigned long int non_temporal_threshold = shared / 4;
|
non-temporal stores are forced back to main memory would already occurred
|
||||||
|
to the majority of the lines in the copy. Note, concerns about the entire
|
||||||
|
L3 cache being evicted by the copy are mostly alleviated by the fact that
|
||||||
|
modern HW detects streaming patterns and provides proper LRU hints so that
|
||||||
|
the maximum thrashing capped at 1/associativity. */
|
||||||
|
unsigned long int non_temporal_threshold
|
||||||
|
= shared / cachesize_non_temporal_divisor;
|
||||||
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
|
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
|
||||||
a higher risk of actually thrashing the cache as they don't have a HW LRU
|
a higher risk of actually thrashing the cache as they don't have a HW LRU
|
||||||
hint. As well, their performance in highly parallel situations is
|
hint. As well, their performance in highly parallel situations is
|
||||||
|
@@ -113,8 +113,11 @@ _dl_diagnostics_cpu (void)
|
|||||||
cpu_features->level3_cache_linesize);
|
cpu_features->level3_cache_linesize);
|
||||||
print_cpu_features_value ("level4_cache_size",
|
print_cpu_features_value ("level4_cache_size",
|
||||||
cpu_features->level4_cache_size);
|
cpu_features->level4_cache_size);
|
||||||
_Static_assert (offsetof (struct cpu_features, level4_cache_size)
|
print_cpu_features_value ("cachesize_non_temporal_divisor",
|
||||||
+ sizeof (cpu_features->level4_cache_size)
|
cpu_features->cachesize_non_temporal_divisor);
|
||||||
== sizeof (*cpu_features),
|
_Static_assert (
|
||||||
"last cpu_features field has been printed");
|
offsetof (struct cpu_features, cachesize_non_temporal_divisor)
|
||||||
|
+ sizeof (cpu_features->cachesize_non_temporal_divisor)
|
||||||
|
== sizeof (*cpu_features),
|
||||||
|
"last cpu_features field has been printed");
|
||||||
}
|
}
|
||||||
|
@@ -945,6 +945,9 @@ struct cpu_features
|
|||||||
unsigned long int level3_cache_linesize;
|
unsigned long int level3_cache_linesize;
|
||||||
/* /_SC_LEVEL4_CACHE_SIZE. */
|
/* /_SC_LEVEL4_CACHE_SIZE. */
|
||||||
unsigned long int level4_cache_size;
|
unsigned long int level4_cache_size;
|
||||||
|
/* When no user non_temporal_threshold is specified. We default to
|
||||||
|
cachesize / cachesize_non_temporal_divisor. */
|
||||||
|
unsigned long int cachesize_non_temporal_divisor;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Get a pointer to the CPU features structure. */
|
/* Get a pointer to the CPU features structure. */
|
||||||
|
Reference in New Issue
Block a user