1
0
mirror of https://sourceware.org/git/glibc.git synced 2025-08-01 10:06:57 +03:00

x86: Make the divisor in setting non_temporal_threshold cpu specific

Different systems prefer different divisors.

From benchmarks[1] so far the following divisors have been found:
    ICX     : 2
    SKX     : 2
    BWD     : 8

For Intel, we are generalizing that BWD and older prefer 8 as a
divisor, and SKL and newer prefer 2. These values can be further tuned
as more benchmarks are run.

[1]: https://github.com/goldsteinn/memcpy-nt-benchmarks
Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
Noah Goldstein
2023-06-07 13:18:03 -05:00
parent f193ea20ed
commit 180897c161
4 changed files with 51 additions and 26 deletions

View File

@ -636,6 +636,7 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
{
@ -716,13 +717,13 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Bigcore/Default Tuning. */
default:
default_tuning:
/* Unknown family 0x06 processors. Assuming this is one
of Core i3/i5/i7 processors if AVX is available. */
if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
break;
/* Fall through. */
case INTEL_BIGCORE_NEHALEM:
case INTEL_BIGCORE_WESTMERE:
enable_modern_features:
/* Rep string instructions, unaligned load, unaligned copy,
and pminub are fast on Intel Core i3, i5 and i7. */
cpu_features->preferred[index_arch_Fast_Rep_String]
@ -732,12 +733,23 @@ init_cpu_features (struct cpu_features *cpu_features)
| bit_arch_Prefer_PMINUB_for_stringop);
break;
/*
Default tuned Bigcore microarch.
case INTEL_BIGCORE_NEHALEM:
case INTEL_BIGCORE_WESTMERE:
/* Older CPUs prefer non-temporal stores at lower threshold. */
cpu_features->cachesize_non_temporal_divisor = 8;
goto enable_modern_features;
/* Older Bigcore microarch (smaller non-temporal store
threshold). */
case INTEL_BIGCORE_SANDYBRIDGE:
case INTEL_BIGCORE_IVYBRIDGE:
case INTEL_BIGCORE_HASWELL:
case INTEL_BIGCORE_BROADWELL:
cpu_features->cachesize_non_temporal_divisor = 8;
goto default_tuning;
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
case INTEL_BIGCORE_SKYLAKE:
case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_COMETLAKE:
@ -753,13 +765,14 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_SAPPHIRERAPIDS:
case INTEL_BIGCORE_EMERALDRAPIDS:
case INTEL_BIGCORE_GRANITERAPIDS:
*/
cpu_features->cachesize_non_temporal_divisor = 2;
goto default_tuning;
/*
Default tuned Mixed (bigcore + atom SOC).
/* Default tuned Mixed (bigcore + atom SOC). */
case INTEL_MIXED_LAKEFIELD:
case INTEL_MIXED_ALDERLAKE:
*/
cpu_features->cachesize_non_temporal_divisor = 2;
goto default_tuning;
}
/* Disable TSX on some processors to avoid TSX on kernels that