Skip to content

Commit 0f8afd2

Browse files
Add an undocumented switch to allow controlling the preferred vector width emitted implicitly by the JIT (#86457)
* Add an undocumented switch to allow controlling the preferred vector width emitted implicitly by the JIT * Resolving issues and responding to PR feedback * Simplifying the xarch cpu info check
1 parent f73356c commit 0f8afd2

9 files changed

Lines changed: 153 additions & 37 deletions

File tree

docs/design/coreclr/botr/vectors-and-intrinsics.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,4 +170,4 @@ While the above api exists, it is not expected that general purpose code within
170170
|`compOpportunisticallyDependsOn(isa)`| Use when making an opportunistic decision to use or not use an instruction set. Use when the instruction set usage is a "nice to have optimization opportunity", but do not use when a false result may change the semantics of the program. Should never be used in an assert. | Return whether or not an instruction set is supported. Calls notifyInstructionSetUsage if the instruction set is supported.
171171
|`compIsaSupportedDebugOnly(isa)` | Use to assert whether or not an instruction set is supported | Return whether or not an instruction set is supported. Does not report anything. Only available in debug builds.
172172
|`getSIMDVectorRegisterByteLength()` | Use to get the size of a `Vector<T>` value. | Determine the size of the `Vector<T>` type. On some architectures this size may vary depending on architecture-specific rules. Use `compExactlyDependsOn` to perform the queries so that the size is consistent between compile time and runtime.
173-
|`maxSIMDStructBytes()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded.
173+
|`getMaxVectorByteLength()`| Get the maximum number of bytes that might be used in a SIMD type during this compilation. | Query the set of instruction sets supported, and determine the largest simd type supported. Use `compOpportunisticallyDependsOn` to perform the queries so that the maximum size needed is the only one recorded.

src/coreclr/inc/corjitflags.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@ class CORJIT_FLAGS
8787

8888
#if defined(TARGET_ARM)
8989
CORJIT_FLAG_SOFTFP_ABI = 43, // On ARM should enable armel calling convention
90-
#else // !defined(TARGET_ARM)
90+
#elif defined(TARGET_X86) || defined(TARGET_AMD64)
91+
CORJIT_FLAG_VECTOR512_THROTTLING = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling
92+
#else
9193
CORJIT_FLAG_UNUSED16 = 43,
9294
#endif // !defined(TARGET_ARM)
9395

src/coreclr/inc/jiteeversionguid.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
4343
#define GUID_DEFINED
4444
#endif // !GUID_DEFINED
4545

46-
constexpr GUID JITEEVersionIdentifier = { /* f63c2964-bae9-448f-baaf-9c9f2d4292f2 */
47-
0xf63c2964,
48-
0xbae9,
49-
0x448f,
50-
{0xba, 0xaf, 0x9c, 0x9f, 0x2d, 0x42, 0x92, 0xf2}
46+
constexpr GUID JITEEVersionIdentifier = { /* c540b287-0d17-4fc0-bac8-abd055acccb8 */
47+
0xc540b287,
48+
0x0d17,
49+
0x4fc0,
50+
{0xba, 0xc8, 0xab, 0xd0, 0x55, 0xac, 0xcc, 0xb8}
5151
};
5252

5353
//////////////////////////////////////////////////////////////////////////////////////////////////////////

src/coreclr/jit/compiler.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2267,6 +2267,10 @@ void Compiler::compSetProcessor()
22672267
// don't actually exist. The JIT is in charge of adding those and ensuring
22682268
// the total sum of flags is still valid.
22692269
#if defined(TARGET_XARCH)
2270+
// Get the preferred vector bitwidth, rounding down to the nearest multiple of 128-bits
2271+
uint32_t preferredVectorBitWidth = (JitConfig.PreferredVectorBitWidth() / 128) * 128;
2272+
uint32_t preferredVectorByteLength = preferredVectorBitWidth / 8;
2273+
22702274
if (instructionSetFlags.HasInstructionSet(InstructionSet_SSE))
22712275
{
22722276
instructionSetFlags.AddInstructionSet(InstructionSet_Vector128);
@@ -2294,6 +2298,17 @@ void Compiler::compSetProcessor()
22942298
assert(instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F_VL));
22952299

22962300
instructionSetFlags.AddInstructionSet(InstructionSet_Vector512);
2301+
2302+
if ((preferredVectorByteLength == 0) && jitFlags.IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING))
2303+
{
2304+
// Some architectures can experience frequency throttling when
2305+
// executing 512-bit width instructions. To account for this we set the
2306+
// default preferred vector width to 256-bits in some scenarios. Power
2307+
// users can override this with `DOTNET_PreferredVectorBitWidth=512` to
2308+
// allow using such instructions where hardware support is available.
2309+
// The value stored is a length in bytes, so 256 bits is written as 256 / 8.
2310+
preferredVectorByteLength = 256 / 8;
2311+
}
22972312
}
22982313
else
22992314
{
@@ -2321,6 +2336,8 @@ void Compiler::compSetProcessor()
23212336
instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512VBMI_VL_X64);
23222337
#endif // TARGET_AMD64
23232338
}
2339+
2340+
opts.preferredVectorByteLength = preferredVectorByteLength;
23242341
#elif defined(TARGET_ARM64)
23252342
if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd))
23262343
{

src/coreclr/jit/compiler.h

Lines changed: 51 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8664,14 +8664,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
86648664

86658665
// The minimum and maximum possible number of bytes in a SIMD vector.
86668666

8667-
// maxSIMDStructBytes
8667+
// getMaxVectorByteLength
86688668
// The maximum SIMD size supported by System.Numerics.Vectors or System.Runtime.Intrinsics
86698669
// Arm.AdvSimd: 16-byte Vector<T> and Vector128<T>
86708670
// X86.SSE: 16-byte Vector<T> and Vector128<T>
86718671
// X86.AVX: 16-byte Vector<T> and Vector256<T>
86728672
// X86.AVX2: 32-byte Vector<T> and Vector256<T>
86738673
// X86.AVX512F: 32-byte Vector<T> and Vector512<T>
8674-
unsigned int maxSIMDStructBytes() const
8674+
uint32_t getMaxVectorByteLength() const
86758675
{
86768676
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
86778677
if (compOpportunisticallyDependsOn(InstructionSet_AVX))
@@ -8692,11 +8692,28 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
86928692
#elif defined(TARGET_ARM64)
86938693
return FP_REGSIZE_BYTES;
86948694
#else
8695-
assert(!"maxSIMDStructBytes() unimplemented on target arch");
8695+
assert(!"getMaxVectorByteLength() unimplemented on target arch");
86968696
unreached();
86978697
#endif
86988698
}
86998699

8700+
//------------------------------------------------------------------------
8701+
// getPreferredVectorByteLength: Gets the preferred length, in bytes, to use for vectorization
8702+
//
// Return Value:
//    When FEATURE_HW_INTRINSICS and TARGET_XARCH are both defined and
//    opts.preferredVectorByteLength is non-zero, the smaller of that value
//    and getMaxVectorByteLength(). In every other case the result is
//    getMaxVectorByteLength().
//
8703+
uint32_t getPreferredVectorByteLength() const
8704+
{
8705+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
8706+
uint32_t preferredVectorByteLength = opts.preferredVectorByteLength;
8707+
// Zero means no preference was recorded; a non-zero preference is clamped to the maximum.
8708+
if (preferredVectorByteLength != 0)
8709+
{
8710+
return min(getMaxVectorByteLength(), preferredVectorByteLength);
8711+
}
8712+
#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH
8713+
// No preference applies, so fall back to the maximum vector length.
8714+
return getMaxVectorByteLength();
8715+
}
8716+
87008717
//------------------------------------------------------------------------
87018718
// roundUpSIMDSize: rounds the given size up to the nearest SIMD size
87028719
// available on the target. Examples on XARCH:
@@ -8712,22 +8729,25 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
87128729
// It's only supposed to be used for scenarios where we can
87138730
// perform an overlapped load/store.
87148731
//
8715-
unsigned int roundUpSIMDSize(unsigned size)
8732+
uint32_t roundUpSIMDSize(unsigned size)
87168733
{
87178734
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
8718-
unsigned maxSimdSize = maxSIMDStructBytes();
8719-
assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
8720-
if (size <= XMM_REGSIZE_BYTES && maxSimdSize > XMM_REGSIZE_BYTES)
8735+
uint32_t maxSize = getPreferredVectorByteLength();
8736+
assert(maxSize <= ZMM_REGSIZE_BYTES);
8737+
8738+
if ((size <= XMM_REGSIZE_BYTES) && (maxSize > XMM_REGSIZE_BYTES))
87218739
{
87228740
return XMM_REGSIZE_BYTES;
87238741
}
8724-
if (size <= YMM_REGSIZE_BYTES && maxSimdSize > YMM_REGSIZE_BYTES)
8742+
8743+
if ((size <= YMM_REGSIZE_BYTES) && (maxSize > YMM_REGSIZE_BYTES))
87258744
{
87268745
return YMM_REGSIZE_BYTES;
87278746
}
8728-
return maxSimdSize;
8747+
8748+
return maxSize;
87298749
#elif defined(TARGET_ARM64)
8730-
assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
8750+
assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES);
87318751
return FP_REGSIZE_BYTES;
87328752
#else
87338753
assert(!"roundUpSIMDSize() unimplemented on target arch");
@@ -8747,33 +8767,36 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
87478767
// Arguments:
87488768
// size - size of the data to process with SIMD
87498769
//
8750-
unsigned int roundDownSIMDSize(unsigned size)
8770+
uint32_t roundDownSIMDSize(unsigned size)
87518771
{
87528772
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
8753-
unsigned maxSimdSize = maxSIMDStructBytes();
8754-
assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
8755-
if (size >= maxSimdSize)
8773+
uint32_t maxSize = getPreferredVectorByteLength();
8774+
assert(maxSize <= ZMM_REGSIZE_BYTES);
8775+
8776+
if (size >= maxSize)
87568777
{
87578778
// Size is bigger than max SIMD size the current target supports
8758-
return maxSimdSize;
8779+
return maxSize;
87598780
}
8760-
if (size >= YMM_REGSIZE_BYTES && maxSimdSize >= YMM_REGSIZE_BYTES)
8781+
8782+
if ((size >= YMM_REGSIZE_BYTES) && (maxSize >= YMM_REGSIZE_BYTES))
87618783
{
87628784
// Size is >= YMM but not enough for ZMM -> YMM
87638785
return YMM_REGSIZE_BYTES;
87648786
}
8787+
87658788
// Return 0 if size is even less than XMM, otherwise - XMM
8766-
return size >= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : 0;
8789+
return (size >= XMM_REGSIZE_BYTES) ? XMM_REGSIZE_BYTES : 0;
87678790
#elif defined(TARGET_ARM64)
8768-
assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
8769-
return size >= FP_REGSIZE_BYTES ? FP_REGSIZE_BYTES : 0;
8791+
assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES);
8792+
return (size >= FP_REGSIZE_BYTES) ? FP_REGSIZE_BYTES : 0;
87708793
#else
87718794
assert(!"roundDownSIMDSize() unimplemented on target arch");
87728795
unreached();
87738796
#endif
87748797
}
87758798

8776-
unsigned int minSIMDStructBytes()
8799+
uint32_t getMinVectorByteLength()
87778800
{
87788801
return emitTypeSize(TYP_SIMD8);
87798802
}
@@ -8856,8 +8879,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
88568879
#if defined(FEATURE_SIMD)
88578880
if (canUseSimd)
88588881
{
8859-
maxRegSize = maxSIMDStructBytes();
8882+
maxRegSize = getPreferredVectorByteLength();
8883+
88608884
#if defined(TARGET_XARCH)
8885+
assert(maxRegSize <= ZMM_REGSIZE_BYTES);
88618886
threshold = maxRegSize;
88628887
#elif defined(TARGET_ARM64)
88638888
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
@@ -8915,7 +8940,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89158940
bool structSizeMightRepresentSIMDType(size_t structSize)
89168941
{
89178942
#ifdef FEATURE_SIMD
8918-
return (structSize >= minSIMDStructBytes()) && (structSize <= maxSIMDStructBytes());
8943+
return (structSize >= getMinVectorByteLength()) && (structSize <= getMaxVectorByteLength());
89198944
#else
89208945
return false;
89218946
#endif // FEATURE_SIMD
@@ -9241,6 +9266,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
92419266

92429267
codeOptimize compCodeOpt; // what type of code optimizations
92439268

9269+
#if defined(TARGET_XARCH)
9270+
uint32_t preferredVectorByteLength;
9271+
#endif // TARGET_XARCH
9272+
92449273
// optimize maximally and/or favor speed over size?
92459274

92469275
#define DEFAULT_MIN_OPTS_CODE_SIZE 60000

src/coreclr/jit/jitconfigvalues.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,10 +297,18 @@ CONFIG_INTEGER(JitStressEvexEncoding, W("JitStressEvexEncoding"), 0) // Enable E
297297

298298
// clang-format off
299299

300+
CONFIG_INTEGER(PreferredVectorBitWidth, W("PreferredVectorBitWidth"), 0) // The preferred width, in bits, to use for any implicit vectorization emitted. A value less than 128 is treated as the system default.
301+
300302
//
301303
// Hardware Intrinsic ISAs; keep in sync with clrconfigvalues.h
302304
//
303-
CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 1) // Allows Base+ hardware intrinsics to be disabled
305+
#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
306+
//TODO: should implement LoongArch64's features.
307+
//TODO-RISCV64-CQ: should implement RISCV64's features.
308+
CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 0) // Allows Base+ hardware intrinsics to be disabled
309+
#else
310+
CONFIG_INTEGER(EnableHWIntrinsic, W("EnableHWIntrinsic"), 1) // Allows Base+ hardware intrinsics to be disabled
311+
#endif // defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
304312

305313
#if defined(TARGET_AMD64) || defined(TARGET_X86)
306314
CONFIG_INTEGER(EnableAES, W("EnableAES"), 1) // Allows AES+ hardware intrinsics to be disabled

src/coreclr/jit/jitee.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,11 @@ class JitFlags
7777

7878
#if defined(TARGET_ARM)
7979
JIT_FLAG_SOFTFP_ABI = 43, // On ARM should enable armel calling convention
80-
#else // !defined(TARGET_ARM)
80+
#elif defined(TARGET_XARCH)
81+
JIT_FLAG_VECTOR512_THROTTLING = 43, // On Xarch, 512-bit vector usage may incur CPU frequency throttling
82+
#else
8183
JIT_FLAG_UNUSED16 = 43,
82-
#endif // !defined(TARGET_ARM)
84+
#endif
8385

8486
JIT_FLAG_UNUSED17 = 44,
8587
JIT_FLAG_UNUSED18 = 45,

src/coreclr/jit/lclvars.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1756,8 +1756,8 @@ bool Compiler::StructPromotionHelper::CanPromoteStructType(CORINFO_CLASS_HANDLE
17561756
structPromotionInfo = lvaStructPromotionInfo(typeHnd);
17571757

17581758
#if defined(FEATURE_SIMD)
1759-
// maxSIMDStructBytes() represents the size of the largest primitive type that we can struct promote.
1760-
const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->maxSIMDStructBytes();
1759+
// getMaxVectorByteLength() represents the size of the largest primitive type that we can struct promote.
1760+
const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * compiler->getMaxVectorByteLength();
17611761
#else // !FEATURE_SIMD
17621762
// sizeof(double) represents the size of the largest primitive type that we can struct promote.
17631763
const unsigned maxSize = MAX_NumOfFieldsInPromotableStruct * sizeof(double);

src/coreclr/vm/codeman.cpp

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,6 +1423,22 @@ void EEJitManager::SetCpuInfo()
14231423
// LZCNT - ECX bit 5
14241424
// synchronously updating VM and JIT.
14251425

1426+
// Views a single 32-bit value either as a whole (Value) or as named bit-fields.
// SetCpuInfo assigns Value from the EAX result of __cpuid leaf 0x00000001 and
// later reads FamilyId, ExtendedModelId and Model to identify the processor.
// NOTE(review): bit-field allocation order is implementation-defined; this
// presumably relies on the first field occupying the lowest bits - confirm
// for every supported compiler.
union XarchCpuInfo
1427+
{
1428+
struct {
1429+
uint32_t SteppingId : 4;
1430+
uint32_t Model : 4;
1431+
uint32_t FamilyId : 4;
1432+
uint32_t ProcessorType : 2;
1433+
uint32_t Reserved1 : 2; // Unused bits in the CPUID result
1434+
uint32_t ExtendedModelId : 4;
1435+
uint32_t ExtendedFamilyId : 8;
1436+
uint32_t Reserved : 4; // Unused bits in the CPUID result
1437+
};
1438+
1439+
uint32_t Value; // The raw 32-bit value that the bit-fields above overlay
1440+
} xarchCpuInfo;
1441+
14261442
int cpuidInfo[4];
14271443

14281444
const int CPUID_EAX = 0;
@@ -1431,13 +1447,19 @@ void EEJitManager::SetCpuInfo()
14311447
const int CPUID_EDX = 3;
14321448

14331449
__cpuid(cpuidInfo, 0x00000000);
1450+
14341451
uint32_t maxCpuId = static_cast<uint32_t>(cpuidInfo[CPUID_EAX]);
14351452
_ASSERTE(maxCpuId >= 1);
14361453

1437-
__cpuid(cpuidInfo, 0x00000001);
1454+
bool isGenuineIntel = (cpuidInfo[CPUID_EBX] == 0x756E6547) && // Genu
1455+
(cpuidInfo[CPUID_EDX] == 0x49656E69) && // ineI
1456+
(cpuidInfo[CPUID_ECX] == 0x6C65746E); // ntel
14381457

1458+
__cpuid(cpuidInfo, 0x00000001);
14391459
_ASSERTE((cpuidInfo[CPUID_EDX] & (1 << 15)) != 0); // CMOV
14401460

1461+
xarchCpuInfo.Value = cpuidInfo[CPUID_EAX];
1462+
14411463
#if defined(TARGET_X86) && !defined(TARGET_WINDOWS)
14421464
// Linux may still support no SSE/SSE2 for 32-bit
14431465
if ((cpuidInfo[CPUID_EDX] & (1 << 25)) != 0)
@@ -1695,7 +1717,7 @@ void EEJitManager::SetCpuInfo()
16951717
// Now that we've queried the actual hardware support, we need to adjust what is actually supported based
16961718
// on some externally available config switches that exist so users can test code for downlevel hardware.
16971719

1698-
#if defined(TARGET_AMD64) || defined(TARGET_X86)
1720+
#if defined(TARGET_X86) || defined(TARGET_AMD64)
16991721
if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableHWIntrinsic))
17001722
{
17011723
CPUCompileFlags.Clear(InstructionSet_X86Base);
@@ -1818,7 +1840,8 @@ void EEJitManager::SetCpuInfo()
18181840

18191841
// We need to additionally check that EXTERNAL_EnableSSE3_4 is set, as that
18201842
// is a preexisting config flag that controls the SSE3+ ISAs
1821-
if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) || !CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4))
1843+
if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3) ||
1844+
!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE3_4))
18221845
{
18231846
CPUCompileFlags.Clear(InstructionSet_SSE3);
18241847
}
@@ -1911,6 +1934,41 @@ void EEJitManager::SetCpuInfo()
19111934
CPUCompileFlags.Set64BitInstructionSetVariants();
19121935
CPUCompileFlags.EnsureValidInstructionSetSupport();
19131936

1937+
#if defined(TARGET_X86) || defined(TARGET_AMD64)
1938+
if (isGenuineIntel)
1939+
{
1940+
// Some architectures can experience frequency throttling when executing
1941+
// 512-bit width instructions. To account for this we set the
1942+
// default preferred vector width to 256-bits in some scenarios. Power
1943+
// users can override this with `DOTNET_PreferredVectorBitWidth=512` to
1944+
// allow using such instructions where hardware support is available.
1945+
// This only sets a flag; Compiler::compSetProcessor applies it when no preferred width is configured.
1946+
if (xarchCpuInfo.FamilyId == 0x06)
1947+
{
1948+
if (xarchCpuInfo.ExtendedModelId == 0x05)
1949+
{
1950+
if (xarchCpuInfo.Model == 0x05)
1951+
{
1952+
// * Skylake (Server)
1953+
// * Cascade Lake
1954+
// * Cooper Lake
1955+
1956+
CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING);
1957+
}
1958+
}
1959+
else if (xarchCpuInfo.ExtendedModelId == 0x06)
1960+
{
1961+
if (xarchCpuInfo.Model == 0x06)
1962+
{
1963+
// * Cannon Lake
1964+
1965+
CPUCompileFlags.Set(CORJIT_FLAGS::CORJIT_FLAG_VECTOR512_THROTTLING);
1966+
}
1967+
}
1968+
}
1969+
}
1970+
#endif // TARGET_X86 || TARGET_AMD64
1971+
19141972
m_CPUCompileFlags = CPUCompileFlags;
19151973
}
19161974

0 commit comments

Comments
 (0)