Skip to content

Add AVX512 BMM API #124804

Draft
alexcovington wants to merge 8 commits into dotnet:main from
alexcovington:avx512bmm
Draft

Add AVX512 BMM API #124804
alexcovington wants to merge 8 commits into dotnet:main from
alexcovington:avx512bmm

Conversation

@alexcovington
Copy link
Contributor

This PR implements the AVX512 BMM API.

namespace System.Runtime.Intrinsics.X86
{
    /// <summary>
    /// Proposed API surface for the AVX512 bit-matrix-multiply (BMM) instructions.
    /// Inherits from <see cref="Avx512F"/>; callers must check <see cref="IsSupported"/>
    /// before using any member.
    /// </summary>
    public abstract class Avx512Bmm : Avx512F
    {
        /// <summary>Gets whether the AVX512 BMM instruction set is supported on the current hardware.</summary>
        public static new bool IsSupported { get; }

        // Bit-reversal within the vector; lowered to the vbitrev instruction
        // (see the disassembly samples later in this PR). The per-element
        // granularity is not shown here — presumably per byte; confirm against
        // the instruction reference.
        public static Vector128<byte> ReverseBits(Vector128<byte> values);
        public static Vector256<byte> ReverseBits(Vector256<byte> values);
        public static Vector512<byte> ReverseBits(Vector512<byte> values);

        // 16x16 bit-matrix multiply-accumulate with OR reduction
        // (vbmacor16x16x16) and XOR reduction (vbmacxor16x16x16).
        // NOTE(review): per the reviewer feedback below, the hardware operand
        // order is (addend, left, right), so the managed parameter order here
        // may change to match — confirm before stabilizing the API.
        public static Vector256<ushort> BitMultiplyMatrix16x16WithOrReduction(Vector256<ushort> left, Vector256<ushort> right, Vector256<ushort> addend);
        public static Vector512<ushort> BitMultiplyMatrix16x16WithOrReduction(Vector512<ushort> left, Vector512<ushort> right, Vector512<ushort> addend);
        public static Vector256<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector256<ushort> left, Vector256<ushort> right, Vector256<ushort> addend);
        public static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector512<ushort> left, Vector512<ushort> right, Vector512<ushort> addend);
    }
}

Disasm Samples

BitMultiplyMatrix16x16WithOrReduction
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbmacor16x16x16 on ymm registers.
private static Vector256<ushort> BitMultiplyMatrix16x16WithOrReduction_Vector256(Vector256<ushort> x, Vector256<ushort> y, Vector256<ushort> z)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.BitMultiplyMatrix16x16WithOrReduction(x, y, z);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithOrReduction_Vector256(System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort]):System.Runtime.Intrinsics.Vector256`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymm1, ymmword ptr [r8]
       vbmacor16x16x16 ymm0, ymm1, ymmword ptr [r9]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0016
       vzeroupper
       ret

; Total bytes of code 26
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbmacor16x16x16 on zmm registers.
private static Vector512<ushort> BitMultiplyMatrix16x16WithOrReduction_Vector512(Vector512<ushort> x, Vector512<ushort> y, Vector512<ushort> z)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.BitMultiplyMatrix16x16WithOrReduction(x, y, z);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithOrReduction_Vector512(System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]):System.Runtime.Intrinsics.Vector512`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [r8]
       vbmacor16x16x16 zmm0, zmm1, zmmword ptr [r9]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001B
       vzeroupper
       ret

; Total bytes of code 31
BitMultiplyMatrix16x16WithXorReduction
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbmacxor16x16x16 on ymm registers.
private static Vector256<ushort> BitMultiplyMatrix16x16WithXorReduction_Vector256(Vector256<ushort> x, Vector256<ushort> y, Vector256<ushort> z)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.BitMultiplyMatrix16x16WithXorReduction(x, y, z);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithXorReduction_Vector256(System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort]):System.Runtime.Intrinsics.Vector256`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymm1, ymmword ptr [r8]
       vbmacxor16x16x16 ymm0, ymm1, ymmword ptr [r9]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0016
       vzeroupper
       ret

; Total bytes of code 26
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbmacxor16x16x16 on zmm registers.
private static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction_Vector512(Vector512<ushort> x, Vector512<ushort> y, Vector512<ushort> z)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.BitMultiplyMatrix16x16WithXorReduction(x, y, z);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithXorReduction_Vector512(System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]):System.Runtime.Intrinsics.Vector512`1[ushort] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [r8]
       vbmacxor16x16x16 zmm0, zmm1, zmmword ptr [r9]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001B
       vzeroupper
       ret

; Total bytes of code 31
ReverseBits
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbitrev on xmm registers.
private static Vector128<byte> ReverseBits_Vector128(Vector128<byte> values)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.ReverseBits(values);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector128(System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vbitrev  xmm0, xmmword ptr [rdx]
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x000D
       ret

; Total bytes of code 14
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbitrev on ymm registers.
private static Vector256<byte> ReverseBits_Vector256(Vector256<byte> values)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.ReverseBits(values);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector256(System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vbitrev  ymm0, ymmword ptr [rdx]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x000D
       vzeroupper
       ret

; Total bytes of code 17
// Disassembly sample: thin wrapper so the JIT output can be inspected.
// The listing below shows this lowers to a single vbitrev on zmm registers.
private static Vector512<byte> ReverseBits_Vector512(Vector512<byte> values)
{
    // Forwards directly to the intrinsic; no extra work should appear in codegen.
    return Avx512Bmm.ReverseBits(values);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector512(System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vbitrev  zmm0, zmmword ptr [rdx]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x000F
       vzeroupper
       ret

; Total bytes of code 19
ReverseBits (Merge Masking)
// Disassembly sample: merge-masking pattern. The BlendVariable + ReverseBits
// combination is fused by the JIT into vbitrev with a {k1} merge mask
// (vpmovb2m converts the vector mask to a k register — see listing below).
private static Vector128<byte> ReverseBits_Mask_Vector128(Vector128<byte> values, Vector128<byte> mask)
{
    return Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector128(System.Runtime.Intrinsics.Vector128`1[byte],System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  xmm0, xmmword ptr [rdx]
       vmovups  xmm1, xmmword ptr [r8]
       vpmovb2m k1, xmm1
       vbitrev  xmm0 {k1}, xmm0
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001C
       ret

; Total bytes of code 29
// Disassembly sample: merge-masking pattern. The BlendVariable + ReverseBits
// combination is fused by the JIT into vbitrev with a {k1} merge mask
// (vpmovb2m converts the vector mask to a k register — see listing below).
private static Vector256<byte> ReverseBits_Mask_Vector256(Vector256<byte> values, Vector256<byte> mask)
{
    return Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector256(System.Runtime.Intrinsics.Vector256`1[byte],System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [rdx]
       vmovups  ymm1, ymmword ptr [r8]
       vpmovb2m k1, ymm1
       vbitrev  ymm0 {k1}, ymm0
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001C
       vzeroupper
       ret

; Total bytes of code 32
// Disassembly sample: merge-masking pattern. The BlendVariable + ReverseBits
// combination is fused by the JIT into vbitrev with a {k1} merge mask
// (vpmovb2m converts the vector mask to a k register — see listing below).
private static Vector512<byte> ReverseBits_Mask_Vector512(Vector512<byte> values, Vector512<byte> mask)
{
    return Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector512(System.Runtime.Intrinsics.Vector512`1[byte],System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [rdx]
       vmovups  zmm1, zmmword ptr [r8]
       vpmovb2m k1, zmm1
       vbitrev  zmm0 {k1}, zmm0
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0021
       vzeroupper
       ret

; Total bytes of code 37
ReverseBits (Zero Masking)
// Disassembly sample: zero-masking pattern. Blending against Vector128<byte>.Zero
// is recognized by the JIT and fused into vbitrev with a {k1}{z} zeroing mask
// (see listing below).
private static Vector128<byte> ReverseBits_Maskz_Vector128(Vector128<byte> values, Vector128<byte> mask)
{
    return Avx512BW.BlendVariable(Vector128<byte>.Zero, Avx512Bmm.ReverseBits(values), mask);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector128(System.Runtime.Intrinsics.Vector128`1[byte],System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  xmm0, xmmword ptr [r8]
       vpmovb2m k1, xmm0
       vbitrev  xmm0 {k1}{z}, xmmword ptr [rdx]
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0018
       ret

; Total bytes of code 25
// Disassembly sample: zero-masking pattern. Blending against Vector256<byte>.Zero
// is recognized by the JIT and fused into vbitrev with a {k1}{z} zeroing mask
// (see listing below).
private static Vector256<byte> ReverseBits_Maskz_Vector256(Vector256<byte> values, Vector256<byte> mask)
{
    return Avx512BW.BlendVariable(Vector256<byte>.Zero, Avx512Bmm.ReverseBits(values), mask);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector256(System.Runtime.Intrinsics.Vector256`1[byte],System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  ymm0, ymmword ptr [r8]
       vpmovb2m k1, ymm0
       vbitrev  ymm0 {k1}{z}, ymmword ptr [rdx]
       vmovups  ymmword ptr [rcx], ymm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x0018
       vzeroupper
       ret

; Total bytes of code 28
// Disassembly sample: zero-masking pattern. Blending against Vector512<byte>.Zero
// is recognized by the JIT and fused into vbitrev with a {k1}{z} zeroing mask
// (see listing below).
private static Vector512<byte> ReverseBits_Maskz_Vector512(Vector512<byte> values, Vector512<byte> mask)
{
    return Avx512BW.BlendVariable(Vector512<byte>.Zero, Avx512Bmm.ReverseBits(values), mask);
}
; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector512(System.Runtime.Intrinsics.Vector512`1[byte],System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data

G_M000_IG01:                ;; offset=0x0000

G_M000_IG02:                ;; offset=0x0000
       vmovups  zmm0, zmmword ptr [r8]
       vpmovb2m k1, zmm0
       vbitrev  zmm0 {k1}{z}, zmmword ptr [rdx]
       vmovups  zmmword ptr [rcx], zmm0
       mov      rax, rcx

G_M000_IG03:                ;; offset=0x001B
       vzeroupper
       ret

; Total bytes of code 31

@github-actions github-actions bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 24, 2026
@dotnet-policy-service dotnet-policy-service bot added the community-contribution Indicates that the PR has been added by a community member label Feb 24, 2026
@dotnet-policy-service
Copy link
Contributor

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

@alexcovington alexcovington marked this pull request as draft February 24, 2026 18:12
@@ -8,36 +8,35 @@ namespace Microsoft.Win32.SafeHandles
{
public abstract partial class CriticalHandleMinusOneIsInvalid : System.Runtime.InteropServices.CriticalHandle
{
protected CriticalHandleMinusOneIsInvalid() : base (default(System.IntPtr)) { }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There shouldn't be any changes to System.Runtime.cs with this PR

Comment on lines 100 to 101
InstructionSet_AVX512BMM=46,
InstructionSet_AVX512BMM_X64=47,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this may have been added manually.

You rather want to modify src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt and then run src/coreclr/tools/Common/JitInterface/ThunkGenerator/gen.bat

Comment on lines 914 to 915
case NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction:
case NI_AVX512BMM_BitMultiplyMatrix16x16WithXorReduction:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These could be put in the same grouping as AvxVnni just above, right?

Looks like they're identical and should also have the same assertions.

INST3(vp2intersectd, "vp2intersectd", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Compute Intersection Between DWORDS to a Pair of Mask Registers
INST3(vp2intersectq, "vp2intersectq", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Compute Intersection Between QWORDS to a Pair of Mask Registers

#define FIRST_AVX512BMM_INSTRUCTION INS_vbmacor16x16x16
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably put these below the AvxVnni group. Just so its not splitting the Avx10v2 and Avx10v2.x64 group

/// <para>__m512i _mm512_bmacxor16x16x16 (__m512i left, __m512i right, __m512i addend)</para>
/// <para> VBMACXOR16x16x16 zmm1, zmm2, zmm3/m512</para>
/// </summary>
public static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector512<ushort> left, Vector512<ushort> right, Vector512<ushort> addend) => BitMultiplyMatrix16x16WithXorReduction(left, right, addend);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After double checking the instruction, it looks like the parameter order is VBMACXOR16x16x16 ymm1, ymm2, ymm3/m256 where ymm1: addend, ymm2: left, ymm3: right

and so the order here should be: addend, left, right instead

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-- we want to ensure parameter order matches instruction order, so we don't have to move things around

Comment on lines 1096 to 1097
HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithOrReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacor16x16x16, INS_vbmacor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithXorReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacxor16x16x16, INS_vbmacxor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
Copy link
Member

@tannergooding tannergooding Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two are "RMW" (read-modify write) and so should be marked as IsRmwIntrinsic and have some specialized handling like Fma and AvxVnni intrinsics have (in lsra and lower). This will ensure better codegen since the first operand is both a source and destination as far as the register allocator is concerned

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd also expect some minimal handling in lower to handle the fact that left and right are commutative and so either can be the "from memory" operand.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI community-contribution Indicates that the PR has been added by a community member

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants