Skip to content

Commit 423f410

Browse files
authored
[msan] Micro-optimize NEON matrix-multiply instrumentation (#188815)
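Replace Or(SExt(A), SExt(B)) with the equivalent SExt(Or(A, B)), so the shadow is sign-extended once, after the i1 comparison results are OR'ed, instead of twice.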
1 parent 5b8c175 commit 423f410

File tree

3 files changed

+12
-17
lines changed

3 files changed

+12
-17
lines changed

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5673,13 +5673,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
56735673
ExpectedRTy->getElementCount(),
56745674
ConstantInt::get(ExpectedRTy->getElementType(), 0x8));
56755675

5676-
ShadowAB = IRB.CreateSExt(IRB.CreateICmpNE(ShadowAB, FullyInit),
5677-
ShadowAB->getType());
5676+
ShadowAB = IRB.CreateICmpNE(ShadowAB, FullyInit);
56785677

5679-
ShadowR = IRB.CreateSExt(
5680-
IRB.CreateICmpNE(ShadowR, getCleanShadow(ExpectedRTy)), ExpectedRTy);
5678+
ShadowR = IRB.CreateICmpNE(ShadowR, getCleanShadow(ExpectedRTy));
5679+
ShadowR = IRB.CreateOr(ShadowAB, ShadowR);
56815680

5682-
setShadow(&I, IRB.CreateOr(ShadowAB, ShadowR));
5681+
setShadow(&I, IRB.CreateSExt(ShadowR, ExpectedRTy));
56835682
setOriginForNaryOp(I);
56845683
}
56855684

llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -78,10 +78,9 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
7878
; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i1> [[TMP7]] to <16 x i8>
7979
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP6]], <16 x i8> [[TMP8]])
8080
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], splat (i32 8)
81-
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
8281
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
83-
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i32>
84-
; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
82+
; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]]
83+
; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i1> [[TMP13]] to <4 x i32>
8584
; CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
8685
; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr @__msan_retval_tls, align 8
8786
; CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]

llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -24,10 +24,9 @@ define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) sa
2424
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
2525
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
2626
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
27-
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
2827
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
29-
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
30-
; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
28+
; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP8]], [[TMP10]]
29+
; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32>
3130
; CHECK-NEXT: [[VMMLA1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
3231
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
3332
; CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]]
@@ -51,10 +50,9 @@ define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) sa
5150
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
5251
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
5352
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
54-
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
5553
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
56-
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
57-
; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
54+
; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP8]], [[TMP10]]
55+
; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32>
5856
; CHECK-NEXT: [[VMMLA1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
5957
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
6058
; CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]]
@@ -78,10 +76,9 @@ define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) s
7876
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
7977
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
8078
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
81-
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
8279
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
83-
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
84-
; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
80+
; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP8]], [[TMP10]]
81+
; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32>
8582
; CHECK-NEXT: [[VUSMMLA1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
8683
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
8784
; CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]]

0 commit comments

Comments
 (0)