[msan] Micro-optimize NEON matrix-multiply instrumentation (#188815)

thurstond · web-flow · commit 423f410e7e1b · 2026-03-26T12:13:52.000-07:00
Replace Or(SExt(),SExt()) with the equivalent SExt(Or()).
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5673,13 +5673,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         ExpectedRTy->getElementCount(),
         ConstantInt::get(ExpectedRTy->getElementType(), 0x8));
 
-    ShadowAB = IRB.CreateSExt(IRB.CreateICmpNE(ShadowAB, FullyInit),
-                              ShadowAB->getType());
+    ShadowAB = IRB.CreateICmpNE(ShadowAB, FullyInit);
 
-    ShadowR = IRB.CreateSExt(
-        IRB.CreateICmpNE(ShadowR, getCleanShadow(ExpectedRTy)), ExpectedRTy);
+    ShadowR = IRB.CreateICmpNE(ShadowR, getCleanShadow(ExpectedRTy));
+    ShadowR = IRB.CreateOr(ShadowAB, ShadowR);
 
-    setShadow(&I, IRB.CreateOr(ShadowAB, ShadowR));
+    setShadow(&I, IRB.CreateSExt(ShadowR, ExpectedRTy));
     setOriginForNaryOp(I);
   }
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
@@ -78,10 +78,9 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <16 x i1> [[TMP7]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP6]], <16 x i8> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], splat (i32 8)
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i1> [[TMP13]] to <4 x i32>
 ; CHECK-NEXT:    [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[VBFMMLAQ_V3_I]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll
@@ -24,10 +24,9 @@ define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) sa
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32>
 ; CHECK-NEXT:    [[VMMLA1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[VMMLA1_I]]
@@ -51,10 +50,9 @@ define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) sa
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32>
 ; CHECK-NEXT:    [[VMMLA1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[VMMLA1_I]]
@@ -78,10 +76,9 @@ define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) s
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32>
 ; CHECK-NEXT:    [[VUSMMLA1_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[VUSMMLA1_I]]