@@ -498,12 +498,12 @@ inline void write8(char* buffer, uint64_t value) noexcept {
498498auto write_significand17 (char * buffer, uint64_t value) noexcept -> char* {
499499#if ZMIJ_USE_NEON
500500 // An optimized version for NEON by Dougall Johnson.
501+ constexpr int32_t neg10k = -10000 + 0x10000 ;
501502 struct to_string_constants {
502503 uint64_t mul_const = 0xabcc77118461cefd ;
503504 uint64_t hundred_million = 100000000 ;
504- int32_t multipliers32[4 ] = {div10k_sig, -10000 + 0x10000 , div100_sig << 12 ,
505- -100 + 0x10000 };
506- int16_t multipliers16[8 ] = {0xce0 , -10 + 0x100 };
505+ int32_t multipliers32[4 ] = {div10k_sig, neg10k, div100_sig << 12 , neg100};
506+ int16_t multipliers16[8 ] = {0xce0 , neg10};
507507 };
508508
509509 static const to_string_constants constants;
@@ -530,31 +530,33 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
530530 char * start = buffer;
531531 buffer = write_if_nonzero (buffer, a);
532532
533- uint64x1_t ffgghhii_bbccddee64 = {(uint64_t (ffgghhii) << 32 ) | bbccddee};
534- int32x2_t ffgghhii_bbccddee = vreinterpret_s32_u64 (ffgghhii_bbccddee64 );
533+ uint64x1_t ffgghhii_bbccddee_64 = {(uint64_t (ffgghhii) << 32 ) | bbccddee};
534+ int32x2_t bbccddee_ffgghhii = vreinterpret_s32_u64 (ffgghhii_bbccddee_64 );
535535
536- int32x2_t quo10k = vreinterpret_s32_u32 (
536+ int32x2_t bbcc_ffgg = vreinterpret_s32_u32 (
537537 vshr_n_u32 (vreinterpret_u32_s32 (
538- vqdmulh_n_s32 (ffgghhii_bbccddee , c->multipliers32 [0 ])),
538+ vqdmulh_n_s32 (bbccddee_ffgghhii , c->multipliers32 [0 ])),
539539 9 ));
540- int32x2_t rem10k = vmla_n_s32 (ffgghhii_bbccddee, quo10k, c->multipliers32 [1 ]);
540+ int32x2_t ddee_bbcc_hhii_ffgg_32 =
541+ vmla_n_s32 (bbccddee_ffgghhii, bbcc_ffgg, c->multipliers32 [1 ]);
541542
542- int32x4_t extended =
543- vreinterpretq_s32_u32 ( vshll_n_u16 (vreinterpret_u16_s32 (rem10k ), 0 ));
543+ int32x4_t ddee_bbcc_hhii_ffgg = vreinterpretq_s32_u32 (
544+ vshll_n_u16 (vreinterpret_u16_s32 (ddee_bbcc_hhii_ffgg_32 ), 0 ));
544545
545546 // Compiler barrier, or clang breaks the subsequent MLA into UADDW + MUL.
546- ZMIJ_ASM ((" " : " +w" (extended)));
547-
548- int32x4_t high_100 = vqdmulhq_n_s32 (extended, c->multipliers32 [2 ]);
549- int16x8_t hundreds = vreinterpretq_s16_s32 (
550- vmlaq_n_s32 (extended, high_100, c->multipliers32 [3 ]));
551- int16x8_t high_10 = vqdmulhq_n_s16 (hundreds, c->multipliers16 [0 ]);
547+ ZMIJ_ASM ((" " : " +w" (ddee_bbcc_hhii_ffgg)));
548+
549+ int32x4_t dd_bb_hh_ff =
550+ vqdmulhq_n_s32 (ddee_bbcc_hhii_ffgg, c->multipliers32 [2 ]);
551+ int16x8_t ee_dd_cc_bb_ii_hh_gg_ff = vreinterpretq_s16_s32 (
552+ vmlaq_n_s32 (ddee_bbcc_hhii_ffgg, dd_bb_hh_ff, c->multipliers32 [3 ]));
553+ int16x8_t high_10s =
554+ vqdmulhq_n_s16 (ee_dd_cc_bb_ii_hh_gg_ff, c->multipliers16 [0 ]);
552555 uint8x16_t digits = vrev64q_u8 (vreinterpretq_u8_s16 (
553- vmlaq_n_s16 (hundreds, high_10, c->multipliers16 [1 ])));
554- uint16x8_t ascii = vaddq_u16 (vreinterpretq_u16_u8 (digits),
555- vreinterpretq_u16_s8 (vdupq_n_s8 (' 0' )));
556-
557- memcpy (buffer, &ascii, 16 );
556+ vmlaq_n_s16 (ee_dd_cc_bb_ii_hh_gg_ff, high_10s, c->multipliers16 [1 ])));
557+ uint16x8_t str = vaddq_u16 (vreinterpretq_u16_u8 (digits),
558+ vreinterpretq_u16_s8 (vdupq_n_s8 (' 0' )));
559+ memcpy (buffer, &str, sizeof (str));
558560
559561 uint16x8_t is_zero = vreinterpretq_u16_u8 (vceqq_u8 (digits, vdupq_n_u8 (0 )));
560562 uint64_t zeroes =
0 commit comments