@@ -23,7 +23,6 @@ struct dec_fp {
2323#include < limits> // std::numeric_limits
2424#include < type_traits> // std::conditional_t
2525
26-
// Default ZMIJ_USE_SIMD to 1 unless the build has already defined it.
#if !defined(ZMIJ_USE_SIMD)
#  define ZMIJ_USE_SIMD 1
#endif
@@ -40,7 +39,8 @@ struct dec_fp {
4039// Use the provided definition
4140#elif defined(__SSE2__)
4241# define ZMIJ_USE_SSE 1
43- #elif defined(_MSC_VER) && (defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86FP == 2))
42+ #elif defined(_MSC_VER) && \
43+ (defined (_M_AMD64) || (defined (_M_IX86_FP) && _M_IX86FP == 2 ))
4444# define ZMIJ_USE_SSE 1
4545#else
4646# define ZMIJ_USE_SSE 0
@@ -50,7 +50,8 @@ struct dec_fp {
5050// Use the provided definition
5151#elif defined(__SSE4_1__)
5252# define ZMIJ_USE_SSE4_1 1
53- #elif defined(_MSC_VER) && defined(__AVX__) // There's no way to check for /arch:SSE4.2 specifically
53+ #elif defined(_MSC_VER) && \
54+ defined (__AVX__) // There's no way to check for /arch:SSE4.2 specifically
5455# define ZMIJ_USE_SSE4_1 1
5556#else
5657# define ZMIJ_USE_SSE4_1 0
@@ -61,7 +62,8 @@ struct dec_fp {
6162#endif
6263
// The SSE4.1 code is selected from inside the SSE code path, so requesting
// SSE4.1 while SSE is off is a configuration error: fail the build early.
#if ZMIJ_USE_SSE4_1
#  if !ZMIJ_USE_SSE
#    error "User asked for SSE4.1 but SSE is not available or explicitly not requested."
#  endif
#endif
6668
6769#if ZMIJ_USE_SSE
@@ -451,19 +453,26 @@ inline auto digits2(size_t value) noexcept -> const char* {
451453 return &data[value * 2 ];
452454}
453455
// Fixed-point reciprocals for the multiply + shift division trick:
//   x / 10000 == (x * div10k_sig) >> div10k_exp
//   x / 100   == (x * div100_sig) >> div100_exp
// Each significand is floor(2**exp / divisor) + 1. The identities only hold
// for bounded x; NOTE(review): callers appear to pass at most 8 decimal
// digits to the /10000 form and at most 4 to the /100 form - confirm before
// reusing these with wider operands.
constexpr int div10k_exp = 40;
constexpr uint32_t div10k_sig = uint32_t((1ull << div10k_exp) / 10000 + 1);
static_assert(div10k_sig == 0x68db8bb,
              "div10k_sig must equal the literal it replaced");
constexpr int div100_exp = 19;
constexpr uint32_t div100_sig = (1 << div100_exp) / 100 + 1;
static_assert(div100_sig == 0x147b,
              "div100_sig must equal the literal it replaced");

// The character '0' replicated into each of the eight bytes of a 64-bit word.
// The literal must be exactly '0': a space inside the quotes would make it a
// multi-character literal with an implementation-defined value.
constexpr uint64_t zeros = 0x0101010101010101u * '0';
462+
454463auto to_bcd8 (uint64_t abcdefgh) noexcept -> uint64_t {
455464 // An optimization from Xiang JunBo.
456- // Three steps BCD. Base 10000 -> base 100 -> base 10.
465+ // Three steps BCD. Base 10000 -> base 100 -> base 10.
457466 // div and mod are evaluated simultaneously as, e.g.
  //   ((abcdefgh / 10000) << 32) + (abcdefgh % 10000)
  //   == abcdefgh + (2**32 - 10000) * (abcdefgh / 10000)
460469 // where the division on the RHS is implemented by the usual multiply + shift
461470 // trick and the fractional bits are masked away.
462- uint64_t abcd_efgh =
463- abcdefgh + ( 0x100000000 - 10000 ) * ((abcdefgh * 0x68db8bb ) >> 40 );
471+ uint64_t abcd_efgh = abcdefgh + ( 0x100000000 - 10000 ) *
472+ ((abcdefgh * div10k_sig ) >> div10k_exp );
464473 uint64_t ab_cd_ef_gh =
465- abcd_efgh +
466- ( 0x10000 - 100 ) * (((abcd_efgh * 0x147b ) >> 19 ) & 0x7f0000007f );
474+ abcd_efgh + ( 0x10000 - 100 ) *
475+ (((abcd_efgh * div100_sig ) >> div100_exp ) & 0x7f0000007f );
467476 uint64_t a_b_c_d_e_f_g_h =
468477 ab_cd_ef_gh +
469478 (0x100 - 10 ) * (((ab_cd_ef_gh * 0x67 ) >> 10 ) & 0xf000f000f000f );
@@ -479,8 +488,6 @@ inline void write8(char* buffer, uint64_t value) noexcept {
479488 memcpy (buffer, &value, 8 );
480489}
481490
482- constexpr uint64_t zeros = 0x0101010101010101u * ' 0' ;
483-
484491// Writes a significand consisting of up to 17 decimal digits (16-17 for
485492// normals) and removes trailing zeros.
486493auto write_significand17 (char * buffer, uint64_t value) noexcept -> char* {
@@ -489,7 +496,7 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
489496 struct to_string_constants {
490497 uint64_t mul_const = 0xabcc77118461cefd ;
491498 uint64_t hundred_million = 100000000 ;
492- int32_t multipliers32[4 ] = {0x68db8bb , -10000 + 0x10000 , 0x147b000 ,
499+ int32_t multipliers32[4 ] = {div10k_sig , -10000 + 0x10000 , div100_sig << 12 ,
493500 -100 + 0x10000 };
494501 int16_t multipliers16[8 ] = {0xce0 , -10 + 0x100 };
495502 };
@@ -551,7 +558,7 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
551558
552559 buffer += 16 - ((zeroes != 0 ? clz (zeroes) : 64 ) >> 2 );
553560 return buffer - int (buffer - start == 1 );
554- # elif ZMIJ_USE_SSE
561+ #elif ZMIJ_USE_SSE
555562 uint32_t abbccddee = uint32_t (value / 100'000'000 );
556563 uint32_t ffgghhii = uint32_t (value % 100'000'000 );
557564 uint32_t a = abbccddee / 100'000'000 ;
@@ -560,37 +567,49 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
560567 buffer = write_if_nonzero (buffer, a);
561568
562569 alignas (64 ) static const struct {
563- __m128i div10000 = _mm_set1_epi64x(( 1ull << 40 ) / 10000 + 1 );
564- __m128i divmod10000 = _mm_set1_epi64x((1ull << 32 ) - 10000 );
565- __m128i div100 = _mm_set1_epi32(( 1 << 19 ) / 100 + 1 );
570+ __m128i div10k = _mm_set1_epi64x(div10k_sig );
571+ __m128i divmod10k = _mm_set1_epi64x((1ull << 32 ) - 10000 );
572+ __m128i div100 = _mm_set1_epi32(div100_sig );
566573 __m128i divmod100 = _mm_set1_epi32((1 << 16 ) - 100 );
567574 __m128i div10 = _mm_set1_epi16((1 << 16 ) / 10 + 1 );
568575# if ZMIJ_USE_SSE4_1
569576 __m128i divmod10 = _mm_set1_epi16((1 << 8 ) - 10 );
570- __m128i bswap = _mm_set_epi8(0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 );
571- # else // !ZMIJ_USE_SSE4_1
577+ __m128i bswap =
578+ _mm_set_epi8 (0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 );
579+ # else // !ZMIJ_USE_SSE4_1
572580 __m128i one_hundred = _mm_set1_epi32(100 );
573581 __m128i moddiv10 = _mm_set1_epi16(10 * (1 << 8 ) - 1 );
574- # endif // ZMIJ_USE_SSE4_1
582+ # endif // ZMIJ_USE_SSE4_1
575583 __m128i ascii0 = _mm_set1_epi64x(zeros);
576584 } c;
577585
578586// The BCD sequences are based on ones provided by Xiang JunBo.
579587# if ZMIJ_USE_SSE4_1
580588 __m128i x = _mm_set_epi64x (bbccddee, ffgghhii);
581- __m128i y = _mm_add_epi64 (x, _mm_mul_epu32 (c.divmod10000 , _mm_srli_epi64 (_mm_mul_epu32 (x, c.div10000 ), 40 )));
582- __m128i z = _mm_add_epi64 (y, _mm_mullo_epi32 (c.divmod100 , _mm_srli_epi32 (_mm_mulhi_epu16 (y, c.div100 ), 3 ))); // _mm_mullo_epi32 is SSE 4.1
583- __m128i big_endian_bcd = _mm_add_epi16 (z, _mm_mullo_epi16 (c.divmod10 , _mm_mulhi_epu16 (z, c.div10 )));
584- __m128i bcd = _mm_shuffle_epi8 (big_endian_bcd, c.bswap ); // SSSE3
585- # else // !ZMIJ_USE_SSE4_1
589+ __m128i y = _mm_add_epi64 (
590+ x, _mm_mul_epu32 (c.divmod10k ,
591+ _mm_srli_epi64 (_mm_mul_epu32 (x, c.div10k ), div10k_exp)));
592+ __m128i z = _mm_add_epi64 (
593+ y, _mm_mullo_epi32 (c.divmod100 ,
594+ _mm_srli_epi32 (_mm_mulhi_epu16 (y, c.div100 ),
595+ 3 ))); // _mm_mullo_epi32 is SSE 4.1
596+ __m128i big_endian_bcd = _mm_add_epi16 (
597+ z, _mm_mullo_epi16 (c.divmod10 , _mm_mulhi_epu16 (z, c.div10 )));
598+ __m128i bcd = _mm_shuffle_epi8 (big_endian_bcd, c.bswap ); // SSSE3
599+ # else // !ZMIJ_USE_SSE4_1
586600 __m128i x = _mm_set_epi64x (bbccddee, ffgghhii);
587- __m128i y = _mm_add_epi64 (x, _mm_mul_epu32 (c.divmod10000 , _mm_srli_epi64 (_mm_mul_epu32 (x, c.div10000 ), 40 )));
601+ __m128i y = _mm_add_epi64 (
602+ x, _mm_mul_epu32 (c.divmod10k ,
603+ _mm_srli_epi64 (_mm_mul_epu32 (x, c.div10k ), div10k_exp)));
588604 __m128i y_div_100 = _mm_srli_epi16 (_mm_mulhi_epu16 (y, c.div100 ), 3 );
589- __m128i y_mod_100 = _mm_sub_epi16 (y, _mm_mullo_epi16 (y_div_100, c.one_hundred ));
605+ __m128i y_mod_100 =
606+ _mm_sub_epi16 (y, _mm_mullo_epi16 (y_div_100, c.one_hundred ));
590607 __m128i z = _mm_or_si128 (_mm_slli_epi32 (y_mod_100, 16 ), y_div_100);
591- __m128i bcd_shuffled = _mm_sub_epi16 (_mm_slli_epi16 (z, 8 ), _mm_mullo_epi16 (c.moddiv10 , _mm_mulhi_epu16 (z, c.div10 )));
608+ __m128i bcd_shuffled =
609+ _mm_sub_epi16 (_mm_slli_epi16 (z, 8 ),
610+ _mm_mullo_epi16 (c.moddiv10 , _mm_mulhi_epu16 (z, c.div10 )));
592611 __m128i bcd = _mm_shuffle_epi32 (bcd_shuffled, _MM_SHUFFLE (0 , 1 , 2 , 3 ));
593- # endif // ZMIJ_USE_SSE4_1
612+ # endif // ZMIJ_USE_SSE4_1
594613
595614 auto digits = _mm_or_si128 (bcd, c.ascii0 );
596615
@@ -605,7 +624,7 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
605624
606625 _mm_storeu_si128 ((__m128i*)buffer, digits);
607626 return buffer + len - int (len + (a != 0 ) == 1 );
608- #else // !ZMIJ_USE_NEON && !ZMIJ_USE_SSE
627+ #else // !ZMIJ_USE_NEON && !ZMIJ_USE_SSE
609628 char * start = buffer;
610629 // Each digit is denoted by a letter so value is abbccddeeffgghhii.
611630 uint32_t abbccddee = uint32_t (value / 100'000'000 );
@@ -862,8 +881,8 @@ auto write(Float value, char* buffer) noexcept -> char* {
862881 return buffer + 2 ;
863882 }
864883 // 19 is faster or equal to 12 even for 3 digits.
865- constexpr int div_exp = 19 , div_sig = ( 1 << div_exp) / 100 + 1 ;
866- uint32_t digit = (uint32_t (dec_exp) * div_sig ) >> div_exp ; // value / 100
884+ uint32_t digit =
885+ (uint32_t (dec_exp) * div100_sig ) >> div100_exp ; // value / 100
867886 uint32_t digit_with_nuls = ' 0' + digit;
868887 if (is_big_endian ()) digit_with_nuls <<= 24 ;
869888 memcpy (buffer, &digit_with_nuls, 4 );
0 commit comments