Skip to content

Commit ff8d4de

Browse files
committed
Reuse constants and format
1 parent edb9de6 commit ff8d4de

File tree

1 file changed

+51
-32
lines changed

1 file changed

+51
-32
lines changed

zmij.cc

Lines changed: 51 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ struct dec_fp {
2323
#include <limits> // std::numeric_limits
2424
#include <type_traits> // std::conditional_t
2525

26-
2726
#ifndef ZMIJ_USE_SIMD
2827
# define ZMIJ_USE_SIMD 1
2928
#endif
@@ -40,7 +39,8 @@ struct dec_fp {
4039
// Use the provided definition
4140
#elif defined(__SSE2__)
4241
# define ZMIJ_USE_SSE 1
43-
#elif defined(_MSC_VER) && (defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86FP == 2))
42+
#elif defined(_MSC_VER) && \
43+
(defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86FP == 2))
4444
# define ZMIJ_USE_SSE 1
4545
#else
4646
# define ZMIJ_USE_SSE 0
@@ -50,7 +50,8 @@ struct dec_fp {
5050
// Use the provided definition
5151
#elif defined(__SSE4_1__)
5252
# define ZMIJ_USE_SSE4_1 1
53-
#elif defined(_MSC_VER) && defined(__AVX__) // There's no way to check for /arch:SSE4.2 specifically
53+
#elif defined(_MSC_VER) && \
54+
defined(__AVX__) // There's no way to check for /arch:SSE4.2 specifically
5455
# define ZMIJ_USE_SSE4_1 1
5556
#else
5657
# define ZMIJ_USE_SSE4_1 0
@@ -61,7 +62,8 @@ struct dec_fp {
6162
#endif
6263

6364
#if ZMIJ_USE_SSE4_1 && !ZMIJ_USE_SSE
64-
# error "User asked for SSE4.1 but SSE is not available or explicitly not requested."
65+
# error \
66+
"User asked for SSE4.1 but SSE is not available or explicitly not requested."
6567
#endif
6668

6769
#if ZMIJ_USE_SSE
@@ -451,19 +453,26 @@ inline auto digits2(size_t value) noexcept -> const char* {
451453
return &data[value * 2];
452454
}
453455

456+
constexpr int div10k_exp = 40;
457+
constexpr uint32_t div10k_sig = uint32_t((1ull << div10k_exp) / 10000 + 1);
458+
constexpr int div100_exp = 19;
459+
constexpr uint32_t div100_sig = (1 << div100_exp) / 100 + 1;
460+
461+
constexpr uint64_t zeros = 0x0101010101010101u * '0';
462+
454463
auto to_bcd8(uint64_t abcdefgh) noexcept -> uint64_t {
455464
// An optimization from Xiang JunBo.
456-
// Three steps BCD. Base 10000 -> base 100 -> base 10.
465+
// Three steps BCD. Base 10000 -> base 100 -> base 10.
457466
// div and mod are evaluated simultaneously as, e.g.
458467
// (abcdefgh / 10000) << 32 + (abcdefgh % 10000)
459-
// == abcdefgh + (2^32 - 10000) * (abcdefgh / 10000)))
468+
// == abcdefgh + (2**32 - 10000) * (abcdefgh / 10000)))
460469
// where the division on the RHS is implemented by the usual multiply + shift
461470
// trick and the fractional bits are masked away.
462-
uint64_t abcd_efgh =
463-
abcdefgh + (0x100000000 - 10000) * ((abcdefgh * 0x68db8bb) >> 40);
471+
uint64_t abcd_efgh = abcdefgh + (0x100000000 - 10000) *
472+
((abcdefgh * div10k_sig) >> div10k_exp);
464473
uint64_t ab_cd_ef_gh =
465-
abcd_efgh +
466-
(0x10000 - 100) * (((abcd_efgh * 0x147b) >> 19) & 0x7f0000007f);
474+
abcd_efgh + (0x10000 - 100) *
475+
(((abcd_efgh * div100_sig) >> div100_exp) & 0x7f0000007f);
467476
uint64_t a_b_c_d_e_f_g_h =
468477
ab_cd_ef_gh +
469478
(0x100 - 10) * (((ab_cd_ef_gh * 0x67) >> 10) & 0xf000f000f000f);
@@ -479,8 +488,6 @@ inline void write8(char* buffer, uint64_t value) noexcept {
479488
memcpy(buffer, &value, 8);
480489
}
481490

482-
constexpr uint64_t zeros = 0x0101010101010101u * '0';
483-
484491
// Writes a significand consisting of up to 17 decimal digits (16-17 for
485492
// normals) and removes trailing zeros.
486493
auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
@@ -489,7 +496,7 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
489496
struct to_string_constants {
490497
uint64_t mul_const = 0xabcc77118461cefd;
491498
uint64_t hundred_million = 100000000;
492-
int32_t multipliers32[4] = {0x68db8bb, -10000 + 0x10000, 0x147b000,
499+
int32_t multipliers32[4] = {div10k_sig, -10000 + 0x10000, div100_sig << 12,
493500
-100 + 0x10000};
494501
int16_t multipliers16[8] = {0xce0, -10 + 0x100};
495502
};
@@ -551,7 +558,7 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
551558

552559
buffer += 16 - ((zeroes != 0 ? clz(zeroes) : 64) >> 2);
553560
return buffer - int(buffer - start == 1);
554-
# elif ZMIJ_USE_SSE
561+
#elif ZMIJ_USE_SSE
555562
uint32_t abbccddee = uint32_t(value / 100'000'000);
556563
uint32_t ffgghhii = uint32_t(value % 100'000'000);
557564
uint32_t a = abbccddee / 100'000'000;
@@ -560,37 +567,49 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
560567
buffer = write_if_nonzero(buffer, a);
561568

562569
alignas(64) static const struct {
563-
__m128i div10000 = _mm_set1_epi64x((1ull << 40) / 10000 + 1);
564-
__m128i divmod10000 = _mm_set1_epi64x((1ull << 32) - 10000);
565-
__m128i div100 = _mm_set1_epi32((1 << 19) / 100 + 1);
570+
__m128i div10k = _mm_set1_epi64x(div10k_sig);
571+
__m128i divmod10k = _mm_set1_epi64x((1ull << 32) - 10000);
572+
__m128i div100 = _mm_set1_epi32(div100_sig);
566573
__m128i divmod100 = _mm_set1_epi32((1 << 16) - 100);
567574
__m128i div10 = _mm_set1_epi16((1 << 16) / 10 + 1);
568575
# if ZMIJ_USE_SSE4_1
569576
__m128i divmod10 = _mm_set1_epi16((1 << 8) - 10);
570-
__m128i bswap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
571-
# else // !ZMIJ_USE_SSE4_1
577+
__m128i bswap =
578+
_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
579+
# else // !ZMIJ_USE_SSE4_1
572580
__m128i one_hundred = _mm_set1_epi32(100);
573581
__m128i moddiv10 = _mm_set1_epi16(10 * (1 << 8) - 1);
574-
# endif // ZMIJ_USE_SSE4_1
582+
# endif // ZMIJ_USE_SSE4_1
575583
__m128i ascii0 = _mm_set1_epi64x(zeros);
576584
} c;
577585

578586
// The BCD sequences are based on ones provided by Xiang JunBo.
579587
# if ZMIJ_USE_SSE4_1
580588
__m128i x = _mm_set_epi64x(bbccddee, ffgghhii);
581-
__m128i y = _mm_add_epi64(x, _mm_mul_epu32(c.divmod10000, _mm_srli_epi64(_mm_mul_epu32(x, c.div10000), 40)));
582-
__m128i z = _mm_add_epi64(y, _mm_mullo_epi32(c.divmod100, _mm_srli_epi32(_mm_mulhi_epu16(y, c.div100), 3))); // _mm_mullo_epi32 is SSE 4.1
583-
__m128i big_endian_bcd = _mm_add_epi16(z, _mm_mullo_epi16(c.divmod10, _mm_mulhi_epu16(z, c.div10)));
584-
__m128i bcd = _mm_shuffle_epi8(big_endian_bcd, c.bswap); // SSSE3
585-
# else // !ZMIJ_USE_SSE4_1
589+
__m128i y = _mm_add_epi64(
590+
x, _mm_mul_epu32(c.divmod10k,
591+
_mm_srli_epi64(_mm_mul_epu32(x, c.div10k), div10k_exp)));
592+
__m128i z = _mm_add_epi64(
593+
y, _mm_mullo_epi32(c.divmod100,
594+
_mm_srli_epi32(_mm_mulhi_epu16(y, c.div100),
595+
3))); // _mm_mullo_epi32 is SSE 4.1
596+
__m128i big_endian_bcd = _mm_add_epi16(
597+
z, _mm_mullo_epi16(c.divmod10, _mm_mulhi_epu16(z, c.div10)));
598+
__m128i bcd = _mm_shuffle_epi8(big_endian_bcd, c.bswap); // SSSE3
599+
# else // !ZMIJ_USE_SSE4_1
586600
__m128i x = _mm_set_epi64x(bbccddee, ffgghhii);
587-
__m128i y = _mm_add_epi64(x, _mm_mul_epu32(c.divmod10000, _mm_srli_epi64(_mm_mul_epu32(x, c.div10000), 40)));
601+
__m128i y = _mm_add_epi64(
602+
x, _mm_mul_epu32(c.divmod10k,
603+
_mm_srli_epi64(_mm_mul_epu32(x, c.div10k), div10k_exp)));
588604
__m128i y_div_100 = _mm_srli_epi16(_mm_mulhi_epu16(y, c.div100), 3);
589-
__m128i y_mod_100 = _mm_sub_epi16(y, _mm_mullo_epi16(y_div_100, c.one_hundred));
605+
__m128i y_mod_100 =
606+
_mm_sub_epi16(y, _mm_mullo_epi16(y_div_100, c.one_hundred));
590607
__m128i z = _mm_or_si128(_mm_slli_epi32(y_mod_100, 16), y_div_100);
591-
__m128i bcd_shuffled = _mm_sub_epi16(_mm_slli_epi16(z, 8), _mm_mullo_epi16(c.moddiv10, _mm_mulhi_epu16(z, c.div10)));
608+
__m128i bcd_shuffled =
609+
_mm_sub_epi16(_mm_slli_epi16(z, 8),
610+
_mm_mullo_epi16(c.moddiv10, _mm_mulhi_epu16(z, c.div10)));
592611
__m128i bcd = _mm_shuffle_epi32(bcd_shuffled, _MM_SHUFFLE(0, 1, 2, 3));
593-
# endif // ZMIJ_USE_SSE4_1
612+
# endif // ZMIJ_USE_SSE4_1
594613

595614
auto digits = _mm_or_si128(bcd, c.ascii0);
596615

@@ -605,7 +624,7 @@ auto write_significand17(char* buffer, uint64_t value) noexcept -> char* {
605624

606625
_mm_storeu_si128((__m128i*)buffer, digits);
607626
return buffer + len - int(len + (a != 0) == 1);
608-
#else // !ZMIJ_USE_NEON && !ZMIJ_USE_SSE
627+
#else // !ZMIJ_USE_NEON && !ZMIJ_USE_SSE
609628
char* start = buffer;
610629
// Each digit is denoted by a letter so value is abbccddeeffgghhii.
611630
uint32_t abbccddee = uint32_t(value / 100'000'000);
@@ -862,8 +881,8 @@ auto write(Float value, char* buffer) noexcept -> char* {
862881
return buffer + 2;
863882
}
864883
// 19 is faster or equal to 12 even for 3 digits.
865-
constexpr int div_exp = 19, div_sig = (1 << div_exp) / 100 + 1;
866-
uint32_t digit = (uint32_t(dec_exp) * div_sig) >> div_exp; // value / 100
884+
uint32_t digit =
885+
(uint32_t(dec_exp) * div100_sig) >> div100_exp; // value / 100
867886
uint32_t digit_with_nuls = '0' + digit;
868887
if (is_big_endian()) digit_with_nuls <<= 24;
869888
memcpy(buffer, &digit_with_nuls, 4);

0 commit comments

Comments
 (0)