Skip to content

Commit 6c1ed96

Browse files
committed
fwrite scientific/decimal format to exactly match write.csv, #1664
1 parent 17df2f4 commit 6c1ed96

File tree

3 files changed

+114
-43
lines changed

3 files changed

+114
-43
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Thanks to Otto Seiskari for the initial pull request [#580](https://github.com/Rdatatable/data.table/issues/580) that provided C code, R wrapper, manual page and extensive tests.
1616
* From there Matt parallelized and specialized C functions for writing integer/numeric values. See [this blog post](http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/) for implementation details and benchmarks.
1717
* Caught in development before release to CRAN: thanks to Francesco Grossetti for [#1725](https://github.com/Rdatatable/data.table/issues/1725) (NA handling) and Torsten Betz for [#1847](https://github.com/Rdatatable/data.table/issues/1847) (rounding of 9.999999999999998).
18+
* `fwrite` status is being tracked here: [#1664](https://github.com/Rdatatable/data.table/issues/1664)
1819

1920
2. `fread()`:
2021
* gains `quote` argument. `quote = ""` disables quoting altogether which reads each field *as is*, [#1367](https://github.com/Rdatatable/data.table/issues/1367). Thanks @manimal.

inst/tests/tests.Rraw

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9365,19 +9365,68 @@ test(1728.11, DT[order(x,na.last=TRUE)], DT[c(2,1)])
93659365
test(1728.12, DT[order(x,na.last=FALSE)], DT)
93669366
test(1728.13, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong
93679367

9368-
# fwrite wrong and crash on 9.9999999999999982236431605, #1847
9369-
test(1729.1, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))),
9370-
output="V1,V21,1e+1")
9371-
test(1729.2, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))),
9372-
output="V2,V11e+1,1")
9373-
DT = data.table(data.table(c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9,
9374-
0.000000000000000000000999999999999999999999999,
9375-
99999999999999999999999999999.999999)))
9376-
ans1 = "V19.99999999999e+99.9e-169e-220.999.19.99e+11e-211e+29"
9377-
ans2 = "V19999999999.999.9e-169e-220.999.199.91e-211e+29"
9378-
# both ans1 and ans2 are correct. TODO: make the same
9379-
test(1729.3, fwrite(DT), output=ans1)
9380-
test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans2)
9368+
if (.Machine$sizeof.longdouble == 16) {
9369+
# so as not to run on solaris-sparc 32bit which doesn't have long double
9370+
# fwrite wrong and crash on 9.9999999999999982236431605, #1847
9371+
test(1729.1, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))),
9372+
output="V1,V21,10")
9373+
test(1729.2, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))),
9374+
output="V2,V110,1")
9375+
DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9,
9376+
0.000000000000000000000999999999999999999999999,
9377+
99999999999999999999999999999.999999))
9378+
ans = "V19999999999.999.9e-169e-220.999.199.91e-211e+29"
9379+
test(1729.3, fwrite(DT), output=ans)
9380+
test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans)
9381+
# same decimal/scientific rule (shortest format) as write.csv
9382+
DT = data.table(V1=c(-00000.00006, -123456789.123456789,
9383+
seq.int(-1000,1000,17),
9384+
seq(-1000,1000,pi*87),
9385+
-1.2345678912345 * 10^(c((-30):30)),
9386+
+1.2345678912345 * 10^(c((-30):30)),
9387+
-1.2345 * 10^((-20):20),
9388+
+1.2345 * 10^((-20):20),
9389+
-1.7 * 10^((-20):20),
9390+
+1.7 * 10^((-20):20),
9391+
-7 * 10^((-20):20),
9392+
+7 * 10^((-20):20),
9393+
0, NA, NaN, Inf, -Inf,
9394+
5.123456789e-290, -5.123456789e-290, 5.123456789e+307, -5.123456789e+307))
9395+
test(1729.5, nrow(DT), 505)
9396+
x = capture.output(fwrite(DT,na="NA"))[-1] # -1 to remove the column name V1
9397+
y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))[-1]
9398+
# One mismatch that seems to be accuracy in write.csv
9399+
# tmp = cbind(row=1:length(x), `fwrite`=x, `write.csv`=y)
9400+
# tmp[x!=y,]
9401+
# row fwrite write.csv
9402+
# 177 "-1234567891234500000" "-1234567891234499840"
9403+
# 238 "1234567891234500000" "1234567891234499840"
9404+
# looking in surrounding rows for the first one shows the switch point :
9405+
# tmp[175:179,]
9406+
# row fwrite write.csv
9407+
# 175 "-12345678912345000" "-12345678912345000"
9408+
# 176 "-123456789123450000" "-123456789123450000"
9409+
# 177 "-1234567891234500000" "-1234567891234499840" # e+18 last before switch to scientific
9410+
# 178 "-1.2345678912345e+19" "-1.2345678912345e+19"
9411+
# 179 "-1.2345678912345e+20" "-1.2345678912345e+20"
9412+
test(1729.6, x[c(177,238)], c("-1234567891234500000","1234567891234500000"))
9413+
x = x[-c(177,238)]
9414+
y = y[-c(177,238)]
9415+
test(1729.7, length(x), 503)
9416+
test(1729.8, x, y) # ensure the remaining 338 character outputs match exactly
9417+
9418+
DT = data.table(c(5.123456789e-325, 5.123456789e-320, 5.123456789e-315,
9419+
5.123456789e+300, 5.123456789e+305, 5.123456789e+310,
9420+
1e-305,1e+305, 1.2e-305,1.2e+305, 1.23e-305,1.23e+305))
9421+
ans = c("V1","0","5.12346074737373e-320","5.1234567899079e-315","5.123456789e+300","5.123456789e+305",
9422+
"Inf","1e-305","1e+305","1.2e-305","1.2e+305","1.23e-305","1.23e+305")
9423+
# explicitly check against ans rather than just comparing fwrite to write.csv so that :
9424+
# i) we can easily see intended results right here in future without needing to run
9425+
# ii) we don't get a false pass if fwrite and write.csv agree but are both wrong because of
9426+
# a problem with the test mechanism itself or something else strange or unexpected
9427+
test(1729.9, capture.output(fwrite(DT)), ans)
9428+
test(1729.11, capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)), ans)
9429+
}
93819430

93829431

93839432
##########################

src/fwrite.c

Lines changed: 51 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -77,58 +77,79 @@ static inline void writeNumeric(double x, char **thisCh)
7777
// iv) shorter, easier to read and reason with. In one self contained place.
7878
char *ch = *thisCh;
7979
if (!R_FINITE(x)) {
80-
if (ISNA(x)) {
80+
if (ISNAN(x)) {
8181
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } // by default na_len==0 and the memcpy call will be skipped
82-
} else if (ISNAN(x)) {
83-
*ch++ = 'N'; *ch++ = 'a'; *ch++ = 'N';
8482
} else if (x>0) {
8583
*ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f';
8684
} else {
8785
*ch++ = '-'; *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f';
8886
}
8987
} else if (x == 0.0) {
9088
*ch++ = '0'; // and we're done. so much easier rather than passing back special cases
91-
} else if (x==(int)x && x>=INT_MIN && x<=INT_MAX) {
92-
// it's not really a real; users often end up with integers stored as type double
93-
// use writeInteger instead for speed
94-
// careful not to pass NA_INTEGER (<INT_MIN) to writeInteger otherwise it'd get written NA
95-
writeInteger((int)x, thisCh);
96-
return;
9789
} else {
98-
if (x < 0.0) { *ch++ = '-'; x = -x; } // and we're done on sign. no need to pass back sign, already written to output
90+
if (x < 0.0) { *ch++ = '-'; x = -x; } // and we're done on sign, already written. no need to pass back sign
9991
int exp = (int)floor(log10(x));
100-
unsigned long long l = (unsigned long long)(x * pow(10, NUM_SF-exp));
101-
// TODO?: use lookup table like base R ....... ^^^
102-
// here in fwrite it might make a difference wheras in base R other very
103-
// significant write.table inefficiency dominates
104-
// l now contains NUM_SF+1 digits. The last one is used to round.
105-
if (l%10 >= 5) l+=10;
92+
unsigned long long l = (unsigned long long)((long double)x * powl(10, NUM_SF-exp));
93+
// TODO?: use lookup table like base R? .................... ^^^^
94+
// here in fwrite it might make a difference whereas in base R other very
95+
// significant write.table inefficiency dominates.
96+
// long double needed for 1729.9 to ensure 1e-310 doesn't write as 0.
97+
// l now contains NUM_SF+1 digits.
98+
// ULL for compound accuracy. If double, the repeated base 10 ops below could compound errors
99+
if (l%10 >= 5) l+=10; // use the last digit to round
106100
l /= 10;
107101
if (l == 0) {
102+
if (*(ch-1)=='-') ch--; //
108103
*ch++ = '0';
109104
} else {
110-
// Count trailing zeros and therefore s.f. present
105+
// Count trailing zeros and therefore s.f. present in l
111106
int trailZero = 0;
112107
while (l%10 == 0) { l /= 10; trailZero++; }
113108
int sf = NUM_SF - trailZero;
114109
if (sf==0) {sf=1; exp++;} // e.g. l was 9999999[5-9] rounded to 10000000 which added 1 digit
115-
// TODO: Improve deciding what's shortest to write here.
116-
if (exp<0 && exp>-5) { sf-=exp; exp=0; }
117-
ch += sf;
118-
for (int i=sf; i>1; i--) {
119-
*ch-- = '0' + l%10; // l is long for compound accuracy. If kept in double, repeated *=10. or /=10. could compound errors
120-
l /= 10;
110+
111+
// l is now an unsigned long that doesn't start or end with 0
112+
// sf is the number of digits now in l
113+
// exp is e<exp> were l to be written with the decimal sep after the first digit
114+
int dr = sf-exp-1; // how many characters to print to the right of the decimal place
115+
int width=0; // field width were it written decimal format. Used to decide whether to or not.
116+
int dl0=0; // how many 0's to add to the left of the decimal place before starting l
117+
if (dr<=0) { dl0=-dr; dr=0; width=sf+dl0; } // 1, 10, 100, 99000
118+
else {
119+
if (sf>dr) width=sf+1; // 1.234 and 123.4
120+
else { dl0=1; width=dr+1+dl0; } // 0.1234, 0.0001234
121121
}
122-
if (sf == 1) ch--; else *ch-- = DECIMAL_SEP;
123-
*ch = '0' + l;
124-
ch += sf + (sf>1);
125-
if (exp != 0) {
122+
// So: 3.1416 => l=31416, sf=5, exp=0 dr=4; dl0=0; width=6
123+
// 30460 => l=3046, sf=4, exp=4 dr=0; dl0=1; width=5
124+
// 0.0072 => l=72, sf=2, exp=-3 dr=4; dl0=1; width=6
125+
if (width <= sf + (sf>1) + 2 + (abs(exp)>99?3:2)) {
126+
// ^^^^ to not include 1 char for dec in -7e-04 where sf==1
127+
// ^ 2 for 'e+'/'e-'
128+
// decimal format ...
129+
ch += width-1;
130+
if (dr) {
131+
while (dr && sf) { *ch--='0'+l%10; l/=10; dr--; sf--; }
132+
while (dr) { *ch--='0'; dr--; }
133+
*ch-- = DECIMAL_SEP;
134+
}
135+
while (dl0) { *ch--='0'; dl0--; }
136+
while (sf) { *ch--='0'+l%10; l/=10; sf--; }
137+
// ch is now 1 before the first char of the field so position it afterward again, and done
138+
ch += width+1;
139+
} else {
140+
// scientific ...
141+
ch += sf; // sf-1 + 1 for dec
142+
for (int i=sf; i>1; i--) {
143+
*ch-- = '0' + l%10;
144+
l /= 10;
145+
}
146+
if (sf == 1) ch--; else *ch-- = DECIMAL_SEP;
147+
*ch = '0' + l;
148+
ch += sf + (sf>1);
126149
*ch++ = 'e'; // lower case e to match base::write.csv
127150
if (exp < 0) { *ch++ = '-'; exp=-exp; }
128151
else { *ch++ = '+'; } // to match base::write.csv
129-
if (exp < 10) {
130-
*ch++ = '0' + exp;
131-
} else if (exp < 100) {
152+
if (exp < 100) {
132153
*ch++ = '0' + (exp / 10);
133154
*ch++ = '0' + (exp % 10);
134155
} else {

0 commit comments

Comments
 (0)