fwrite scientific/decimal format to exactly match write.csv, #1664

mattdowle · mattdowle · commit 6c1ed96d45df · 2016-10-28T19:48:28.000-07:00
diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,7 @@
     * Thanks to Otto Seiskari for the initial pull request [#580](https://github.com/Rdatatable/data.table/issues/580) that provided C code, R wrapper, manual page and extensive tests.
     * From there Matt parallelized and specialized C functions for writing integer/numeric values. See [this blog post](http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/) for implementation details and benchmarks.
     * Caught in development before release to CRAN: thanks to Francesco Grossetti for [#1725](https://github.com/Rdatatable/data.table/issues/1725) (NA handling) and Torsten Betz for [#1847](https://github.com/Rdatatable/data.table/issues/1847) (rounding of 9.999999999999998).
+    * `fwrite` status is being tracked here: [#1664](https://github.com/Rdatatable/data.table/issues/1664)
 
   2. `fread()`:
     * gains `quote` argument. `quote = ""` disables quoting altogether which reads each field *as is*, [#1367](https://github.com/Rdatatable/data.table/issues/1367). Thanks @manimal.
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -9365,19 +9365,68 @@ test(1728.11, DT[order(x,na.last=TRUE)], DT[c(2,1)])
 test(1728.12, DT[order(x,na.last=FALSE)], DT)
 test(1728.13, DT[order(x,na.last=NA)], DT[2])  # was randomly wrong
 
-# fwrite wrong and crash on 9.9999999999999982236431605, #1847
-test(1729.1, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))),
-             output="V1,V21,1e+1")
-test(1729.2, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))),
-             output="V2,V11e+1,1")
-DT = data.table(data.table(c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9,
-                                 0.000000000000000000000999999999999999999999999,
-                                 99999999999999999999999999999.999999)))
-ans1 = "V19.99999999999e+99.9e-169e-220.999.19.99e+11e-211e+29"
-ans2 = "V19999999999.999.9e-169e-220.999.199.91e-211e+29"
-# both ans1 and ans2 are correct.  TODO: make the same
-test(1729.3, fwrite(DT), output=ans1)
-test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans2)
+if (.Machine$sizeof.longdouble == 16) {
+  # so as not to run on solaris-sparc 32bit which doesn't have long double
+  # fwrite wrong and crash on 9.9999999999999982236431605, #1847
+  test(1729.1, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))),
+               output="V1,V21,10")
+  test(1729.2, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))),
+               output="V2,V110,1")
+  DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9,
+                       0.000000000000000000000999999999999999999999999,
+                       99999999999999999999999999999.999999))
+  ans = "V19999999999.999.9e-169e-220.999.199.91e-211e+29"
+  test(1729.3, fwrite(DT), output=ans)
+  test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans)
+  # same decimal/scientific rule (shortest format) as write.csv
+  DT = data.table(V1=c(-00000.00006, -123456789.123456789,
+                       seq.int(-1000,1000,17),
+                       seq(-1000,1000,pi*87),
+                       -1.2345678912345 * 10^(c((-30):30)),
+                       +1.2345678912345 * 10^(c((-30):30)),
+                       -1.2345 * 10^((-20):20),
+                       +1.2345 * 10^((-20):20),
+                       -1.7 * 10^((-20):20),
+                       +1.7 * 10^((-20):20),
+                       -7 * 10^((-20):20),
+                       +7 * 10^((-20):20),
+                       0, NA, NaN, Inf, -Inf,
+                       5.123456789e-290, -5.123456789e-290, 5.123456789e+307, -5.123456789e+307))
+  test(1729.5, nrow(DT), 505)
+  x = capture.output(fwrite(DT,na="NA"))[-1]   # -1 to remove the column name V1
+  y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))[-1]
+  # Two mismatches (rows 177 and 238: the same magnitude with opposite signs) that seem to be an accuracy issue in write.csv
+  # tmp = cbind(row=1:length(x), `fwrite`=x, `write.csv`=y)
+  # tmp[x!=y,]
+  # row  fwrite                  write.csv       
+  # 177  "-1234567891234500000"  "-1234567891234499840"
+  # 238  "1234567891234500000"   "1234567891234499840"
+  # looking in surrounding rows for the first one shows the switch point :
+  # tmp[175:179,]
+  # row  fwrite                  write.csv       
+  # 175  "-12345678912345000"    "-12345678912345000"  
+  # 176  "-123456789123450000"   "-123456789123450000" 
+  # 177  "-1234567891234500000"  "-1234567891234499840"   # e+18 last before switch to scientific
+  # 178  "-1.2345678912345e+19"  "-1.2345678912345e+19"
+  # 179  "-1.2345678912345e+20"  "-1.2345678912345e+20"
+  test(1729.6, x[c(177,238)], c("-1234567891234500000","1234567891234500000"))
+  x = x[-c(177,238)]
+  y = y[-c(177,238)]
+  test(1729.7, length(x), 503)
+  test(1729.8, x, y)  # ensure the remaining 503 character outputs match exactly
+
+  DT = data.table(c(5.123456789e-325, 5.123456789e-320, 5.123456789e-315,
+                    5.123456789e+300, 5.123456789e+305, 5.123456789e+310,
+                    1e-305,1e+305, 1.2e-305,1.2e+305, 1.23e-305,1.23e+305))
+  ans = c("V1","0","5.12346074737373e-320","5.1234567899079e-315","5.123456789e+300","5.123456789e+305",
+          "Inf","1e-305","1e+305","1.2e-305","1.2e+305","1.23e-305","1.23e+305")
+  # explicitly check against ans rather than just comparing fwrite to write.csv so that :
+  # i) we can easily see intended results right here in future without needing to run
+  # ii) we don't get a false pass if fwrite and write.csv agree but are both wrong because of
+  #     a problem with the test mechanism itself or something else strange or unexpected
+  test(1729.9, capture.output(fwrite(DT)), ans)
+  test(1729.11, capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)), ans)
+}
 
 
 ##########################
diff --git a/src/fwrite.c b/src/fwrite.c
@@ -77,58 +77,79 @@ static inline void writeNumeric(double x, char **thisCh)
   //  iv) shorter, easier to read and reason with. In one self contained place.
   char *ch = *thisCh;
   if (!R_FINITE(x)) {
-    if (ISNA(x)) {
+    if (ISNAN(x)) {
       if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }  // by default na_len==0 and the memcpy call will be skipped
-    } else if (ISNAN(x)) {
-      *ch++ = 'N'; *ch++ = 'a'; *ch++ = 'N';
     } else if (x>0) {
       *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f';
     } else {
       *ch++ = '-'; *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f';
     }
   } else if (x == 0.0) {
     *ch++ = '0';   // and we're done.  so much easier rather than passing back special cases
-  } else if (x==(int)x && x>=INT_MIN && x<=INT_MAX) {
-    // it's not really a real; users often end up with integers stored as type double
-    // use writeInteger instead for speed
-    // careful not to pass NA_INTEGER (<INT_MIN) to writeInteger otherwise it'd get written NA
-    writeInteger((int)x, thisCh);
-    return;
   } else {
-    if (x < 0.0) { *ch++ = '-'; x = -x; }  // and we're done on sign.  no need to pass back sign, already written to output
+    if (x < 0.0) { *ch++ = '-'; x = -x; }  // and we're done on sign, already written. no need to pass back sign
     int exp = (int)floor(log10(x));
-    unsigned long long l = (unsigned long long)(x * pow(10, NUM_SF-exp));
-    // TODO?: use lookup table like base R .......  ^^^       
-    //        here in fwrite it might make a difference wheras in base R other very
-    //        significant write.table inefficiency dominates
-    // l now contains NUM_SF+1 digits. The last one is used to round.  
-    if (l%10 >= 5) l+=10;
+    unsigned long long l = (unsigned long long)((long double)x * powl(10, NUM_SF-exp));
+    // TODO?: use lookup table like base R? .................... ^^^^
+    //        here in fwrite it might make a difference whereas in base R other very
+    //        significant write.table inefficiency dominates.
+    // long double needed for 1729.9 to ensure 1e-310 doesn't write as 0.
+    // l now contains NUM_SF+1 digits.
+    // ULL for compound accuracy. If double, the repeated base 10 ops below could compound errors
+    if (l%10 >= 5) l+=10; // use the last digit to round
     l /= 10;
     if (l == 0) {
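+      if (*(ch-1)=='-') ch--; // value rounded to zero: step back over the '-' so that 0 is written rather than -0. NOTE(review): assumes a '-' just before ch is this field's own sign - confirm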
       *ch++ = '0';
     } else {
-      // Count trailing zeros and therefore s.f. present
+      // Count trailing zeros and therefore s.f. present in l
       int trailZero = 0;
       while (l%10 == 0) { l /= 10; trailZero++; }
       int sf = NUM_SF - trailZero;
       if (sf==0) {sf=1; exp++;}  // e.g. l was 9999999[5-9] rounded to 10000000 which added 1 digit
-      // TODO: Improve deciding what's shortest to write here.
-      if (exp<0 && exp>-5) { sf-=exp; exp=0; }
-      ch += sf;
-      for (int i=sf; i>1; i--) {
-        *ch-- = '0' + l%10;   // l is long for compound accuracy. If kept in double, repeated *=10. or /=10. could compound errors 
-        l /= 10;
+      
+      // l is now an unsigned long long that doesn't start or end with 0
+      // sf is the number of digits now in l
+      // exp is e<exp> were l to be written with the decimal sep after the first digit
+      int dr = sf-exp-1; // how many characters to print to the right of the decimal place
+      int width=0;       // field width were it written in decimal format. Used to decide between decimal and scientific format.
+      int dl0=0;         // how many 0's to add to the left of the decimal place before starting l
+      if (dr<=0) { dl0=-dr; dr=0; width=sf+dl0; }  // 1, 10, 100, 99000
+      else {
+        if (sf>dr) width=sf+1;                     // 1.234 and 123.4
+        else { dl0=1; width=dr+1+dl0; }            // 0.1234, 0.0001234
       }
-      if (sf == 1) ch--; else *ch-- = DECIMAL_SEP;
-      *ch = '0' + l;
-      ch += sf + (sf>1);
-      if (exp != 0) {
+      // So:  3.1416 => l=31416, sf=5, exp=0     dr=4; dl0=0; width=6
+      //      30460  => l=3046, sf=4, exp=4      dr=0; dl0=1; width=5
+      //      0.0072 => l=72, sf=2, exp=-3       dr=4; dl0=1; width=6
+      if (width <= sf + (sf>1) + 2 + (abs(exp)>99?3:2)) {
+         //              ^^^^ to not include 1 char for dec in -7e-04 where sf==1
+         //                      ^ 2 for 'e+'/'e-'
+         // decimal format ...
+         ch += width-1;
+         if (dr) {
+           while (dr && sf) { *ch--='0'+l%10; l/=10; dr--; sf--; }
+           while (dr) { *ch--='0'; dr--; }
+           *ch-- = DECIMAL_SEP;
+         }
+         while (dl0) { *ch--='0'; dl0--; }
+         while (sf) { *ch--='0'+l%10; l/=10; sf--; }
+         // ch is now 1 before the first char of the field so position it afterward again, and done
+         ch += width+1;
+      } else {
+        // scientific ...
+        ch += sf;  // sf-1 + 1 for dec
+        for (int i=sf; i>1; i--) {
+          *ch-- = '0' + l%10;   
+          l /= 10;
+        }
+        if (sf == 1) ch--; else *ch-- = DECIMAL_SEP;
+        *ch = '0' + l;
+        ch += sf + (sf>1);
         *ch++ = 'e';  // lower case e to match base::write.csv
         if (exp < 0) { *ch++ = '-'; exp=-exp; }
         else { *ch++ = '+'; }  // to match base::write.csv
-        if (exp < 10) {
-          *ch++ = '0' + exp;
-        } else if (exp < 100) {
+        if (exp < 100) {
           *ch++ = '0' + (exp / 10);
           *ch++ = '0' + (exp % 10);
         } else {