Skip to content

Commit 16fdb2d

Browse files
committed
fix(csv-parse): record_delimiter and non default encoding (fix #365)
1 parent 8ed0e18 commit 16fdb2d

15 files changed

Lines changed: 392 additions & 191 deletions

File tree

26 Bytes
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
a,b,c
2+
1,2,3

demo/issues-esm/lib/365.js

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
2+
import fs from 'fs/promises'
3+
import { parse } from 'csv-parse/sync'
4+
import { dirname, join } from 'path'
5+
import { fileURLToPath } from 'url'
6+
7+
const __filename = fileURLToPath(import.meta.url);
8+
const __dirname = dirname(__filename);
9+
10+
(async () => {
11+
const data = await fs.readFile(`${__dirname}/365-utf16le-bom-windows.csv`)
12+
const records = parse(data, {bom: true})
13+
console.log('utf16le', records)
14+
})();
15+
16+
(async () => {
17+
const data = await fs.readFile(`${__dirname}/365-utf8-bom-windows.csv`)
18+
const records = parse(data, {bom: true})
19+
console.log('utf8', records)
20+
})();

packages/csv-parse/dist/cjs/index.cjs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ const init_state = function(options){
155155
record: [],
156156
recordHasError: false,
157157
record_length: 0,
158-
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
158+
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
159159
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
160160
wasQuoting: false,
161161
wasRowDelimiter: false,
@@ -620,16 +620,22 @@ const transform = function(original_options = {}) {
620620
state: init_state(options),
621621
__needMoreData: function(i, bufLen, end){
622622
if(end) return false;
623-
const {quote} = this.options;
623+
const {encoding, escape, quote} = this.options;
624624
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
625625
const numOfCharLeft = bufLen - i - 1;
626626
const requiredLength = Math.max(
627627
needMoreDataSize,
628628
// Skip if the remaining buffer smaller than record delimiter
629-
recordDelimiterMaxLength,
630-
// Skip if the remaining buffer can be record delimiter following the closing quote
631-
// 1 is for quote.length
632-
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
629+
// If "record_delimiter" is yet to be discovered:
630+
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
631+
// 2. We set the length to windows line ending in the current encoding
632+
// Note that the encoding is already known at that point, either from the user or from BOM discovery
633+
// recordDelimiterMaxLength,
634+
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
635+
// Skip if remaining buffer can be an escaped quote
636+
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
637+
// Skip if remaining buffer can be record delimiter following the closing quote
638+
quoting ? (quote.length) : 0,
633639
);
634640
return numOfCharLeft < requiredLength;
635641
},
@@ -1224,22 +1230,26 @@ const transform = function(original_options = {}) {
12241230
return true;
12251231
},
12261232
__autoDiscoverRecordDelimiter: function(buf, pos){
1227-
const {encoding} = this.options;
1228-
const chr = buf[pos];
1229-
if(chr === cr){
1230-
if(buf[pos+1] === nl){
1231-
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
1232-
this.state.recordDelimiterMaxLength = 2;
1233-
return 2;
1234-
}else {
1235-
this.options.record_delimiter.push(Buffer.from('\r', encoding));
1236-
this.state.recordDelimiterMaxLength = 1;
1237-
return 1;
1233+
const { encoding } = this.options;
1234+
// Note, we don't need to cache this information in state,
1235+
// It is only called on the first line until we find out a suitable
1236+
// record delimiter.
1237+
const rds = [
1238+
// Important: the Windows line ending (\r\n) must be tested before the Mac OS 9 one (\r)
1239+
Buffer.from('\r\n', encoding),
1240+
Buffer.from('\n', encoding),
1241+
Buffer.from('\r', encoding),
1242+
];
1243+
loop: for(let i = 0; i < rds.length; i++){
1244+
const l = rds[i].length;
1245+
for(let j = 0; j < l; j++){
1246+
if(rds[i][j] !== buf[pos + j]){
1247+
continue loop;
1248+
}
12381249
}
1239-
}else if(chr === nl){
1240-
this.options.record_delimiter.push(Buffer.from('\n', encoding));
1241-
this.state.recordDelimiterMaxLength = 1;
1242-
return 1;
1250+
this.options.record_delimiter.push(rds[i]);
1251+
this.state.recordDelimiterMaxLength = rds[i].length;
1252+
return rds[i].length;
12431253
}
12441254
return 0;
12451255
},

packages/csv-parse/dist/cjs/sync.cjs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ const init_state = function(options){
153153
record: [],
154154
recordHasError: false,
155155
record_length: 0,
156-
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
156+
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
157157
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
158158
wasQuoting: false,
159159
wasRowDelimiter: false,
@@ -618,16 +618,22 @@ const transform = function(original_options = {}) {
618618
state: init_state(options),
619619
__needMoreData: function(i, bufLen, end){
620620
if(end) return false;
621-
const {quote} = this.options;
621+
const {encoding, escape, quote} = this.options;
622622
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
623623
const numOfCharLeft = bufLen - i - 1;
624624
const requiredLength = Math.max(
625625
needMoreDataSize,
626626
// Skip if the remaining buffer smaller than record delimiter
627-
recordDelimiterMaxLength,
628-
// Skip if the remaining buffer can be record delimiter following the closing quote
629-
// 1 is for quote.length
630-
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
627+
// If "record_delimiter" is yet to be discovered:
628+
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
629+
// 2. We set the length to windows line ending in the current encoding
630+
// Note that the encoding is already known at that point, either from the user or from BOM discovery
631+
// recordDelimiterMaxLength,
632+
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
633+
// Skip if remaining buffer can be an escaped quote
634+
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
635+
// Skip if remaining buffer can be record delimiter following the closing quote
636+
quoting ? (quote.length) : 0,
631637
);
632638
return numOfCharLeft < requiredLength;
633639
},
@@ -1222,22 +1228,26 @@ const transform = function(original_options = {}) {
12221228
return true;
12231229
},
12241230
__autoDiscoverRecordDelimiter: function(buf, pos){
1225-
const {encoding} = this.options;
1226-
const chr = buf[pos];
1227-
if(chr === cr){
1228-
if(buf[pos+1] === nl){
1229-
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
1230-
this.state.recordDelimiterMaxLength = 2;
1231-
return 2;
1232-
}else {
1233-
this.options.record_delimiter.push(Buffer.from('\r', encoding));
1234-
this.state.recordDelimiterMaxLength = 1;
1235-
return 1;
1231+
const { encoding } = this.options;
1232+
// Note, we don't need to cache this information in state,
1233+
// It is only called on the first line until we find out a suitable
1234+
// record delimiter.
1235+
const rds = [
1236+
// Important: the Windows line ending (\r\n) must be tested before the Mac OS 9 one (\r)
1237+
Buffer.from('\r\n', encoding),
1238+
Buffer.from('\n', encoding),
1239+
Buffer.from('\r', encoding),
1240+
];
1241+
loop: for(let i = 0; i < rds.length; i++){
1242+
const l = rds[i].length;
1243+
for(let j = 0; j < l; j++){
1244+
if(rds[i][j] !== buf[pos + j]){
1245+
continue loop;
1246+
}
12361247
}
1237-
}else if(chr === nl){
1238-
this.options.record_delimiter.push(Buffer.from('\n', encoding));
1239-
this.state.recordDelimiterMaxLength = 1;
1240-
return 1;
1248+
this.options.record_delimiter.push(rds[i]);
1249+
this.state.recordDelimiterMaxLength = rds[i].length;
1250+
return rds[i].length;
12411251
}
12421252
return 0;
12431253
},

packages/csv-parse/dist/esm/index.js

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5211,7 +5211,7 @@ const init_state = function(options){
52115211
record: [],
52125212
recordHasError: false,
52135213
record_length: 0,
5214-
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
5214+
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
52155215
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
52165216
wasQuoting: false,
52175217
wasRowDelimiter: false,
@@ -5676,16 +5676,22 @@ const transform = function(original_options = {}) {
56765676
state: init_state(options),
56775677
__needMoreData: function(i, bufLen, end){
56785678
if(end) return false;
5679-
const {quote} = this.options;
5679+
const {encoding, escape, quote} = this.options;
56805680
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
56815681
const numOfCharLeft = bufLen - i - 1;
56825682
const requiredLength = Math.max(
56835683
needMoreDataSize,
56845684
// Skip if the remaining buffer smaller than record delimiter
5685-
recordDelimiterMaxLength,
5686-
// Skip if the remaining buffer can be record delimiter following the closing quote
5687-
// 1 is for quote.length
5688-
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
5685+
// If "record_delimiter" is yet to be discovered:
5686+
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
5687+
// 2. We set the length to windows line ending in the current encoding
5688+
// Note that the encoding is already known at that point, either from the user or from BOM discovery
5689+
// recordDelimiterMaxLength,
5690+
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
5691+
// Skip if remaining buffer can be an escaped quote
5692+
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
5693+
// Skip if remaining buffer can be record delimiter following the closing quote
5694+
quoting ? (quote.length) : 0,
56895695
);
56905696
return numOfCharLeft < requiredLength;
56915697
},
@@ -6280,22 +6286,26 @@ const transform = function(original_options = {}) {
62806286
return true;
62816287
},
62826288
__autoDiscoverRecordDelimiter: function(buf, pos){
6283-
const {encoding} = this.options;
6284-
const chr = buf[pos];
6285-
if(chr === cr){
6286-
if(buf[pos+1] === nl){
6287-
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
6288-
this.state.recordDelimiterMaxLength = 2;
6289-
return 2;
6290-
}else {
6291-
this.options.record_delimiter.push(Buffer.from('\r', encoding));
6292-
this.state.recordDelimiterMaxLength = 1;
6293-
return 1;
6289+
const { encoding } = this.options;
6290+
// Note, we don't need to cache this information in state,
6291+
// It is only called on the first line until we find out a suitable
6292+
// record delimiter.
6293+
const rds = [
6294+
// Important: the Windows line ending (\r\n) must be tested before the Mac OS 9 one (\r)
6295+
Buffer.from('\r\n', encoding),
6296+
Buffer.from('\n', encoding),
6297+
Buffer.from('\r', encoding),
6298+
];
6299+
loop: for(let i = 0; i < rds.length; i++){
6300+
const l = rds[i].length;
6301+
for(let j = 0; j < l; j++){
6302+
if(rds[i][j] !== buf[pos + j]){
6303+
continue loop;
6304+
}
62946305
}
6295-
}else if(chr === nl){
6296-
this.options.record_delimiter.push(Buffer.from('\n', encoding));
6297-
this.state.recordDelimiterMaxLength = 1;
6298-
return 1;
6306+
this.options.record_delimiter.push(rds[i]);
6307+
this.state.recordDelimiterMaxLength = rds[i].length;
6308+
return rds[i].length;
62996309
}
63006310
return 0;
63016311
},

packages/csv-parse/dist/esm/sync.js

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2121,7 +2121,7 @@ const init_state = function(options){
21212121
record: [],
21222122
recordHasError: false,
21232123
record_length: 0,
2124-
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
2124+
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
21252125
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
21262126
wasQuoting: false,
21272127
wasRowDelimiter: false,
@@ -2586,16 +2586,22 @@ const transform = function(original_options = {}) {
25862586
state: init_state(options),
25872587
__needMoreData: function(i, bufLen, end){
25882588
if(end) return false;
2589-
const {quote} = this.options;
2589+
const {encoding, escape, quote} = this.options;
25902590
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
25912591
const numOfCharLeft = bufLen - i - 1;
25922592
const requiredLength = Math.max(
25932593
needMoreDataSize,
25942594
// Skip if the remaining buffer smaller than record delimiter
2595-
recordDelimiterMaxLength,
2596-
// Skip if the remaining buffer can be record delimiter following the closing quote
2597-
// 1 is for quote.length
2598-
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
2595+
// If "record_delimiter" is yet to be discovered:
2596+
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
2597+
// 2. We set the length to windows line ending in the current encoding
2598+
// Note that the encoding is already known at that point, either from the user or from BOM discovery
2599+
// recordDelimiterMaxLength,
2600+
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
2601+
// Skip if remaining buffer can be an escaped quote
2602+
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
2603+
// Skip if remaining buffer can be record delimiter following the closing quote
2604+
quoting ? (quote.length) : 0,
25992605
);
26002606
return numOfCharLeft < requiredLength;
26012607
},
@@ -3190,22 +3196,26 @@ const transform = function(original_options = {}) {
31903196
return true;
31913197
},
31923198
__autoDiscoverRecordDelimiter: function(buf, pos){
3193-
const {encoding} = this.options;
3194-
const chr = buf[pos];
3195-
if(chr === cr){
3196-
if(buf[pos+1] === nl){
3197-
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
3198-
this.state.recordDelimiterMaxLength = 2;
3199-
return 2;
3200-
}else {
3201-
this.options.record_delimiter.push(Buffer.from('\r', encoding));
3202-
this.state.recordDelimiterMaxLength = 1;
3203-
return 1;
3199+
const { encoding } = this.options;
3200+
// Note, we don't need to cache this information in state,
3201+
// It is only called on the first line until we find out a suitable
3202+
// record delimiter.
3203+
const rds = [
3204+
// Important: the Windows line ending (\r\n) must be tested before the Mac OS 9 one (\r)
3205+
Buffer.from('\r\n', encoding),
3206+
Buffer.from('\n', encoding),
3207+
Buffer.from('\r', encoding),
3208+
];
3209+
loop: for(let i = 0; i < rds.length; i++){
3210+
const l = rds[i].length;
3211+
for(let j = 0; j < l; j++){
3212+
if(rds[i][j] !== buf[pos + j]){
3213+
continue loop;
3214+
}
32043215
}
3205-
}else if(chr === nl){
3206-
this.options.record_delimiter.push(Buffer.from('\n', encoding));
3207-
this.state.recordDelimiterMaxLength = 1;
3208-
return 1;
3216+
this.options.record_delimiter.push(rds[i]);
3217+
this.state.recordDelimiterMaxLength = rds[i].length;
3218+
return rds[i].length;
32093219
}
32103220
return 0;
32113221
},

0 commit comments

Comments
 (0)