RFC 4180 Solution Using Regular Expression
Regular expressions to the rescue! These few lines of code properly handle quoted fields with embedded commas, quotes, and newlines based on the RFC 4180 standard.
function parseCsv(data, fieldSep, newLine) {
fieldSep = fieldSep || ',';
newLine = newLine || '\n';
const nSep = '\x1D'; const nSepRe = new RegExp(nSep, 'g');
const qSep = '\x1E'; const qSepRe = new RegExp(qSep, 'g');
const cSep = '\x1F'; const cSepRe = new RegExp(cSep, 'g');
const fieldRe = new RegExp('(^|[' + fieldSep + '\\n])"([^"]*(?:""[^"]*)*)"(?=($|[' + fieldSep + '\\n]))', 'g');
return data
.replace(/\r/g, '')
.replace(/\n+$/, '')
.replace(fieldRe, (match, p1, p2) => {
return p1 + p2.replace(/\n/g, nSep).replace(/""/g, qSep).replace(/,/g, cSep)
})
.split(/\n/)
.map(line => {
return line
.split(fieldSep)
.map(cell => cell.replace(nSepRe, newLine).replace(qSepRe, '"').replace(cSepRe, ','))
});
}
const csv = 'A1,B1,C1\n"A ""2""","B, 2","C\n2"';
const separator = ','; // field separator, default: ','
const newline = ' <br /> '; // newline representation in case a field contains newlines, default: '\n'
let grid = parseCsv(csv, separator, newline);
// expected: [ [ 'A1', 'B1', 'C1' ], [ 'A "2"', 'B, 2', 'C <br /> 2' ] ]
Notes:
- The field separator can be configured, such as
\t
for TSV (tab-separated values)
- Embedded newlines can be converted to something else, such as
<br/>
for HTML use
- This
parseCsv
function avoids negative/positive lookbehinds, e.g. works also on Safari browsers.
Unless stated elsewhere, you don't need a finite state machine. The regular expression handles RFC 4180 properly thanks to a functional programming approach using temporary replacements/restores, capture groups, and positive lookahead.
Clone/download code at https://github.com/peterthoeny/parse-csv-js
Learn more about regex: https://twiki.org/cgi-bin/view/Codev/TWikiPresentation2018x10x14Regex
Old Answer Using Lookbehinds
(This does not work on Safari browsers)
function parseCsv(data, fieldSep, newLine) {
fieldSep = fieldSep || ',';
newLine = newLine || '\n';
var nSep = '\x1D';
var qSep = '\x1E';
var cSep = '\x1F';
var nSepRe = new RegExp(nSep, 'g');
var qSepRe = new RegExp(qSep, 'g');
var cSepRe = new RegExp(cSep, 'g');
var fieldRe = new RegExp('(?<=(^|[' + fieldSep + '\\n]))"(|[\\s\\S]+?(?<![^"]"))"(?=($|[' + fieldSep + '\\n]))', 'g');
var grid = [];
data.replace(/\r/g, '').replace(/\n+$/, '').replace(fieldRe, function(match, p1, p2) {
return p2.replace(/\n/g, nSep).replace(/""/g, qSep).replace(/,/g, cSep);
}).split(/\n/).forEach(function(line) {
var row = line.split(fieldSep).map(function(cell) {
return cell.replace(nSepRe, newLine).replace(qSepRe, '"').replace(cSepRe, ',');
});
grid.push(row);
});
return grid;
}
const csv = 'A1,B1,C1\n"A ""2""","B, 2","C\n2"';
const separator = ','; // field separator, default: ','
const newline = ' <br /> '; // newline representation in case a field contains newlines, default: '\n'
var grid = parseCsv(csv, separator, newline);
// expected: [ [ 'A1', 'B1', 'C1' ], [ 'A "2"', 'B, 2', 'C <br /> 2' ] ]