1

I found this function to parse a csv string into a multi-dim array in javascript, here, however I need to update it to also allow single quotes (') to be used as the enclosure in addition to double quotes (") which it already does. i.e. I need it to be able to parse:

CSVToArray('"a, comma","2","3"');
// => [['a, comma', '2', '3']]

As well as this

CSVToArray("'a, comma','2','3'");
// => [['a, comma', '2', '3']]

The function:

// This will parse a delimited string into an array of
// arrays. The default delimiter is the comma, but this
// can be overriden in the second argument.
function CSVToArray( strData, strDelimiter ){
    // Check to see if the delimiter is defined. If not,
    // then default to comma.
    strDelimiter = (strDelimiter || ",");

    // Create a regular expression to parse the CSV values.
    var objPattern = new RegExp(
            (
                    // Delimiters.
                    "(\\" + strDelimiter + "|\\r?\\n|\\r|^)" +

                    // Quoted fields.
                    "(?:\"([^\"]*(?:\"\"[^\"]*)*)\"|" +

                    // Standard fields.
                    "([^\"\\" + strDelimiter + "\\r\\n]*))"
            ),
            "gi"
            );


    // Create an array to hold our data. Give the array
    // a default empty first row.
    var arrData = [[]];

    // Create an array to hold our individual pattern
    // matching groups.
    var arrMatches = null;


    // Keep looping over the regular expression matches
    // until we can no longer find a match.
    while (arrMatches = objPattern.exec( strData )){

            // Get the delimiter that was found.
            var strMatchedDelimiter = arrMatches[ 1 ];

            // Check to see if the given delimiter has a length
            // (is not the start of string) and if it matches
            // field delimiter. If id does not, then we know
            // that this delimiter is a row delimiter.
            if (
                    strMatchedDelimiter.length &&
                    (strMatchedDelimiter != strDelimiter)
                    ){

                    // Since we have reached a new row of data,
                    // add an empty row to our data array.
                    arrData.push( [] );

            }


            // Now that we have our delimiter out of the way,
            // let's check to see which kind of value we
            // captured (quoted or unquoted).
            if (arrMatches[ 2 ]){

                    // We found a quoted value. When we capture
                    // this value, unescape any double quotes.
                    var strMatchedValue = arrMatches[ 2 ].replace(
                            new RegExp( "\"\"", "g" ),
                            "\""
                            );

            } else {

                    // We found a non-quoted value.
                    var strMatchedValue = arrMatches[ 3 ];

            }


            // Now that we have our value string, let's add
            // it to the data array.
            arrData[ arrData.length - 1 ].push( strMatchedValue );
    }

    // Return the parsed data.
    return( arrData );
}
Community
  • 1
  • 1
Bill Dami
  • 3,205
  • 5
  • 51
  • 70

1 Answers1

3

Your base pattern contains some flaws, and not easily extensible: Delimiters are always prefixed by a \\, even when they shouldn't.

I've rewritten the code, to make it mor ereliable.Multiple quotation types are supported, and delimiters are properly escaped.

Fiddle: http://jsfiddle.net/qz53J/

function CSVToArray( strData, strDelimiter ){    
    // Properly escape the delimiter, if existent.
    // If no delimiter is given, use a comma
    strDelimiter = (strDelimiter || ",").replace(/([[^$.|?*+(){}])/g, '\\$1');

    //What are the quotation characters? "'
    var quotes = "\"'";

    // Create a regular expression to parse the CSV values.
    // match[1] = Contains the delimiter if the RegExp is not at the begin
    // match[2] = quote, if any
    // match[3] = string inside quotes, if match[2] exists
    // match[4] = non-quoted strings
    var objPattern = new RegExp(
                // Delimiter or marker of new row
        "(?:(" + strDelimiter + ")|[\\n\\r]|^)" +
                // Quoted fields
        "(?:([" + quotes + "])((?:[^" + quotes + "]+|(?!\\2).|\\2\\2)*)\\2" + 
                // Standard fields
        "|([^" + quotes + strDelimiter + "\\n\\r]*))"
    , "gi");

    // Create a matrix (2d array) to hold data, which will be returned.
    var arrData = [];

    // Execute the RegExp until no match is found
    var arrMatches;
    while ( arrMatches = objPattern.exec( strData ) ){
            // If the first group of the RegExp does is empty, no delimiter is
            // matched. This only occurs at the beginning of a new row
            if ( !arrMatches[ 1 ] ){
                    // Add an empty row to our data array.
                    arrData.push( [] );    
            }

            var quote = arrMatches[ 2 ]
            if ( quote ){
                    // We found a quoted value. When we capture
                    // this value, unescape any double quotes.
                    var strMatchedValue = arrMatches[ 3 ].replace(
                        new RegExp( quote + quote, "g" ),
                        quote
                    );
            } else {
                    // We found a non-quoted value.
                    var strMatchedValue = arrMatches[ 4 ];
            }
            // Add the found value to the array
            arrData[ arrData.length - 1 ].push( strMatchedValue );
    }
    // Return the parsed data.
    return arrData;
}
Rob W
  • 341,306
  • 83
  • 791
  • 678
  • ah, so then would this also require an additional if clause to be added for strings with single quote enclosures? Here: if (arrMatches[ 2 ]){ var strMatchedValue = arrMatches[ 2 ].replace( new RegExp( "\"\"", "g" ), "\"" ); } else { // We found a non-quoted value. var strMatchedValue = arrMatches[ 3 ]; } – Bill Dami Oct 24 '11 at 16:39
  • @BillDami I'm improving your code + RegExp. Wait about 10-15 minutes. – Rob W Oct 24 '11 at 16:44
  • Tested your function in this fiddle, http://jsfiddle.net/eZmq4/1/, and got the following error: "Uncaught SyntaxError: Invalid regular expression: /(,|[\n\r]|^)(?:(["'])((?:[^"']+|(?!\2).|\2\2)*))\2|([^"',\n\r]*))/: Unmatched ')'", which looks like it was an extra ")" in the Standard fields section of the objPattern. After that was removed, got an error "Uncaught TypeError: Cannot read property 'length' of undefined" on this line: if(strMatchedDelimiter.length && strDelimiter != strMatchedDelimiter ){ – Bill Dami Oct 24 '11 at 19:24
  • @BillDami I've fixed it by fully rewriting the code. The logic of the function has been improved, and unnecessary comments have been removed. See **Fiddle: http://jsfiddle.net/qz53J/** – Rob W Oct 24 '11 at 19:41
  • +1 Nice work. I wish I would've seen this sooner to save some time learning how to do the regex with variables part. The .replace on strDelimiter is escaping regex special chars, correct? If so, I need to include that in my code too. I'm trying to battle-harden my jquery-csv plugin library to be configurable to all forms of CSV files. You can find the code at http://code.google.com/p/jquery-csv/. – Evan Plaice Apr 24 '12 at 04:16
  • @EvanPlaice The escape is meant to deal with special RegExp characters. Instead of grouping, it can be written even shorter using `.replace(/[[^$.|?*+(){}]/g, '\\$&')` – Rob W Apr 24 '12 at 13:08