1

How to read a CSV file in Snowflake, file has commas between double quotes and some rows has 1/2 blank columns ?

This solution is working if there is not blank values on the row. Can you please help enhance it to accept blank values also.

create or replace function SPLIT_QUOTED_STRING(STR string)
returns array
language javascript
as
$$
var arr = STR.match(/(".*?"|[^",\s]+)(?=\s*,|\s*$)/g);
for (var i = 0; i < arr.length; i++) {
    arr[i] = arr[i].replace(/['"]+/g, '')
}
return arr;
$$;
SPLIT_QUOTED_STRING('o,,,"sadasdasd",123123123,"asdasdasd.www.org,123123,link.com",0')[1]::string
--this must give a blank value! 

Thanks for the help!

Felipe Hoffa
  • 54,922
  • 16
  • 151
  • 325
priyanka
  • 25
  • 6

2 Answers2

0

Apparently you want code to split CSV values in a JavaScript UDF.

Instead of regular expressions, this code does the job better:

create or replace function SPLIT_QUOTED_STRING(STR string)
returns array
language javascript
as
$$

function parseCSV(str) {
    var arr = [];
    var quote = false;  // 'true' means we're inside a quoted field

    // Iterate over each character, keep track of current row and column (of the returned array)
    for (var row = 0, col = 0, c = 0; c < str.length; c++) {
        var cc = str[c], nc = str[c+1];        // Current character, next character
        arr[row] = arr[row] || [];             // Create a new row if necessary
        arr[row][col] = arr[row][col] || '';   // Create a new column (start with empty string) if necessary

        // If the current character is a quotation mark, and we're inside a
        // quoted field, and the next character is also a quotation mark,
        // add a quotation mark to the current column and skip the next character
        if (cc == '"' && quote && nc == '"') { arr[row][col] += cc; ++c; continue; }

        // If it's just one quotation mark, begin/end quoted field
        if (cc == '"') { quote = !quote; continue; }

        // If it's a comma and we're not in a quoted field, move on to the next column
        if (cc == ',' && !quote) { ++col; continue; }

        // If it's a newline (CRLF) and we're not in a quoted field, skip the next character
        // and move on to the next row and move to column 0 of that new row
        if (cc == '\r' && nc == '\n' && !quote) { ++row; col = 0; ++c; continue; }

        // If it's a newline (LF or CR) and we're not in a quoted field,
        // move on to the next row and move to column 0 of that new row
        if (cc == '\n' && !quote) { ++row; col = 0; continue; }
        if (cc == '\r' && !quote) { ++row; col = 0; continue; }

        // Otherwise, append the current character to the current column
        arr[row][col] += cc;
    }
    return arr[0];
}

return parseCSV(STR);
$$;
select SPLIT_QUOTED_STRING('o,,,"sadasdasd",123123123,"asdasdasd.www.org,123123,link.com",0')

JS taken from this answer: https://stackoverflow.com/a/14991797/132438

Felipe Hoffa
  • 54,922
  • 16
  • 151
  • 325
0

If you want to read a CSV file which separated with commas (which is the default delimiter for CSV files), and some columns are enclosed with double quotes, you can create a file format object and use the COPY command:

create or replace file format csvformat type=csv FIELD_OPTIONALLY_ENCLOSED_BY = '"';

select $1, $2, $3, $4, $5, $6, $7 
from @mystage (file_format => csvformat);

+----+----+----+-----------+-----------+-----------------------------------+----+
| $1 | $2 | $3 |    $4     |    $5     |                $6                 | $7 |
+----+----+----+-----------+-----------+-----------------------------------+----+
| o  |    |    | sadasdasd | 123123123 | asdasdasd.www.org,123123,link.com |  0 |
+----+----+----+-----------+-----------+-----------------------------------+----+

FIELD_OPTIONALLY_ENCLOSED_BY https://docs.snowflake.com/en/sql-reference/sql/create-file-format.html#type-csv

Gokhan Atil
  • 9,278
  • 2
  • 11
  • 24