The csv module cannot treat both " and ' as quote characters at the same time. Absent a module that provides that kind of dialect, one has to get into the parsing business. To avoid relying on a third-party module, we can use the re
module to do the lexical analysis, using the re.MatchObject.lastindex trick to associate a token type with whichever pattern matched.
The following code, when run as a script, passes all of the tests shown under both Python 2.7 and Python 2.2.
import re
# Lexical token types.  ``range`` is used instead of the Python-2-only
# ``xrange`` so the unpacking works on both Python 2 and Python 3.
DQUOTED, SQUOTED, UNQUOTED, COMMA, NEWLINE = range(5)

# (regex, token type) pairs, tried left to right as alternatives.
# Order is important: the quoted patterns must come before UNQUOTED,
# because UNQUOTED would otherwise also match text starting with a quote.
_pattern_tuples = (
    (r'"[^"]*"', DQUOTED),   # "..." with no embedded double quote
    (r"'[^']*'", SQUOTED),   # '...' with no embedded single quote
    (r",", COMMA),
    (r"$", NEWLINE),         # end of string OR \n just before end of string
    (r"[^,\n]+", UNQUOTED),  # a run of anything except comma and newline
)

# Every pattern is wrapped in its own capturing group and the groups are
# joined with "|", so the match object's ``lastindex`` tells us which
# alternative matched.
_matcher = re.compile(
    '(' + ')|('.join([i[0] for i in _pattern_tuples]) + ')'
).match

# Token type indexed by group number.  ``lastindex`` counts from 1, hence
# the dummy ``None`` in slot 0.
_toktype = [None] + [i[1] for i in _pattern_tuples]
def csv_split(text):
    r"""Split a csv string into a list of fields.

    Fields may be quoted with " or ' or be unquoted; the enclosing quotes
    are stripped from quoted fields.  An unquoted field can contain both
    a " and a ', provided neither is at the start of the field.
    A trailing \n will be ignored if present.

    Args:
        text: The single line of csv text to split.

    Returns:
        A list of field strings.  An empty string, or a lone newline,
        yields [''].

    Raises:
        ValueError: If no token can be matched at some offset (for
            example at a newline that is not the final character), or if
            a field is followed by something other than a comma or the
            end of the line.
    """
    fields = []
    pos = 0
    # True while the next token should be a field; False once a field has
    # been read and a separator (or the end of the line) must follow.
    want_field = True
    while True:
        m = _matcher(text, pos)
        if not m:
            raise ValueError("Problem at offset %d in %r" % (pos, text))
        ttype = _toktype[m.lastindex]
        if want_field:
            if ttype in (DQUOTED, SQUOTED):
                # Drop the enclosing quote characters.
                fields.append(m.group(0)[1:-1])
                want_field = False
            elif ttype == UNQUOTED:
                fields.append(m.group(0))
                want_field = False
            elif ttype == COMMA:
                # A comma where a field was expected means an empty
                # field; another field is still expected after it.
                fields.append("")
            else:
                # NEWLINE is the only remaining token type: the line ends
                # where a field was expected, so the last field is empty.
                fields.append("")
                break
        elif ttype == COMMA:
            want_field = True
        elif ttype == NEWLINE:
            break
        else:
            # A second field token directly after a field, with no comma.
            raise ValueError(
                "Missing comma at offset %d in %r" % (pos, text))
        pos = m.end(0)
    return fields
if __name__ == "__main__":
    # Self-test: each entry pairs an input line with the field list that
    # csv_split is expected to return for it.
    tests = (
        ("""hey,hello,,"hello,world",'hey,world'\n""", ['hey', 'hello', '', 'hello,world', 'hey,world']),
        ("""\n""", ['']),
        ("""""", ['']),
        ("""a,b\n""", ['a', 'b']),
        ("""a,b""", ['a', 'b']),
        (""",,,\n""", ['', '', '', '']),
        ("""a,contains both " and ',c""", ['a', 'contains both " and \'', 'c']),
        ("""a,'"starts with "...',c""", ['a', '"starts with "...', 'c']),
    )
    for text, expected in tests:
        result = csv_split(text)
        # Each print(...) takes exactly one argument, so it produces the
        # same output whether ``print`` is the Python 2 statement or the
        # Python 3 function.  print("") emits the blank separator line.
        print("")
        print(repr(text))
        print(repr(result))
        print(repr(expected))
        print(result == expected)