I am using a Python script from here to convert a large gzipped SQL dump into CSV. It works perfectly until it reaches a specific line, just after the text 47896122,1,'Brigadier_General_Albert_Pike',3. After that line it encounters a byte, 0xf8, that Python cannot decode.
Here is the code:
#!/usr/bin/env python
import fileinput
import csv
import sys

# This prevents prematurely closed pipes from raising
# an exception in Python
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

# allow large content in the dump
csv.field_size_limit(sys.maxsize)

def is_insert(line):
    """
    Returns true if the line begins a SQL insert statement.
    """
    return line.startswith('INSERT INTO')

def get_values(line):
    """
    Returns the portion of an INSERT statement containing values
    """
    return line.partition('` VALUES ')[2]

def values_sanity_check(values):
    """
    Ensures that values from the INSERT statement meet basic checks.
    """
    assert values
    assert values[0] == '('
    # Assertions have not been raised
    return True

def parse_values(values, outfile):
    """
    Given a file handle and the raw values from a MySQL INSERT
    statement, write the equivalent CSV to the file
    """
    latest_row = []

    reader = csv.reader([values], delimiter=',',
                        doublequote=False,
                        escapechar='\\',
                        quotechar="'",
                        strict=True)

    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    for reader_row in reader:
        for column in reader_row:
            # If our current string is empty...
            if len(column) == 0 or column == 'NULL':
                latest_row.append(chr(0))
                continue
            # If our string starts with an open paren
            if column[0] == "(":
                # Assume that this column does not begin
                # a new row.
                new_row = False
                # If we've been filling out a row
                if len(latest_row) > 0:
                    # Check if the previous entry ended in
                    # a close paren. If so, the row we've
                    # been filling out has been COMPLETED
                    # as:
                    # 1) the previous entry ended in a )
                    # 2) the current entry starts with a (
                    if latest_row[-1][-1] == ")":
                        # Remove the close paren.
                        latest_row[-1] = latest_row[-1][:-1]
                        new_row = True
                # If we've found a new row, write it out
                # and begin our new one
                if new_row:
                    writer.writerow(latest_row)
                    latest_row = []
                # If we're beginning a new row, eliminate the
                # opening parentheses.
                if len(latest_row) == 0:
                    column = column[1:]
            # Add our column to the row we're working on.
            latest_row.append(column)
        # At the end of an INSERT statement, we'll
        # have the semicolon.
        # Make sure to remove the semicolon and
        # the close paren.
        if latest_row[-1][-2:] == ");":
            latest_row[-1] = latest_row[-1][:-2]
            writer.writerow(latest_row)

def main():
    """
    Parse arguments and start the program
    """
    # Iterate over all lines in all files
    # listed in sys.argv[1:]
    # or stdin if no args given.
    try:
        for line in fileinput.input():
            # Look for an INSERT statement and parse it.
            if is_insert(line):
                values = get_values(line)
                if values_sanity_check(values):
                    parse_values(values, sys.stdout)
    except KeyboardInterrupt:
        sys.exit(0)

if __name__ == "__main__":
    main()
Here is the link to the input file
Here is how I run the code up to the point of the error:
python3 mysqldump_to_csv.py input.sql > output.csv
Now I have three questions:

1. How can I view the exact byte to understand what is causing this problem? I want to understand what the byte 0xf8 is and where it resides in the input file. (See the first sketch after this list.)

2. Is there any way to convert this erroneous byte into something readable? I ask because when I open the raw input file in Sublime Text, with the encoding set to UTF-8, Sublime Text displays the characters in the file without complaint.

3. If it is not possible to decode those bytes at all, how do I ignore them with the Python fileinput module in the code I provided? I understand it is possible to ignore errors in utf-8 encoding using something like with open(path, encoding="utf8", errors='ignore') as f: based on the answers in this and this Stack Overflow question, but I really need to use the fileinput module, because it lets me grab lines from standard input: I am using gzcat to feed this Python program from a huge compressed SQL file. (See the second sketch after this list.)
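For question 1, here is a minimal sketch of the kind of inspection I mean, assuming the decompressed dump is on disk as input.sql and fits in memory (a chunked read would be needed otherwise):

# Open the file in binary mode so that no decoding happens at all,
# then print the byte offset and some surrounding context for every
# occurrence of the offending 0xf8 byte.
with open("input.sql", "rb") as f:
    data = f.read()

offset = data.find(b"\xf8")
while offset != -1:
    # Show 40 bytes of raw context on each side of the bad byte.
    context = data[max(0, offset - 40):offset + 40]
    print("0xf8 at byte offset %d: %r" % (offset, context))
    offset = data.find(b"\xf8", offset + 1)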
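And for question 3, one possible approach (an assumption on my part, not a confirmed fix): fileinput.input() accepts an openhook, and fileinput.hook_encoded() has taken an errors argument since Python 3.6. Note that fileinput only applies the hook to named file arguments; lines piped in on standard input bypass it, but on Python 3.7+ sys.stdin can be reconfigured directly:

import fileinput
import sys

# On Python 3.7+, stdin itself can be told to ignore undecodable
# bytes; this covers the gzcat-pipe case, where the openhook below
# is never consulted.
sys.stdin.reconfigure(errors="ignore")

# hook_encoded's errors argument (Python 3.6+) makes named input
# files ignore undecodable bytes as well.
for line in fileinput.input(
        openhook=fileinput.hook_encoded("utf-8", errors="ignore")):
    pass  # the existing per-line processing would go here

Using errors="replace" instead would substitute U+FFFD for each undecodable byte rather than dropping it, which may partly answer question 2 as well.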