If you think 40MB is huge, you haven't seen huge ;) Either way, you don't need to read the whole file into memory, nor do you need to split the whole line - it's sufficient to skip the first n
lines while reading and then take each line's content up to the first semicolon, something like:
def remove_lines(input_file, output_file, skip=10, delimiter=";"):
    """Copy ``input_file`` to ``output_file``, trimming each line at a delimiter.

    The input is streamed line by line, so the whole file is never held in
    memory. The first ``skip`` lines are dropped. Every remaining line is cut
    just before its first ``delimiter`` and terminated with a newline; a line
    that contains no delimiter is written unchanged.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write. It is opened in append mode,
            so existing content is kept and new lines are added after it.
        skip: Number of leading lines to discard (default 10).
        delimiter: String marking where each line is cut (default ";").
    """
    with open(input_file, "r") as f_in, open(output_file, "a") as f_out:
        for line_no, line in enumerate(f_in):  # stream the input line by line
            if line_no < skip:  # still inside the header lines we discard
                continue
            cut = line.find(delimiter)  # position of the first delimiter, or -1
            if cut != -1:
                # Slicing drops the original line ending, so add it back.
                f_out.write(line[:cut] + "\n")
            else:
                # No delimiter: the line (and its own line ending) is kept as is.
                f_out.write(line)
UPDATE: For the folks who think that str.split("delim", 1)
is faster than finding the actual position and manually slicing, here's a simple test:
import timeit
def func_split(data):
    """Return the part of ``data`` before its first ";" using ``str.split``.

    When ``data`` contains no ";" the whole string is returned.
    """
    # Limit to one split so only the leading piece and the remainder are produced.
    pieces = data.split(";", 1)
    return pieces[0]
def func_find(data):
    """Return the part of ``data`` before its first ";" using ``str.find``.

    When ``data`` contains no ";" the whole string is returned.
    """
    cut = data.find(";")
    # str.find reports -1 when the delimiter is absent.
    return data if cut == -1 else data[:cut]
# Sample inputs: one containing several semicolons and one containing none.
test1 = "A quick; brown; fox; with; semi; columns."
test2 = "A quick brown fox without semi columns."

# Sanity check: both implementations must agree before they are timed.
assert func_find(test1) == func_split(test1)
assert func_find(test2) == func_split(test2)

if __name__ == "__main__":
    # Each statement runs one function on both samples, 1,000,000 times.
    split_seconds = timeit.timeit(
        stmt="func_split(test1); func_split(test2)",
        setup="from __main__ import func_split, test1, test2",
        number=1000000,
    )
    print("func_split:", split_seconds)

    find_seconds = timeit.timeit(
        stmt="func_find(test1); func_find(test2)",
        setup="from __main__ import func_find, test1, test2",
        number=1000000,
    )
    print("func_find: ", find_seconds)
And the results
CPython 2.7.11 x64 (1000000 loops):
('func_split:', 6.877725868989936)
('func_find: ', 6.228281754820999)
CPython 3.5.1 x64 (100000 loops):
func_split: 0.8343849130147841
func_find: 0.8080772353660183
YMMV, of course, but in general the latter will always be faster on CPython, and the speed difference will increase with each character added to the string, because str.find()
stops at the first match - it neither has to process the rest of the string nor has to create a list to store the pieces.