I use an ETL tool which has Python 2.6 as a built-in scripting language, so when I had a requirement to split a large file into chunks for downstream processing, it seemed an obvious choice. I initially wrote and tested the script on my MacBook (OS X 10.8) using the Python 2.6 install.
When I moved this to Windows I was amazed that it ran 10x slower ... even on an enterprise-scale server (32 cores, 64 GB RAM, Fibre Channel SAN, etc.).
While trying to narrow down where the difference lies, I found that commenting out the writes makes little difference on Mac OS X, whereas on Windows the script speeds up by more than 5x.
Is there some fundamental file I/O difference between OS X and Windows?
Any help gratefully accepted :)
import os
import sys
import re
from time import time
# Start of the run; the elapsed time is printed once processing is finished.
t = time()

# Split a pre-sorted text file into multiple outputs based on the leftmost
# element, delimited by spaces.
# The second element can be used for an additional sort and will be stripped
# from the output when 'isLeadingSort=1'.
#
# Parameters (positional, taken from the command line):
#   path           char  path of the input file
#   outPath        char  path (directory) for the output files
#   isLeadingSort  int   non-zero: the 2nd element is only a sort key, so the
#                        output data starts at the 3rd element;
#                        zero: the output data starts at the 2nd element
#   isdbg          int   non-zero enables the debug prints

# Just use the cmd at the moment for test.
path, outPath = sys.argv[1], sys.argv[2]
isLeadingSort, isdbg = int(sys.argv[3]), int(sys.argv[4])

# Alternative hard-coded values for running without the command line:
#   outPath = os.getcwd()
#   isLeadingSort = 0
#   isdbg = 0
# define all the functions up front
def printStr(str):
    """Write the given text to stdout, but only while the global debug flag is on.

    parameter:
    str: text to print (the name shadows the builtin inside this function;
         it is kept as-is so existing callers are unaffected)
    """
    # Nothing to do unless debugging was requested via the module-level 'isdbg'.
    if not isdbg:
        return
    print(str)
def testPath(path):
"""raise an exception if we cant find the path or file"""
if not os.path.exists(path):
raise Exception ('File not found: ' + path )
return false
#
# This is where we start
#
# Check that both paths exist; testPath raises an exception otherwise.
testPath(path)
testPath(outPath)
printStr('paths ok')

# init
# chr(1) is a placeholder that is not expected to match the first element of
# any real line, so the very first line always opens a new output file.
fnameOut = chr(1)
# Currently open output file; None until the first one is opened.
fOut = None

# Loop invariants, computed once instead of once per input line:
# - wrds: how many leading elements to split off (the file-name element,
#   plus the extra sort element when isLeadingSort is set)
# - splitter: the delimiter pattern (runs of spaces and/or newlines)
if isLeadingSort:
    wrds = 2
else:
    wrds = 1
splitter = re.compile('[ \n]+')

# Open the input file for reading and process it line by line. The
# try/finally guarantees the last output file is closed even when the loop
# raises, and real close() errors are no longer silently swallowed.
try:
    with open(path, 'r') as f:
        for line in f:
            printStr('for line in f: ')
            arLine = splitter.split(line, wrds)
            newFname = arLine[0]
            # The last piece of the split is the data that gets written out.
            outLine = arLine[-1]
            if newFname == fnameOut:
                printStr('writing to open file: ' + fnameOut)
            else:
                # The leftmost element changed: switch to the next output file.
                fnameOut = newFname
                printStr('opennextfile: ' + fnameOut + '- closing: ' + str(fOut))
                if fOut is not None:
                    fOut.close()
                    fOut = None
                if fnameOut in ('', '\n'):
                    raise Exception('Filename is not the first element of the data: ')
                fOut = open(os.path.join(outPath, fnameOut), 'w')  # open new
            # write
            fOut.write(outLine)
finally:
    if fOut is not None:
        fOut.close()

print('timediff : ' + str(time() - t))