I am fairly new to python programming. I am making an Apache log file parser. The parser takes either Statuses or IPs and calculates count, percentage of total and bytes transfered for the one that was chosen to be analysed.
I created two similar functions, one of which takes status, another ips, and calculates those values.
I feel like I could put this into one function, but I am unfamiliar on how to pass the values needed, and to get the returns needed correctly.
Code:
if __name__ == '__main__':
filename = sys.argv[1]
try:
with open(filename) as logfile:
ip = []
bytes = []
status = []
for line in logfile:
split = line.split()
ip.append(split[0])
bytes.append(split[9])
status.append(split[8])
except OSError:
print(filename, 'not existing')
exit()
#user inputs what data he wants to analyse
sort = input('Do you want to sort results by [ip] or [status]? [ANSWER]: ')
#get ips
if sort == 'ip':
ip_list = []
for match in (ip):
if match not in ip_list:
ip_list.append(match)
#get statuses
if sort == 'status':
status_list = []
for match in (status):
if match not in status_list:
status_list.append(match)
#user inputs what numerical output he desires
desired_output = input(
'Choose Desired Output: count - percentage - bytes: ')
# ---------------------SIMILAR FUNCTIONS NUMBER 1 ---------------------------------
if sort == 'ip':
ip_count = []
for match in ip_list:
count = 0
for ip_match in ip:
if match == ip_match:
count += 1
ip_count.append(count)
if desired_output == 'count':
ip_count, ip_list = zip(
*sorted(zip(ip_count, ip_list), reverse=True))
for i in range(len(ip_list)):
print('IP: ' + str(ip_list[i]) + ' count: ' + str(ip_count[i]))
if desired_output == 'percentage':
ip_count, ip_list = zip(
*sorted(zip(ip_count, ip_list), reverse=True))
for i in range(len(ip_list)):
print('IP: ' + str(ip_list[i]) + ' percentage: ' +
str(round(ip_count[i]/len(ip)*100, 2)) + '%')
if desired_output == 'bytes':
cnt_bytes = []
for v in range(len(ip_list)):
tmp = 0
for k in range(len(ip)):
if ip_list[v] == ip[k]:
if bytes[k] == '-':
bytes[k] = 0
tmp += int(bytes[k])
cnt_bytes.append(tmp)
ip_list, cnt_bytes = zip(
*sorted(zip(ip_list, cnt_bytes), reverse=True))
for line in range(len(ip_list)):
print('IP: ' + str(ip_list[line]) + ' bytes: ' + str(cnt_bytes[line]))
# ---------------------SIMILAR FUNCTIONS NUMBER 2 ---------------------------------
if sort == 'status':
status_count = []
for match in status_list:
count = 0
for status_match in status:
if match == status_match:
count += 1
status_count.append(count)
if desired_output == 'count':
status_count, status_list = zip(
*sorted(zip(status_count, status_list), reverse=True))
for i in range(len(status_list)):
print('Status: ' +
str(status_list[i]) + ' count: ' + str(status_count[i]))
if desired_output == 'percentage':
status_count, status_list = zip(
*sorted(zip(status_count, status_list), reverse=True))
for i in range(len(status_list)):
print('Status: ' + str(status_list[i]) + ' percentage: ' +
str(round(status_count[i]/len(status)*100, 2)) + '%')
if desired_output == 'bytes':
cnt_bytes = []
for v in range(len(status_list)):
tmp = 0
for k in range(len(status)):
if status_list[v] == status[k]:
if bytes[k] == '-':
bytes[k] = 0
tmp += int(bytes[k])
cnt_bytes.append(tmp)
cnt_bytes, status_list = zip(
*sorted(zip(cnt_bytes, status_list), reverse=True))
for line in range(len(status_list)):
print('Status: ' + str(status_list[line]) + ' bytes: ' + str(cnt_bytes[line]))
Log example:
68.180.224.225 - - [17/May/2015:14:05:48 +0000] "GET /scripts/?C=M;O=A HTTP/1.1" 200 21894 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
68.180.224.225 - - [17/May/2015:14:05:07 +0000] "GET /scripts/?C=M;O=D HTTP/1.1" 200 21894 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
217.212.224.183 - - [17/May/2015:14:05:30 +0000] "GET /robots.txt HTTP/1.0" 200 - "-" "psbot/0.1 (+http://www.picsearch.com/bot.html)"
217.212.224.181 - - [17/May/2015:14:05:43 +0000] "GET /blog/projects/xmlpresenter/main.html HTTP/1.0" 200 11628 "-" "psbot/0.1 (+http://www.picsearch.com/bot.html)"