0

I am fairly new to Python programming. I am making an Apache log file parser. The parser takes either statuses or IPs and calculates the count, the percentage of the total, and the bytes transferred for whichever one was chosen for analysis.

I created two similar functions, one of which takes status, another ips, and calculates those values.

I feel like I could put this into one function, but I am not sure how to pass in the values it needs or how to return the results correctly.

Code:

import sys
from collections import Counter, defaultdict

# Positions of the fields of interest after splitting an Apache
# "combined" log line on whitespace.
IP_FIELD = 0
STATUS_FIELD = 8
BYTES_FIELD = 9

# Maps the user's grouping choice to (record column, label used in the output).
GROUPINGS = {'ip': (0, 'IP'), 'status': (1, 'Status')}
OUTPUTS = ('count', 'percentage', 'bytes')


def parse_log(lines):
    """Extract (ip, status, bytes) string triples from raw log lines.

    Lines with too few whitespace-separated fields (blank or truncated
    lines) are skipped instead of raising IndexError.

    Args:
        lines: iterable of log lines (an open file works).

    Returns:
        A list of ``(ip, status, bytes)`` tuples; ``bytes`` is kept as the
        raw token, which may be ``'-'`` when no size was logged.
    """
    records = []
    for line in lines:
        fields = line.split()
        if len(fields) <= BYTES_FIELD:
            continue
        records.append(
            (fields[IP_FIELD], fields[STATUS_FIELD], fields[BYTES_FIELD]))
    return records


def build_report(records, sort, desired_output):
    """Build the report lines for one grouping and one statistic.

    This single function replaces the two near-identical ip/status code
    paths: the grouping only selects which record column is the key and
    which label is printed.

    Rows are ordered by the computed value, descending, with ties broken
    by the key, descending. The ip/bytes report now follows this rule too
    (it used to be ordered by the IP string instead of the byte total).

    Args:
        records: list of ``(ip, status, bytes)`` tuples from ``parse_log``.
        sort: ``'ip'`` or ``'status'``.
        desired_output: ``'count'``, ``'percentage'`` or ``'bytes'``.

    Returns:
        A list of formatted lines; empty when there are no records.

    Raises:
        ValueError: if ``sort`` or ``desired_output`` is not a known choice,
            or a byte field is neither ``'-'`` nor an integer.
    """
    if sort not in GROUPINGS:
        raise ValueError(
            'unknown grouping {!r}; expected ip or status'.format(sort))
    if desired_output not in OUTPUTS:
        raise ValueError(
            'unknown output {!r}; expected count, percentage or bytes'
            .format(desired_output))

    column, label = GROUPINGS[sort]

    if desired_output == 'bytes':
        totals = defaultdict(int)
        for record in records:
            size = record[2]
            # Apache logs '-' when no body was sent; count it as 0 bytes.
            totals[record[column]] += 0 if size == '-' else int(size)
        ranked = sorted(
            ((total, key) for key, total in totals.items()), reverse=True)
        return [label + ': ' + key + '   bytes: ' + str(total)
                for total, key in ranked]

    # One pass over the records instead of one pass per distinct key.
    counts = Counter(record[column] for record in records)
    ranked = sorted(((n, key) for key, n in counts.items()), reverse=True)

    if desired_output == 'count':
        return [label + ': ' + key + ' count: ' + str(n)
                for n, key in ranked]

    # 'percentage': share of all parsed log lines. With no records the
    # comprehension has nothing to iterate, so there is no division by zero.
    total = len(records)
    return [label + ': ' + key + ' percentage: '
            + str(round(n / total * 100, 2)) + '%'
            for n, key in ranked]


def main(argv):
    """Read the log named in ``argv[1]``, prompt for choices, print the report.

    Returns:
        Process exit status: 0 on success, 1 if the file cannot be read,
        2 on a usage error or an invalid choice.
    """
    if len(argv) < 2:
        print('usage: ' + argv[0] + ' LOGFILE', file=sys.stderr)
        return 2

    filename = argv[1]
    try:
        with open(filename) as logfile:
            records = parse_log(logfile)
    except OSError:
        print(filename, 'not existing')
        return 1

    # user inputs what data he wants to analyse
    sort = input('Do you want to sort results by [ip] or [status]? [ANSWER]: ')
    # user inputs what numerical output he desires
    desired_output = input(
        'Choose Desired Output: count - percentage - bytes: ')

    try:
        report = build_report(records, sort, desired_output)
    except ValueError as err:
        print(err, file=sys.stderr)
        return 2

    for line in report:
        print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

Log example:

68.180.224.225 - - [17/May/2015:14:05:48 +0000] "GET /scripts/?C=M;O=A HTTP/1.1" 200 21894 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
68.180.224.225 - - [17/May/2015:14:05:07 +0000] "GET /scripts/?C=M;O=D HTTP/1.1" 200 21894 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
217.212.224.183 - - [17/May/2015:14:05:30 +0000] "GET /robots.txt HTTP/1.0" 200 - "-" "psbot/0.1 (+http://www.picsearch.com/bot.html)"
217.212.224.181 - - [17/May/2015:14:05:43 +0000] "GET /blog/projects/xmlpresenter/main.html HTTP/1.0" 200 11628 "-" "psbot/0.1 (+http://www.picsearch.com/bot.html)"
Upsice
  • 23
  • 6

1 Answers1

0

This is a solution that uses the powerful python pandas library. The pandas learning curve can be steep but it is indispensable for dealing with this kind of problem.

The code below includes explanatory comments and links to relevant online references.

import re
import pandas as pd

# About Pandas: https://pandas.pydata.org/
# About asking the user for input: https://stackoverflow.com/questions/23294658

data = """68.180.224.225 - - [17/May/2015:14:05:48 +0000] "GET /scripts/?C=M;O=A HTTP/1.1" 200 21894 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
68.180.224.225 - - [17/May/2015:14:05:07 +0000] "GET /scripts/?C=M;O=D HTTP/1.1" 200 21894 "-" "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
217.212.224.183 - - [17/May/2015:14:05:30 +0000] "GET /robots.txt HTTP/1.0" 200 - "-" "psbot/0.1 (+http://www.picsearch.com/bot.html)"
217.212.224.181 - - [17/May/2015:14:05:43 +0000] "GET /blog/projects/xmlpresenter/main.html HTTP/1.0" 200 11628 "-" "psbot/0.1 (+http://www.picsearch.com/bot.html)" """

# Using a regular expression rather than split() here.
# Groups: 1 = client IP, 2 = status code, 3 = byte count (or '-').
# A raw string keeps \S, \[ and \d as regex escapes rather than invalid
# string escapes. The ident/user fields are matched generically instead of
# as a literal ' - - ', and nothing is required after the byte count, so
# plain Common Log Format lines match as well.
pattern = re.compile(r'(\S+) \S+ \S+ \[.*?\] ".*?" (\d+) (\d+|-)(?!\S)')


def parse_log(lines):
    """Return ``[ip, status, bytes]`` rows for every line matching ``pattern``.

    Lines that do not match are skipped. A byte count of ``'-'`` (no body
    sent) is stored as 0; status and bytes are converted to ``int``.
    """
    rows = []
    for line in lines:
        m = pattern.match(line)
        if m:
            size = 0 if m[3] == '-' else int(m[3])
            rows.append([m[1], int(m[2]), size])
    return rows


def build_frame(rows):
    """Create the base DataFrame (columns ip, status, bytes) from parsed rows."""
    return pd.DataFrame(rows, columns=['ip', 'status', 'bytes'])


def add_stats(df):
    """Return a copy of ``df`` with total_bytes, count and percent columns.

    ``percent`` is each row's share of the total bytes in the log, so that
    summing it per group gives the group's share of all bytes.
    """
    out = df.copy()
    out['total_bytes'] = out['bytes'].sum()  # the sum of the bytes in the log
    out['count'] = 1  # each line counts once, so a group sum is a line count
    out['percent'] = out['bytes'] / out['total_bytes'] * 100
    return out


def summarize(df, choice):
    """Group ``df`` by ``choice`` ('ip' or 'status') and sum the statistics."""
    return df.groupby(choice)[['count', 'bytes', 'percent']].sum()


def main():
    """Parse the sample log, print the frames, then answer user queries."""
    log = data.split('\n')  # Reconstruct the log file lines as a list

    # Create a pandas dataframe from the parsed rows
    df = build_frame(parse_log(log))
    print(df)

    # Add the derived columns to the df
    df = add_stats(df)
    print(df)

    # Ask the user to make a choice and print the ip/status stats as a table
    options = ['ip', 'status']
    while True:
        try:
            choice = input(
                f'Please choose one of {options} or quit: ').strip().lower()
        except EOFError:
            # Treat end of input (Ctrl-D / closed pipe) like 'quit'.
            break
        if choice == 'quit':
            break
        if choice in options:
            print(summarize(df, choice))
        else:
            print(f'{choice} not valid')


if __name__ == '__main__':
    main()

And here is a session output including the output of the print statements which I hope will help clarify how the code works:

                ip  status  bytes
0   68.180.224.225     200  21894
1   68.180.224.225     200  21894
2  217.212.224.183     200      0
3  217.212.224.181     200  11628
                ip  status  bytes  total_bytes  count    percent
0   68.180.224.225     200  21894        55416      1  39.508445
1   68.180.224.225     200  21894        55416      1  39.508445
2  217.212.224.183     200      0        55416      1   0.000000
3  217.212.224.181     200  11628        55416      1  20.983110
Please choose one of ['ip', 'status']: ip
                 count  bytes   percent
ip
217.212.224.181      1  11628  20.98311
217.212.224.183      1      0   0.00000
68.180.224.225       2  43788  79.01689
Please choose one of ['ip', 'status']:    STATUS
        count  bytes  percent
status
200         4  55416    100.0
Please choose one of ['ip', 'status']: quit
C. Pappy
  • 739
  • 4
  • 13