2

Recently I've been analyzing nginx access logs with Python.

I've found the way to split the quoted string by space using shlex according to this

But it's really slow, analyzing 2000 lines of logs costs more than 1.2 seconds. My nginx server generates more than 2500 lines per sec.

So I've tried re, and also a more low-level (and crude) approach that indexes into the string directly.

The code runs in a virtual machine, and both approaches take a bit more than 0.5 seconds for 2000 lines of logs.

Do I have any other choice to make it more efficient?

Thanks in advance

Here's my code

import re
import time
import datetime
# Sample nginx access-log line used as the benchmark fixture below.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# Patterns compiled once at import time: re.compile() inside convert()
# ran five times per line and dominated the per-call cost.
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_RE = re.compile(r":\d+$")
_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP/1\.[10]$")
_ADDR_GUID_RE = re.compile(r"^([0-9.]+)\s+(.*)")


def convert(line):
    """Parse one nginx access-log line into a flat dict of fields.

    The line is first split on double quotes; the unquoted head is then
    split on whitespace. Malformed lines raise IndexError/ValueError
    rather than returning a partial result.
    """
    quoted = line.split('"')
    head = re.split(r"\s+", quoted[0])

    m = _METHOD_RE.findall(quoted[1])
    http_method = m[0] if m else ''

    status_bytes = _STATUS_BYTES_RE.findall(quoted[2])
    status = int(status_bytes[0][0])
    bytes_sent = int(status_bytes[0][1])

    upstream_addr = _PORT_RE.sub("", head[4])   # drop the ":port" suffix
    request_time = int(float(head[0]) * 1000)   # seconds -> milliseconds
    if head[1] == '-':
        upstream_response_time = -1             # no upstream was involved
    else:
        upstream_response_time = int(float(head[1]) * 1000)
    remote_addr = head[2]
    host = head[7].replace(' ', '')

    dt = datetime.datetime.strptime(head[5].replace('[', ''),
                                    "%d/%b/%Y:%H:%M:%S")
    # Use datetime attributes instead of slicing str(dt); same values,
    # no string formatting. "date" keeps the original MMDD encoding
    # (e.g. 408 for April 8th).
    year = dt.year
    monthday = dt.month * 100 + dt.day
    hour = dt.hour
    logtime = dt.minute
    sec = time.mktime(dt.timetuple())           # epoch seconds, local tz

    m = _URI_RE.findall(quoted[1])
    request_uri = m[0] if m else ''             # bug fix: was stored as a list

    addr_guid = _ADDR_GUID_RE.findall(quoted[11])
    return {
        "hour": hour,
        "year": year,
        "date": monthday,
        "time": logtime,
        "sec": sec,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": remote_addr,
        "upstream_addr": upstream_addr,
        "host": host,
        "method": http_method,
        "request_uri": request_uri,
        "status": status,
        "bytes_sent": bytes_sent,
        "http_referer": quoted[3],
        "user_agent": quoted[5],
        "gzip_ratio": quoted[7],
        "http_x_forwarded_for": quoted[9],
        "server_addr": addr_guid[0][0],
        "guid": addr_guid[0][1],
    }
# Benchmark driver: parse the fixture 12000 times and report elapsed
# seconds per 2000-line batch. The original had `t1 = t2` mis-indented
# outside the `if` body, which is an IndentationError.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        print(str(t2 - t1))

and

indices way

import time
import datetime
# Sample nginx access-log line used as the benchmark fixture below.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'

def pair(l):
    """Yield consecutive (even-index, odd-index) element pairs from l.

    Raises IndexError when l has an odd number of elements.
    """
    idx = 0
    while idx < len(l):
        yield l[idx], l[idx + 1]
        idx += 2

def convert(line):
    """Split an access-log line on spaces, keeping quoted fields intact.

    Returns the list of fields, or None when the quote characters in the
    line are unbalanced.
    """
    # Collapse the double space after the upstream address into a single
    # one. The original replaced it with the empty string, which glued
    # the port and the timestamp fields together.
    line = line.replace("  ", " ")
    quotes_positions = allindices(line, "\"")
    if len(quotes_positions) <= 0 or len(quotes_positions) % 2 != 0:
        return None

    space_positions = allindices(line, " ")

    # Keep only the spaces that fall outside every quoted region; those
    # are the real field separators.
    target_positions = []
    for s in space_positions:
        outside = True
        for qs, qe in pair(quotes_positions):
            if qs < s < qe:
                outside = False
                break
        if outside:
            target_positions.append(s)

    # Cut the line at each separator. Unlike the original, this also
    # keeps the text before the first separator (the first field was
    # silently dropped before).
    ret = []
    start = 0
    for pos in target_positions:
        ret.append(line[start:pos])
        start = pos + 1
    ret.append(line[start:])
    return ret


# def allindices(string, sub, listindex=[], offset=0):
def allindices(string, sub):
    """Return every index at which sub occurs in string, left to right.

    Overlapping occurrences are included (the scan advances by one
    character after each hit, not by len(sub)).
    """
    hits = []
    pos = -1
    while True:
        pos = string.find(sub, pos + 1)
        if pos < 0:
            break
        hits.append(pos)
    return hits

# Benchmark driver: parse the fixture 12000 times and report elapsed
# seconds per 2000-line batch. The original had `t1 = t2` mis-indented
# outside the `if` body, which is an IndentationError.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        print(str(t2 - t1))
Community
  • 1
  • 1
  • That is an awful lot of code. I'm not going to sit here and figure out all of what it's supposed to do. Could you describe the exact parsing, and/or show sample input and corresponding output? – Karl Knechtel Apr 10 '12 at 11:53
  • Maybe you're going about this a bit wrong? consider making a change to the [`log_format`](http://wiki.nginx.org/HttpLogModule) option on your server to suit your parser; perhaps make it look like json, or add an irregular separator (say, `|`) instead of a blank. – SingleNegationElimination Apr 10 '12 at 12:28

2 Answers2

3

This looks a bit like CSV; I wonder if the csv module can be abused into working with this?

>>> for row in csv.reader([line], delimiter=' '):
...     print repr(row)
... 
['0.278', '0.264', '113.116.52.174', '-', '10.10.3.41:20080', '', '[08/Apr/2012:23:59:08', '+0800]', 'shenzhen.anjuke.com', 'GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0', '200', '10914', 'http://shenzhen.anjuke.com/prop/view/104178677', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)', '-', '-', '-', '114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E']
SingleNegationElimination
  • 151,563
  • 33
  • 264
  • 304
3

Just wrote a regex based on the sample line, I don't actually know the meaning of some fields so I used placeholder names for them, you can rename them to more meaningful ones. On my machine this snippet is 4~5 times faster than your first one.

# Single pre-compiled verbose regex describing one whole access-log line.
# Group names are placeholders where the field meaning was unknown:
#   float1/float2 -> presumably request time / upstream response time
#                    (TODO confirm against the nginx log_format)
#   ip1           -> client address; ip_port_1 -> upstream addr:port
#   number        -> presumably bytes sent (TODO confirm)
# NOTE(review): under re.VERBOSE, unescaped whitespace in the pattern is
# ignored, so only the explicit \s / \s+ atoms below match separators.
log_line_re = re.compile(
r"""
(?P<float1>[0-9.]+)
\s
(?P<float2>[0-9.]+)
\s
(?P<ip1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<field1>.+?)
\s
(?P<ip_port_1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})
\s+
\[(?P<request_date>.+?)\]
\s
(?P<host>.+?)
\s
"
(?P<http_method>[A-Z]+)
\s
(?P<request_path>.+?)
\s
HTTP/(?P<http_version>[0-9.]+)
"
\s
(?P<status_code>\d{3})
\s
(?P<number>\d+)
\s
"
(?P<referer>.+?)
"
\s
"(?P<user_agent>.+?)"
\s
"(?P<field2>.+?)"
\s
"(?P<field3>.+?)"
\s
(?P<field4>.+?)
"
(?P<ip2>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<request_guid>.+?)
"
""", re.VERBOSE)


def convert(line):
    """Parse one access-log line into a dict of named fields.

    Returns the regex group dict, or None when the line does not match
    the expected format (the original raised AttributeError on
    `None.groupdict()` for any malformed line).
    """
    m = log_line_re.match(line)
    if m is None:
        return None
    return m.groupdict()
SAPikachu
  • 579
  • 1
  • 3
  • 17