Recently I've been working on analyzing nginx access logs with Python.
I found a way to split a quoted string on spaces using shlex,
according to this
But it's really slow: analyzing 2000 log lines takes more than 1.2 seconds, and my nginx server generates more than 2500 lines per second.
So I tried using re,
or a more native (and crude) way that works with indices into the string.
Both versions run in a virtual machine, and each takes more than 0.5 seconds for 2000 log lines.
Is there any other way to make this more efficient?
Thanks in advance
Here's my code
import re
import time
import datetime
# Sample nginx access-log line; the timing loop below passes it to convert()
# on every iteration.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# Patterns are built once at import time; convert() runs once per log line,
# so nothing loop-invariant should be redone inside it.
_WHITESPACE_RE = re.compile(r"\s+")
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_REQUEST_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
_SERVER_GUID_RE = re.compile(r"^([0-9\.]+)\s+(.*)")


def convert(line):
    """Parse one access-log line into a dict of typed fields.

    The line is cut at every double quote. The unquoted text before the
    first quote is split on whitespace and read by position; the remaining
    fields are taken from the quoted sections (odd indices after the cut)
    and from the unquoted "status bytes" section that follows the request.

    Args:
        line: One log line laid out like the sample at the top of the script.

    Returns:
        A dict with the keys ``hour``, ``year``, ``date`` (month * 100 + day),
        ``time`` (the minute), ``sec``, ``request_time`` and
        ``upstream_response_time`` (milliseconds; the latter is -1 when the
        log holds ``-``), ``remote_addr``, ``upstream_addr`` (port removed),
        ``host``, ``method`` (``''`` when the request is not a
        GET/POST/OPTIONS over HTTP/1.x), ``request_uri`` (the ``findall``
        result, i.e. a list with zero or one string), ``status``,
        ``bytes_sent``, ``http_referer``, ``user_agent``, ``gzip_ratio``,
        ``http_x_forwarded_for``, ``server_addr`` and ``guid``.

    Raises:
        IndexError: If the line has fewer fields than are read, or the
            status/bytes or server-address/GUID section does not match.
        ValueError: If a timing value or the timestamp cannot be parsed.
    """
    quoted = line.split('"')
    head = _WHITESPACE_RE.split(quoted[0])
    request = quoted[1]

    method_match = _METHOD_RE.match(request)
    http_method = method_match.group(1) if method_match else ''

    # findall()[0] keeps the IndexError raised on a non-matching section.
    status_text, bytes_text = _STATUS_BYTES_RE.findall(quoted[2])[0]
    server_addr, guid = _SERVER_GUID_RE.findall(quoted[11])[0]

    request_time = int(float(head[0]) * 1000)
    if head[1] == '-':
        upstream_response_time = -1
    else:
        upstream_response_time = int(float(head[1]) * 1000)

    # Only the "[dd/Mon/yyyy:HH:MM:SS" token is parsed; the UTC-offset token
    # that follows it is not read, so mktime() treats the value as local time.
    dt = datetime.datetime.strptime(head[5].replace('[', ''),
                                    "%d/%b/%Y:%H:%M:%S")

    return {
        "hour": dt.hour,
        "year": dt.year,
        "date": dt.month * 100 + dt.day,
        "time": dt.minute,
        "sec": time.mktime(dt.timetuple()),
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": head[2],
        "upstream_addr": _PORT_SUFFIX_RE.sub("", head[4]),
        "host": head[7],
        "method": http_method,
        "request_uri": _REQUEST_URI_RE.findall(request),
        "status": int(status_text),
        "bytes_sent": int(bytes_text),
        "http_referer": quoted[3],
        "user_agent": quoted[5],
        "gzip_ratio": quoted[7],
        "http_x_forwarded_for": quoted[9],
        "server_addr": server_addr,
        "guid": guid,
    }
# Benchmark: call convert() 12000 times on the sample line and print the
# wall-clock seconds taken by each batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        # Slide the window: the previous batch end becomes this batch start.
        t1 = t2
        t2 = time.time()
        # Parenthesised so the script parses on Python 3 as well as Python 2.
        print(str(t2 - t1))
and the
index-based way:
import time
import datetime
# Sample nginx access-log line; the timing loop below passes it to convert()
# on every iteration.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
def pair(l):
    """Yield consecutive, non-overlapping 2-tuples from the sequence *l*.

    Elements 0 and 1 form the first tuple, 2 and 3 the second, and so on.
    An odd-length sequence raises IndexError once the unpaired last element
    is reached.
    """
    left = 0
    total = len(l)
    while left < total:
        yield l[left], l[left + 1]
        left += 2
def convert(line):
    """Split *line* on spaces that lie outside double-quoted sections.

    Quote characters are kept in the returned fields, and a space inside a
    quoted section never ends a field. Consecutive unquoted spaces produce
    empty-string fields.

    Args:
        line: The text to split.

    Returns:
        The list of fields in order, starting with the text before the first
        separator, or ``None`` when *line* contains no double quotes or an
        odd number of them.
    """
    quote_total = line.count('"')
    if quote_total == 0 or quote_total % 2 != 0:
        return None

    fields = []
    field_start = 0
    # Invariant: search_from always points outside any quoted section.
    search_from = 0
    while True:
        space = line.find(" ", search_from)
        if space < 0:
            break
        opening = line.find('"', search_from, space)
        if opening >= 0:
            # A quoted section opens before this space; resume after its
            # closing quote, which must exist because quotes are balanced.
            search_from = line.find('"', opening + 1) + 1
            continue
        fields.append(line[field_start:space])
        field_start = search_from = space + 1
    fields.append(line[field_start:])
    return fields
# def allindices(string, sub, listindex=[], offset=0):
def allindices(string, sub):
    """Return every index at which *sub* occurs in *string*, in ascending order.

    The search restarts one character after each hit, so overlapping
    occurrences are all reported. The result is empty when *sub* is absent.
    """
    positions = []
    search_from = 0
    while True:
        hit = string.find(sub, search_from)
        if hit < 0:
            return positions
        positions.append(hit)
        search_from = hit + 1
# Benchmark: call convert() 12000 times on the sample line and print the
# wall-clock seconds taken by each batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        # Slide the window: the previous batch end becomes this batch start.
        t1 = t2
        t2 = time.time()
        # Parenthesised so the script parses on Python 3 as well as Python 2.
        print(str(t2 - t1))