from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
import re
from sys import argv
import user_agents
# Pattern for one access-log line describing a GET request answered with
# status 200.  Fields are separated by single spaces; the named groups are the
# keys that ``Access.from_log`` reads from the resulting match object.  Lines
# with another method or status, or with a different layout, do not match.
# NOTE(review): the referer-before-user-agent order of the two trailing quoted
# fields is assumed from the usual "combined" log layout -- confirm against
# the logs this script is run on.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',                # client address
    r'\S+',                             # ignored field
    r'\S+',                             # ignored field
    r'(?P<date>\[\S+ \S+\])',           # bracketed timestamp, see DATE_FMT
    r'"GET (?P<resource>\S+) [^"]+"',   # request line; only the path is kept
    r'200 [0-9]+',                      # status 200 followed by a number
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"',
)))

# ``strptime`` format for the text captured by the ``date`` group, brackets
# included, e.g. ``[10/Oct/2000:13:55:36 -0700]``.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
@dataclass
class Access:
    """A single request taken from one matched access-log line."""

    address: str
    # ``from_log`` stores the object returned by ``user_agents.parse`` here,
    # not the raw header text, so the field is not a plain ``str``.  The
    # annotation is a string so nothing is looked up at class creation.
    useragent: 'user_agents.parsers.UserAgent'
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an ``Access`` from a successful ``ACCESS_RE`` match.

        Args:
            info: Object indexable by the group names ``address``,
                ``useragent``, ``referer``, ``date`` and ``resource``
                (a match object produced by ``ACCESS_RE``).

        Returns:
            A new ``Access`` whose ``useragent`` is the parsed user agent and
            whose ``time`` is the ``date`` text parsed with ``DATE_FMT``.

        Raises:
            ValueError: If the ``date`` text does not follow ``DATE_FMT``.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )
def interesting(resource):
    """Decide whether a requested path should be kept.

    Returns ``True`` when *resource* is exactly ``/``, starts with ``/?``
    (the root with a query string), or ends with ``.html``; ``False``
    otherwise.
    """
    if resource == '/':
        return True
    return resource.startswith('/?') or resource.endswith('.html')
def parse(logs_path):
    """Read an access log and return the accesses worth reporting.

    Each line is matched against ``ACCESS_RE``; lines that do not match, or
    whose ``resource`` is rejected by ``interesting``, are skipped.

    Args:
        logs_path: Path of the log file to read.

    Returns:
        A tuple of ``Access`` objects, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a matched line carries a timestamp that does not
            follow ``DATE_FMT`` (raised by ``Access.from_log``).
    """
    # Paths, referers and user agents are client-supplied text, so a log may
    # contain bytes that are not valid in any particular encoding.  Decode
    # with a fixed codec instead of the locale default, and substitute
    # undecodable bytes so one odd request cannot abort the whole run.
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        lines = logs_file.read().splitlines()

    accesses = []
    for line in lines:
        found = ACCESS_RE.match(line)
        if found is not None and interesting(found['resource']):
            accesses.append(Access.from_log(found))
    return tuple(accesses)
def key(access):
    """Return the grouping label ``"<address> / <useragent>"`` for *access*."""
    return '{} / {}'.format(access.address, access.useragent)
def group(accesses):
    """Collect accesses under their ``key`` label.

    Returns a dict mapping each label to a tuple of
    ``(time, resource, referer)`` triples.  Labels are inserted in ascending
    order, and triples under one label keep the order they had in
    *accesses*.
    """
    buckets = {}
    # ``sorted`` is stable, so walking the sorted sequence fills the dict in
    # label order while preserving the input order inside each label.
    for access in sorted(accesses, key=key):
        visit = (access.time, access.resource, access.referer)
        buckets.setdefault(key(access), []).append(visit)
    return {label: tuple(visits) for label, visits in buckets.items()}
def span(visits):
    """Return ``(first_time, last_time - first_time)`` for *visits*.

    *visits* is a non-empty sequence whose items carry their time at index 0;
    the first and last items are taken as the two ends of the span.
    """
    first_time = visits[0][0]
    last_time = visits[-1][0]
    return first_time, last_time - first_time
def dump(logs_path):
    """Print a report of the log at *logs_path* to standard output.

    For every group returned by ``group(parse(logs_path))`` this prints the
    group label, then the first access time and the time span on one line,
    then one ``resource referer`` line per access, then an empty line.
    """
    grouped = group(parse(logs_path))
    for label in grouped:
        visits = grouped[label]
        print(label)
        start, length = span(visits)
        print(start, length)
        for _when, resource, referer in visits:
            print(f'{resource} {referer}')
        print()
if __name__ == '__main__':
    # Script entry point: the first command-line argument is the path of the
    # access log to report on; any further arguments are ignored.
    if len(argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        raise SystemExit(
            'usage: expected the path to an access log as the first argument'
        )
    dump(argv[1])