"""Summarise page visits recorded in a web server access log.

Each log line is expected to look like the widely used "combined" access-log
layout: client address, two ignored fields, a bracketed timestamp, the quoted
request line, status and size, then the quoted referer and user-agent.  Only
successful (status 200) ``GET`` requests for page-like resources are kept.
They are grouped per visitor (address + user agent) and printed.

Usage: run the script with the path of the access log as its only argument.
"""

from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
import re
from sys import argv

import user_agents

# One space-separated field per fragment.  The named groups are exactly the
# keys that Access.from_log() and parse() look up on the match object.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>\S+) [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"',
)))

# strptime format for the ``date`` group; the square brackets are part of the
# captured text, so they are part of the format as well.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'


@dataclass
class Access:
    """A single successful page request extracted from the log."""

    address: str
    # Parsed user agent as returned by user_agents.parse(), not the raw
    # header string; key() relies on its string form.
    useragent: 'user_agents.parsers.UserAgent'
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a successful ``ACCESS_RE`` match.

        Args:
            info: Match object (or any mapping-like object) exposing the
                ``address``, ``useragent``, ``referer``, ``date`` and
                ``resource`` keys.

        Raises:
            ValueError: If the ``date`` text does not follow ``DATE_FMT``.
        """
        return cls(
            info['address'],
            user_agents.parse(info['useragent']),
            info['referer'],
            datetime.strptime(info['date'], DATE_FMT),
            info['resource'],
        )


def interesting(resource: str) -> bool:
    """Return True when *resource* is a page rather than an asset.

    A page is the site root (``/``), the root with a query string (``/?...``)
    or anything ending in ``.html``.
    """
    return (
        resource.endswith('.html')
        or resource == '/'
        or resource.startswith('/?')
    )


def parse(logs_path):
    """Read the log at *logs_path* and return the interesting accesses.

    Lines that do not match ``ACCESS_RE`` (other methods, other status
    codes, malformed lines) are silently skipped.

    Returns:
        A tuple of Access objects in the order they appear in the file.
    """
    with open(logs_path) as logs_file:
        # Iterate the file directly: this avoids loading the whole log into
        # memory and only splits records on real line endings.  The pattern
        # is anchored at the start only, so the trailing newline is harmless.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        # The tuple must be built while the file is still open because the
        # generator above reads from it lazily.
        return tuple(
            Access.from_log(m)
            for m in matches
            if m is not None and interesting(m['resource'])
        )


def key(access) -> str:
    """Return the visitor identity of *access*: address plus user agent."""
    return f'{access.address} / {access.useragent}'


def group(accesses) -> dict:
    """Group *accesses* by visitor.

    Returns:
        A dict mapping each visitor key (see key()) to a tuple of
        ``(time, resource, referer)`` triples.  Keys appear in sorted order;
        because the sort is stable, the triples of one visitor keep the
        order they had in *accesses*.
    """
    groups = {}
    # groupby() only merges adjacent items, hence the sort by the same key.
    for k, g in groupby(sorted(accesses, key=key), key=key):
        groups[k] = tuple(
            (a.time, a.resource, a.referer)
            for a in g
        )
    return groups


def span(visits):
    """Return ``(start, duration)`` for a non-empty sequence of visits.

    *visits* holds ``(time, resource, referer)`` triples.  The first and
    last entries are taken as the start and end, so the result is only
    meaningful when the visits are in chronological order.

    Raises:
        IndexError: If *visits* is empty.
    """
    return (visits[0][0], visits[-1][0] - visits[0][0])


def dump(logs_path):
    """Print a per-visitor report for the log at *logs_path*.

    For every visitor: the visitor key, the first access time and the time
    between first and last access, then one ``resource referer`` line per
    access, followed by a blank line.
    """
    for k, accesses in group(parse(logs_path)).items():
        print(k)
        t, duration = span(accesses)
        print(t, duration)
        for (_, rsrc, ref) in accesses:
            print(f'{rsrc} {ref}')
        print()


if __name__ == '__main__':
    if len(argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    dump(argv[1])