#!/usr/bin/env python3
"""Analyze a web-server access log: count visitors/visits per page and
render each visit's referer→resource navigation as a PDF graph.

Usage: analyze.py ACCESS_LOG_PATH

External tools required at runtime: Graphviz ``dot`` and ``qpdf``.
"""
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from os import remove
from subprocess import run
from sys import argv

import user_agents

# Combined-log-format line, restricted to successful (200) GET requests.
# Field order: client address, identd, user, timestamp, request line,
# status + size, referer, user agent.  Group names match the keys read
# in Access.from_log.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"',
)))
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
# Two accesses by the same (IP, user agent) further apart than this are
# counted as separate visits.
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """One successful GET request extracted from the access log."""

    address: str
    # NOTE: despite the historical str annotation intent, this holds the
    # object returned by user_agents.parse(); it is only ever used via
    # its str() form (see key()).
    useragent: object
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a regex match (or any mapping) with keys
        address/useragent/referer/date/resource.

        Normalizes '/' to '/index.html' and strips the site's own scheme,
        host and query string from internal referers so that internal
        navigation edges use site-relative paths.
        """
        resource = info['resource']
        if resource == '/':
            resource = '/index.html'
        referer = re.sub(
            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
            r'\1',
            info['referer']
        )
        if referer == '/':
            referer = '/index.html'
        return cls(
            info['address'],
            user_agents.parse(info['useragent']),
            referer,
            datetime.strptime(info['date'], DATE_FMT),
            resource
        )


def interesting(resource):
    """Keep only page views (HTML documents), not assets."""
    return (
        resource.endswith('.html')
        or resource == '/'
    )


def parse(logs_path):
    """Parse the log file at *logs_path* into a tuple of Access objects.

    Lines that do not match ACCESS_RE, non-page resources, and self-hits
    (referer containing 'klg.uber.space') are dropped.
    """
    with open(logs_path) as logs_file:
        logs = logs_file.read().splitlines()
    matches = (ACCESS_RE.match(l) for l in logs)
    return tuple(
        Access.from_log(m) for m in matches
        if (m is not None
            and interesting(m['resource'])
            and 'klg.uber.space' not in m['referer'])
    )


def key(access):
    """Visitor identity: the (IP, user agent) pair, as a display string."""
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    """Group *accesses* into visits per visitor.

    Returns { visitor key ↦ list of visits }, where a visit is a list of
    accesses.  An access joins the visitor's current visit when it falls
    within VISIT_MAX_DURATION of that visit's last access; otherwise it
    starts a new visit.  Assumes *accesses* is in chronological order.
    """
    visits = defaultdict(list)
    for access in accesses:
        visitor = key(access)
        if visitor in visits:
            last_access = visits[visitor][-1][-1].time
            if access.time - last_access < VISIT_MAX_DURATION:
                visits[visitor][-1].append(access)
                continue
        visits[visitor].append([access])
    return visits


def order(grouped_visits):
    """Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }."""
    visits = {}
    for i, i_visits in grouped_visits.items():
        for v in i_visits:
            visits[(i, v[0].time)] = v
    return visits


def visit_graph(accesses):
    """Render one visit as Graphviz source: one referer→resource edge
    per access."""
    edges = (f'  "{a.referer}" -> "{a.resource}";' for a in accesses)
    return '\n'.join(('digraph visit {', *edges, '}'))


def graph(visits):
    """Render every visit to its own PDF via ``dot``, then concatenate
    them into DATE.pdf with ``qpdf`` and delete the per-visit files.

    The output name uses the date of the first visit's first access.
    """
    if not visits:
        # Nothing to render; avoid IndexError on visits[0] below.
        return
    date = visits[0][0].time.strftime('%F')
    tempfiles = {
        f'{date}-{i}.pdf': visit
        for i, visit in enumerate(visits)
    }
    for tempfile, visit in tempfiles.items():
        vgraph = visit_graph(visit)
        with open(tempfile, 'wb') as vfile:
            vfile.write(
                run(('dot', '-Tpdf'),
                    text=False, check=True, capture_output=True,
                    input=vgraph.encode())
                .stdout
            )
    run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
        check=True)
    for f in tempfiles:
        remove(f)


def analyze(logs_path):
    """Entry point: print visitor/visit counts and per-page hit counts,
    then produce the per-day visit-graph PDF."""
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)
    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))
    pagehits = Counter(a.resource for a in accesses)
    for page, hits in pagehits.most_common():
        print(hits, page)
    graph(tuple(visits_by_time.values()))


if __name__ == '__main__':
    analyze(argv[1])