diff options
Diffstat (limited to 'admin/stats/old.py')
| -rwxr-xr-x | admin/stats/old.py | 153 |
1 files changed, 153 insertions, 0 deletions
#!/usr/bin/env python3
"""Summarize website visits from a web-server access log.

Usage: old.py ACCESS_LOG

Prints the number of visitors, the number of visits and the hit count of
each HTML page, then renders one referer -> resource graph per visit with
Graphviz ``dot`` and merges them into ``<date>.pdf`` with ``qpdf``.
"""

from collections import Counter, defaultdict
from contextlib import suppress
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from os import remove
from subprocess import run
from sys import argv

import user_agents


# One access-log line: address, two ignored fields, bracketed date, the
# request, status and size, referer, user agent.  Only GET requests
# answered with status 200 match; any query string is dropped from the
# captured resource.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Two accesses by the same visitor that are at least this far apart
# belong to different visits.
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """A single successful page request extracted from the access log."""

    address: str
    # Result of user_agents.parse(); key() relies on its str() form.
    useragent: object
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from an ACCESS_RE match (or equivalent mapping).

        '/' is normalized to '/index.html' for both the resource and the
        referer.  Referers pointing at the site itself are reduced to
        their path, without query string.
        """
        resource = info['resource']
        if resource == '/':
            resource = '/index.html'

        referer = re.sub(
            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
            r'\1',
            info['referer']
        )
        if referer == '/':
            referer = '/index.html'

        return cls(
            info['address'], user_agents.parse(info['useragent']),
            referer, datetime.strptime(info['date'], DATE_FMT),
            resource
        )


def interesting(resource):
    """Return True for resources that count as page views."""
    return (
        resource.endswith('.html')
        or resource == '/'
    )


def parse(logs_path):
    """Return a tuple of Access objects for the page views in the log.

    Lines that do not match ACCESS_RE, that do not target an HTML page,
    or whose referer mentions 'klg.uber.space' are skipped.
    """
    # errors='replace': access logs carry client-controlled bytes, and a
    # single undecodable one should not abort the whole analysis.
    # Iterating over the file avoids holding the raw log in memory.
    with open(logs_path, errors='replace') as logs_file:
        matches = (ACCESS_RE.match(line) for line in logs_file)
        return tuple(
            Access.from_log(m) for m in matches
            if (m is not None
                and interesting(m['resource'])
                and 'klg.uber.space' not in m['referer'])
        )


def key(access):
    """Return the string identifying the visitor behind an access."""
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    """Group accesses into visits, per visitor.

    Returns a mapping from visitor key (see key()) to a list of visits,
    a visit being a list of accesses.  An access is appended to the
    visitor's latest visit when it happens less than VISIT_MAX_DURATION
    after the previous access of that visitor; otherwise it starts a new
    visit.  Accesses are processed in the order given.
    """
    grouped = defaultdict(list)

    for access in accesses:
        visitor_visits = grouped[key(access)]

        if (visitor_visits
                and access.time - visitor_visits[-1][-1].time
                < VISIT_MAX_DURATION):
            visitor_visits[-1].append(access)
        else:
            visitor_visits.append([access])

    return grouped


def order(grouped_visits):
    """Flatten visits into a dict ordered by visit start time.

    Turns { visitor: [visits] } into { (visitor, t0): accesses }, where
    t0 is the time of the first access of the visit.
    """
    flat = []

    for visitor, visitor_visits in grouped_visits.items():
        for visit in visitor_visits:
            flat.append(((visitor, visit[0].time), visit))

    # Sort on t0 only; the sort is stable, so visits starting at the same
    # instant keep their grouping order.
    flat.sort(key=lambda item: item[0][1])
    return dict(flat)


def visit_graph(accesses):
    """Return DOT source with one referer -> resource edge per access."""
    edges = (f'  "{a.referer}" -> "{a.resource}";'
             for a in accesses)
    return '\n'.join(('digraph visit {', *edges, '}'))


def graph(visits):
    """Render every visit as a PDF page and merge them into '<date>.pdf'.

    <date> is the day (YYYY-MM-DD) of the first access of the first
    visit.  Requires the 'dot' and 'qpdf' executables; raises
    subprocess.CalledProcessError if either fails.  Does nothing when
    there are no visits.
    """
    if not visits:
        return

    # Portable spelling of the platform-specific strftime('%F').
    date = visits[0][0].time.date().isoformat()
    tempfiles = []

    try:
        for i, visit in enumerate(visits):
            rendered = run(
                ('dot', '-Tpdf'), check=True, capture_output=True,
                input=visit_graph(visit).encode()
            ).stdout

            tempfile = f'{date}-{i}.pdf'
            # Registered before writing so that a partially written file
            # is cleaned up as well.
            tempfiles.append(tempfile)
            with open(tempfile, 'wb') as vfile:
                vfile.write(rendered)

        run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
            check=True)
    finally:
        # Do not leave per-visit files behind, even when a tool failed.
        for tempfile in tempfiles:
            with suppress(FileNotFoundError):
                remove(tempfile)


def analyze(logs_path):
    """Print visit statistics for the log and write the visit graphs."""
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)

    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))

    pagehits = Counter(a.resource for a in accesses)
    for page, hits in pagehits.most_common():
        print(hits, page)

    graph(tuple(visits_by_time.values()))


if __name__ == '__main__':
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    analyze(argv[1])
