diff options
| author | Kévin Le Gouguec <kevin.legouguec@gmail.com> | 2021-03-20 18:06:41 +0100 |
|---|---|---|
| committer | Kévin Le Gouguec <kevin.legouguec@gmail.com> | 2021-03-20 18:06:41 +0100 |
| commit | 0ae67d029c4bc191f14a2c34ccad2a4c670c23ee (patch) | |
| tree | c623448511893db590ac7d16d537560540e05697 /admin/stats/util.py | |
| parent | b8e8a12b1f76e2649ff43b4cf10bcdeb04696c1c (diff) | |
| download | quatuorbellefeuille.com-0ae67d029c4bc191f14a2c34ccad2a4c670c23ee.tar.xz | |
Start analyzing visits
Diffstat (limited to 'admin/stats/util.py')
| -rw-r--r-- | admin/stats/util.py | 87 |
1 files changed, 87 insertions, 0 deletions
# Reconstructed from the commit diff for admin/stats/util.py
# (new file mode 100644, index 0000000..ced69d9).
"""Group web-server access-log entries into per-visitor page visits.

Run as a script with the path to an access log; for each visitor
(address + user agent) it prints the first access time, the time span
between first and last access, and the pages requested.
"""

from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
import re
from sys import argv

import user_agents


# One access-log line, space-separated: client address, two ignored
# fields, bracketed timestamp, quoted GET request, status 200 plus a
# numeric size, quoted referer, quoted user agent.  Only successful
# (200) GET requests match.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>\S+) [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime format of the bracketed timestamp; %z makes the parsed
# datetimes timezone-aware, so they can be compared and subtracted.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'


@dataclass
class Access:
    """A single matched request from the access log."""

    address: str
    # Result of user_agents.parse(), not the raw header string; it is
    # only ever formatted into text (see key()).
    useragent: object
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from an ACCESS_RE match.

        ``info`` must support lookup of the 'address', 'useragent',
        'referer', 'date' and 'resource' groups.  Raises ValueError if
        the date does not follow DATE_FMT.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )


def interesting(resource):
    """Return True if ``resource`` is a page rather than an asset.

    Pages are paths ending in '.html', the site root '/', and the site
    root followed by a query string.
    """
    return (
        resource.endswith('.html')
        or resource == '/'
        or resource.startswith('/?')
    )


def parse(logs_path):
    """Return a tuple of Access objects for the page hits in a log file.

    Lines that do not match ACCESS_RE, or whose resource is not
    interesting(), are skipped.
    """
    # Request lines and user agents are client-controlled and may hold
    # bytes that are not valid UTF-8; replace them instead of aborting
    # the whole analysis.  Iterating the file streams it line by line
    # rather than loading it all into memory first.
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        matches = (ACCESS_RE.match(line) for line in logs_file)
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )


def key(access):
    """Return the string identifying the visitor behind ``access``."""
    return f'{access.address} / {access.useragent}'


def group(accesses):
    """Group accesses by visitor.

    Returns a dict mapping each visitor key (see key()) to a tuple of
    (time, resource, referer) triples.  Keys are inserted in sorted
    order and each visitor's triples are ordered oldest first.
    """
    # Compute each key once, then order by visitor and by time within a
    # visitor: groupby() needs equal keys to be adjacent, and span()
    # relies on every group being chronological.
    keyed = sorted(
        ((key(access), access) for access in accesses),
        key=lambda pair: (pair[0], pair[1].time),
    )

    groups = {}
    for visitor, pairs in groupby(keyed, key=lambda pair: pair[0]):
        groups[visitor] = tuple(
            (access.time, access.resource, access.referer)
            for _, access in pairs
        )
    return groups


def span(visits):
    """Return (first access time, duration) for one visitor.

    ``visits`` is a non-empty sequence of (time, resource, referer)
    triples ordered oldest first, as produced by group().  The duration
    is the last time minus the first.  Raises IndexError when
    ``visits`` is empty.
    """
    first_time = visits[0][0]
    last_time = visits[-1][0]
    return (first_time, last_time - first_time)


def dump(logs_path):
    """Print a per-visitor summary of the log at ``logs_path``.

    For each visitor: the visitor key, the first access time and visit
    duration, one 'resource referer' line per access, then a blank
    line.
    """
    for k, accesses in group(parse(logs_path)).items():
        print(k)

        t, duration = span(accesses)
        print(t, duration)

        for (_, rsrc, ref) in accesses:
            print(f'{rsrc} {ref}')

        print()


if __name__ == '__main__':
    # Fail with a usage message instead of a bare IndexError when the
    # log path is missing.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    dump(argv[1])
