summaryrefslogtreecommitdiff
path: root/admin/stats/util.py
blob: ced69d95cab36f795076aad41890735ce836899b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
import re
from sys import argv

import user_agents


# Pattern for a single access-log line.  The pieces below are joined with
# single spaces, so fields must be separated by exactly one space.  Only
# GET requests answered with status 200 can match.
# NOTE(review): the field layout looks like the "combined" access-log
# format used by Apache/nginx -- confirm against the server configuration.
ACCESS_RE = re.compile(' '.join((
    # Client address (first whitespace-delimited field).
    r'(?P<address>\S+)',
    # Two fields that are matched but not captured.
    r'\S+',
    r'\S+',
    # Bracketed timestamp made of two space-separated parts; it is parsed
    # with DATE_FMT.
    r'(?P<date>\[\S+ \S+\])',
    # Quoted request line; only the requested resource is captured.
    r'"GET (?P<resource>\S+) [^"]+"',
    # Status 200 followed by a number (presumably the response size).
    r'200 [0-9]+',
    # Quoted, non-empty referer and user-agent strings.
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime format for the ``date`` group captured by ACCESS_RE, including
# the surrounding brackets and the UTC offset.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'


@dataclass
class Access:
    """One page request extracted from an access-log line.

    Attributes:
        address: Client address as written in the log.
        useragent: Parsed user agent, i.e. the object returned by
            ``user_agents.parse`` (not the raw header string).
        referer: Referer string as written in the log.
        time: Timestamp of the request, parsed with ``DATE_FMT``.
        resource: Requested resource (path and optional query string).
    """
    address: str
    # Written as a string so the annotation is never evaluated: the field
    # holds the parsed user-agent object, not a plain ``str``.
    useragent: 'user_agents.parsers.UserAgent'
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an ``Access`` from the named groups of a log-line match.

        Args:
            info: Mapping-like object (such as an ``ACCESS_RE`` match)
                providing the ``address``, ``useragent``, ``referer``,
                ``date`` and ``resource`` entries.

        Returns:
            A new ``Access`` whose ``useragent`` is parsed by
            ``user_agents.parse`` and whose ``time`` is parsed from the
            ``date`` entry.

        Raises:
            ValueError: If the ``date`` entry does not match ``DATE_FMT``.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )

def interesting(resource):
    """Tell whether *resource* counts as a page view.

    A resource qualifies when it ends in ``.html``, is exactly the site
    root ``/``, or is the site root followed by a query string.
    """
    if resource.endswith('.html'):
        return True
    if resource == '/':
        return True
    return resource.startswith('/?')

def parse(logs_path):
    """Read an access log and return its interesting accesses.

    Lines that do not match ``ACCESS_RE`` and lines whose resource is not
    ``interesting`` are dropped.

    Args:
        logs_path: Path of the access-log file to read.

    Returns:
        A tuple of ``Access`` objects, in the order of the log lines.
    """
    with open(logs_path) as logs_file:
        lines = logs_file.read().splitlines()

    accesses = []
    for line in lines:
        hit = ACCESS_RE.match(line)
        if hit is None:
            continue
        if interesting(hit['resource']):
            accesses.append(Access.from_log(hit))
    return tuple(accesses)

def key(access):
    """Return the label identifying the visitor behind *access*.

    The label combines the formatted address and user agent, separated
    by `` / ``.
    """
    address, agent = access.address, access.useragent
    return f'{address} / {agent}'

def group(accesses):
    """Bucket *accesses* by visitor.

    Args:
        accesses: Iterable of access records exposing ``time``,
            ``resource`` and ``referer`` (plus whatever ``key`` reads).

    Returns:
        A dict mapping each ``key`` label to a tuple of
        ``(time, resource, referer)`` triples.  Labels appear in sorted
        order; because the sort is stable, the triples of one visitor
        keep their input order.
    """
    # groupby only merges adjacent items, hence the sort on the same key.
    ordered = sorted(accesses, key=key)
    return {
        visitor: tuple((hit.time, hit.resource, hit.referer) for hit in hits)
        for visitor, hits in groupby(ordered, key=key)
    }

def span(visits):
    """Return the start and the extent of a sequence of visits.

    Args:
        visits: Non-empty sequence of tuples whose first item is a time.

    Returns:
        A pair ``(first, last - first)`` built from the time of the first
        and of the last visit.

    Raises:
        IndexError: If *visits* is empty.
    """
    first_time = visits[0][0]
    last_time = visits[-1][0]
    return first_time, last_time - first_time

def dump(logs_path):
    """Print a per-visitor report of the access log at *logs_path*.

    For every visitor the report shows the visitor label, the time of
    the first visit together with the span up to the last one, and one
    ``resource referer`` line per visit, followed by an empty line.
    """
    visitors = group(parse(logs_path))
    for visitor, visits in visitors.items():
        print(visitor)

        # span() yields (start, duration); print both on one line.
        print(*span(visits))

        for _, resource, referer in visits:
            print(f'{resource} {referer}')

        print()


if __name__ == '__main__':
    # Exit with a usage hint rather than an IndexError traceback when the
    # log path is missing; any extra arguments are still ignored.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} LOGS_PATH')
    dump(argv[1])