summaryrefslogtreecommitdiff
path: root/admin/stats.py
blob: e79784ce2420cc1139eeca1005d7a4ab23f61a3b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from sys import argv

import user_agents


# Pattern for one line of a web-server access log (the layout matches the
# common "combined" format).  Only GET requests answered with status 200
# match; the fields are separated by single spaces.
ACCESS_RE = re.compile(
    r'(?P<address>\S+) '                          # client address
    r'\S+ '                                       # second field, ignored
    r'\S+ '                                       # third field, ignored
    r'(?P<date>\[\S+ \S+\]) '                     # bracketed timestamp
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+" '  # path, query not captured
    r'200 [0-9]+ '                                # status 200, then a number
    r'"(?P<referer>[^"]+)" '                      # quoted referer
    r'"(?P<useragent>[^"]+)"'                     # quoted user-agent string
)

# strptime() format of the text captured by the ``date`` group, brackets
# included.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Two consecutive accesses by the same visitor belong to the same visit
# when they are less than this far apart.
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """One request taken from the access log.

    Attributes:
        address: Client address as captured from the log line.
        useragent: Parsed user agent, i.e. the object returned by
            ``user_agents.parse`` -- not the raw header string.
        referer: Referer as captured from the log line.
        time: Timestamp of the request (timezone-aware, since DATE_FMT
            includes ``%z``).
        resource: Requested path; ACCESS_RE does not capture the query
            string.
    """
    address: str
    # Kept as a string annotation so that defining the class does not
    # require resolving the third-party type.
    useragent: 'user_agents.parsers.UserAgent'
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a successful ACCESS_RE match.

        Args:
            info: Object indexable by the group names ``address``,
                ``useragent``, ``referer``, ``date`` and ``resource``
                (typically an ``re.Match``).

        Returns:
            A new Access with the user agent parsed and the date
            converted to a ``datetime``.

        Raises:
            ValueError: If the date text does not follow DATE_FMT.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )

def interesting(resource):
    """Tell whether *resource* should be counted as a page view.

    Only paths ending in ``.html`` and the site root ``/`` qualify.
    """
    if resource.endswith('.html'):
        return True
    return resource == '/'

def parse(logs_path):
    """Read an access log and return the accesses worth analyzing.

    Args:
        logs_path: Path of the log file to read.

    Returns:
        A tuple of Access objects, in file order, one for each line that
        matches ACCESS_RE and whose resource passes interesting().
        Lines that do not match are skipped silently.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Request paths, referers and user agents are supplied by clients, so
    # the log may contain bytes that are not valid text.  Decode leniently
    # with a fixed encoding rather than aborting the whole run (or
    # depending on the locale's default encoding).
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        # Iterating the file streams it and splits on real line endings
        # only; str.splitlines() would also cut a record in two on
        # characters such as form feed or U+2028.  The trailing newline is
        # harmless because ACCESS_RE is only anchored at the start.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        # The tuple must be built while the file is still open.
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )

def key(access):
    """Return the string identifying the visitor behind *access*.

    The identifier combines the client address with the formatted user
    agent, so two accesses share a key when both are equal.
    """
    return '{} / {}'.format(access.address, access.useragent)

def visits(accesses):
    """Group accesses into visits, per visitor.

    Returns a defaultdict mapping each visitor key (see key()) to the list
    of that visitor's visits, where a visit is a list of accesses.  An
    access joins the visitor's latest visit when it comes less than
    VISIT_MAX_DURATION after that visit's most recent access; otherwise it
    opens a new visit.

    NOTE(review): the gap test presumably relies on *accesses* being in
    chronological order (log order); nothing here sorts or checks it.
    """
    by_visitor = defaultdict(list)

    for access in accesses:
        sessions = by_visitor[key(access)]

        # A first-time visitor has no session yet, so the gap test is
        # skipped and a new visit is opened.
        same_visit = (
            bool(sessions)
            and access.time - sessions[-1][-1].time < VISIT_MAX_DURATION
        )

        if same_visit:
            sessions[-1].append(access)
        else:
            sessions.append([access])

    return by_visitor

def order(grouped_visits):
    """Flatten per-visitor visits into a single mapping.

    Takes { visitor: [visits] } and returns { (visitor, t0): visit }, where
    t0 is the time of the visit's first access.  Entries keep the iteration
    order of the input; no sorting is performed.
    """
    return {
        (visitor, visit[0].time): visit
        for visitor, visitor_visits in grouped_visits.items()
        for visit in visitor_visits
    }

def analyze(logs_path):
    """Print visitor, visit and per-page hit counts for a log file.

    Output goes to standard output: the number of distinct visitors, the
    number of visits, then one "hits page" line per resource, most
    requested first.
    """
    accesses = parse(logs_path)
    by_visitor = visits(accesses)
    by_start = order(by_visitor)

    print('Visiteurs :', len(by_visitor))
    print('Visites :', len(by_start))

    hits_per_page = Counter()
    hits_per_page.update(access.resource for access in accesses)

    for page, hits in hits_per_page.most_common():
        print(hits, page)


if __name__ == '__main__':
    # Exit with a usage message rather than an IndexError traceback when
    # the log path is missing.  Extra arguments are still ignored.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} ACCESS_LOG')
    analyze(argv[1])