diff options
Diffstat (limited to 'admin/stats')
| -rw-r--r-- | admin/stats/util.py | 106 |
1 files changed, 0 insertions, 106 deletions
"""Visitor and visit statistics computed from a web server access log.

Run as a script with the path of the log file as first argument; the
figures are printed on standard output.
"""

from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from sys import argv

import user_agents


# One access-log line, field by field: client address, two ignored fields,
# bracketed date, request line, status and size, quoted referer, quoted user
# agent. Only GET requests answered with status 200 match, and the query
# string (if any) is kept out of the `resource` group.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime format of the `date` group, brackets included.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# An access that comes at least this long after the visitor's previous access
# starts a new visit.
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """A single request taken from the access log."""

    address: str
    # Value returned by user_agents.parse(), not the raw header string.
    useragent: object
    referer: str
    time: datetime
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from an ACCESS_RE match.

        `info` only needs to support lookup by the group names 'address',
        'useragent', 'referer', 'date' and 'resource'.

        Raises ValueError if the date does not follow DATE_FMT.
        """
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )


def interesting(resource):
    """Return True if `resource` is '/' or ends with '.html'."""
    return resource == '/' or resource.endswith('.html')


def parse(logs_path):
    """Read the access log at `logs_path` and return the page accesses.

    Lines that do not match ACCESS_RE, and matches whose resource is not
    interesting(), are skipped. Returns a tuple of Access objects in file
    order.
    """
    # Request lines, referers and user agents are supplied by clients, so
    # undecodable bytes are replaced instead of aborting the whole analysis.
    with open(logs_path, errors='replace') as logs_file:
        # Iterating the file keeps a single line in memory at a time, and
        # only breaks records at real newlines (str.splitlines() would also
        # break at form feeds, U+2028 and similar characters). The trailing
        # newline is harmless: ACCESS_RE is not anchored at the end.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )


def key(access):
    """Return the string that identifies the visitor behind `access`."""
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    """Group accesses into visits, per visitor.

    Returns a defaultdict mapping key(access) to that visitor's list of
    visits, each visit being a list of accesses in the order they were given.
    An access joins the visitor's latest visit when it comes less than
    VISIT_MAX_DURATION after the last access of that visit; otherwise it
    opens a new visit.
    """
    by_visitor = defaultdict(list)

    for access in accesses:
        visitor_visits = by_visitor[key(access)]

        if (
            visitor_visits
            and access.time - visitor_visits[-1][-1].time < VISIT_MAX_DURATION
        ):
            visitor_visits[-1].append(access)
        else:
            visitor_visits.append([access])

    return by_visitor


def order(grouped_visits):
    """Flatten {visitor: [visits]} into {(visitor, start time): accesses}.

    The start time of a visit is the time of its first access.
    """
    return {
        (visitor, visit[0].time): visit
        for visitor, visitor_visits in grouped_visits.items()
        for visit in visitor_visits
    }


def analyze(logs_path):
    """Print visitor count, visit count and hits per page for a log file."""
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)

    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))

    pagehits = Counter(a.resource for a in accesses)
    for page, hits in pagehits.most_common():
        print(hits, page)


if __name__ == '__main__':
    # Extra arguments are ignored; only a missing path is an error.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} LOGS_PATH')
    analyze(argv[1])
