From 194f41f2f3826e1f37ecaa5a39ae41708808ad6f Mon Sep 17 00:00:00 2001
From: Kévin Le Gouguec
Date: Sat, 20 Mar 2021 19:33:24 +0100
Subject: Move stats script

---
 admin/stats.py      | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++
 admin/stats/util.py | 106 --------------------------------------------------
 2 files changed, 106 insertions(+), 106 deletions(-)
 create mode 100644 admin/stats.py
 delete mode 100644 admin/stats/util.py

diff --git a/admin/stats.py b/admin/stats.py
new file mode 100644
index 0000000..e79784c
--- /dev/null
+++ b/admin/stats.py
@@ -0,0 +1,106 @@
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import re
+from sys import argv
+
+import user_agents
+
+
+ACCESS_RE = re.compile(' '.join((
+    r'(?P<address>\S+)',
+    r'\S+',
+    r'\S+',
+    r'(?P<date>\[\S+ \S+\])',
+    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+    r'200 [0-9]+',
+    r'"(?P<referer>[^"]+)"',
+    r'"(?P<useragent>[^"]+)"'
+)))
+
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+VISIT_MAX_DURATION = timedelta(hours=1)
+
+
+@dataclass
+class Access:
+    address: str
+    useragent: str
+    referer: str
+    time: datetime
+    resource: str
+
+    @classmethod
+    def from_log(cls, info):
+        return cls(
+            info['address'], user_agents.parse(info['useragent']),
+            info['referer'], datetime.strptime(info['date'], DATE_FMT),
+            info['resource']
+        )
+
+def interesting(resource):
+    return (
+        resource.endswith('.html')
+        or resource == '/'
+    )
+
+def parse(logs_path):
+    with open(logs_path) as logs_file:
+        logs = logs_file.read().splitlines()
+
+    matches = (ACCESS_RE.match(l) for l in logs)
+    return tuple(
+        Access.from_log(m) for m in matches
+        if m is not None and interesting(m['resource'])
+    )
+
+def key(access):
+    return f'{access.address} / {access.useragent}'
+
+def visits(accesses):
+    # Map (IP, user agent) to list of visits. A visit is a list of
+    # accesses. When processing an access, if the previous time for
+    # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
+    # ago, we aggregate it, otherwise, we start a new visit.
+    visits = defaultdict(list)
+
+    for access in accesses:
+        visitor = key(access)
+
+        if visitor in visits:
+            last_access = visits[visitor][-1][-1].time
+
+            if access.time - last_access < VISIT_MAX_DURATION:
+                visits[visitor][-1].append(access)
+                continue
+
+        visits[visitor].append([access])
+
+    return visits
+
+def order(grouped_visits):
+    # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
+    visits = {}
+
+    for i, i_visits in grouped_visits.items():
+        for v in i_visits:
+            visits[(i, v[0].time)] = v
+
+    return visits
+
+def analyze(logs_path):
+    accesses = parse(logs_path)
+    visits_by_visitor = visits(accesses)
+    visits_by_time = order(visits_by_visitor)
+
+    print('Visiteurs :', len(visits_by_visitor))
+    print('Visites :', len(visits_by_time))
+
+    pagehits = Counter(a.resource for a in accesses)
+    for page, hits in pagehits.most_common():
+        print(hits, page)
+
+
+if __name__ == '__main__':
+    analyze(argv[1])
diff --git a/admin/stats/util.py b/admin/stats/util.py
deleted file mode 100644
index e79784c..0000000
--- a/admin/stats/util.py
+++ /dev/null
@@ -1,106 +0,0 @@
-from collections import Counter, defaultdict
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-import re
-from sys import argv
-
-import user_agents
-
-
-ACCESS_RE = re.compile(' '.join((
-    r'(?P<address>\S+)',
-    r'\S+',
-    r'\S+',
-    r'(?P<date>\[\S+ \S+\])',
-    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
-    r'200 [0-9]+',
-    r'"(?P<referer>[^"]+)"',
-    r'"(?P<useragent>[^"]+)"'
-)))
-
-DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
-
-VISIT_MAX_DURATION = timedelta(hours=1)
-
-
-@dataclass
-class Access:
-    address: str
-    useragent: str
-    referer: str
-    time: datetime
-    resource: str
-
-    @classmethod
-    def from_log(cls, info):
-        return cls(
-            info['address'], user_agents.parse(info['useragent']),
-            info['referer'], datetime.strptime(info['date'], DATE_FMT),
-            info['resource']
-        )
-
-def interesting(resource):
-    return (
-        resource.endswith('.html')
-        or resource == '/'
-    )
-
-def parse(logs_path):
-    with open(logs_path) as logs_file:
-        logs = logs_file.read().splitlines()
-
-    matches = (ACCESS_RE.match(l) for l in logs)
-    return tuple(
-        Access.from_log(m) for m in matches
-        if m is not None and interesting(m['resource'])
-    )
-
-def key(access):
-    return f'{access.address} / {access.useragent}'
-
-def visits(accesses):
-    # Map (IP, user agent) to list of visits. A visit is a list of
-    # accesses. When processing an access, if the previous time for
-    # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
-    # ago, we aggregate it, otherwise, we start a new visit.
-    visits = defaultdict(list)
-
-    for access in accesses:
-        visitor = key(access)
-
-        if visitor in visits:
-            last_access = visits[visitor][-1][-1].time
-
-            if access.time - last_access < VISIT_MAX_DURATION:
-                visits[visitor][-1].append(access)
-                continue
-
-        visits[visitor].append([access])
-
-    return visits
-
-def order(grouped_visits):
-    # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
-    visits = {}
-
-    for i, i_visits in grouped_visits.items():
-        for v in i_visits:
-            visits[(i, v[0].time)] = v
-
-    return visits
-
-def analyze(logs_path):
-    accesses = parse(logs_path)
-    visits_by_visitor = visits(accesses)
-    visits_by_time = order(visits_by_visitor)
-
-    print('Visiteurs :', len(visits_by_visitor))
-    print('Visites :', len(visits_by_time))
-
-    pagehits = Counter(a.resource for a in accesses)
-    for page, hits in pagehits.most_common():
-        print(hits, page)
-
-
-if __name__ == '__main__':
-    analyze(argv[1])
--
cgit v1.2.3