Diffstat (limited to 'admin/stats')
-rwxr-xr-x  admin/stats/dump.py         191
-rwxr-xr-x  admin/stats/old.py          153
-rwxr-xr-x  admin/stats/renamelogs.py    35
3 files changed, 379 insertions, 0 deletions
diff --git a/admin/stats/dump.py b/admin/stats/dump.py
new file mode 100755
index 0000000..7a54acc
--- /dev/null
+++ b/admin/stats/dump.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+
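+# Parse web server access logs and write daily visitor statistics as CSV
+# files into the given output directory.
+# Usage: dump.py LOG... OUTPUT_DIR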
+from collections import Counter, defaultdict
+import csv
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from pathlib import Path
+import re
+from sys import argv
+from urllib.parse import urlparse
+from typing import Dict, List, Tuple
+from warnings import warn
+
+import user_agents
+
+
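+# Matches successful GET requests in the combined log format, capturing the
+# client address, timestamp, requested resource, referer and user agent.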
+ACCESS_RE = re.compile(' '.join((
+ r'(?P<address>\S+)',
+ r'\S+',
+ r'\S+',
+ r'(?P<date>\[\S+ \S+\])',
+ r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+ r'200 [0-9]+',
+    r'"(?P<referer>[^"]+)"',
+ r'"(?P<useragent>[^"]+)"'
+)))
+
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
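+# Accesses from the same visitor that are further apart than this start a
+# new visit.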
+VISIT_MAX_DURATION = timedelta(hours=1)
+
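+# Referrers pointing at one of these hosts are treated as internal and
+# reduced to their path.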
+DOMAINS = {
+ 'quatuorbellefeuille.com',
+ 'quatuorbellefeuille.fr',
+ 'klg.uber.space'
+}
+
+
+def normalize_path(p):
+ if p == '/':
+ return '/index.html'
+ return p
+
+
+@dataclass
+class Access:
+ address: str
+ useragent: str
+ referrer: str
+ time: datetime
+ resource: str
+
+ @classmethod
+ def from_log(cls, info):
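+        # Build an Access from a regex match: normalize the resource path,
+        # keep only the path of internal referrers (see DOMAINS) and only
+        # the host name of external ones.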
+ resource = normalize_path(info['resource'])
+
+ referrer = urlparse(info['referer'])
+ if referrer.netloc in DOMAINS:
+ ref = normalize_path(referrer.path)
+ else:
+ ref = referrer.netloc
+
+ return cls(
+ info['address'], info['useragent'], ref,
+ datetime.strptime(info['date'], DATE_FMT), resource
+ )
+
+
+def interesting(resource):
+ return resource.endswith('.html') or resource == '/'
+
+
+def parse(logs_paths):
+ logs = []
+ for lp in logs_paths:
+ with open(lp) as logs_file:
+ logs += logs_file.read().splitlines()
+
+ matches = (ACCESS_RE.match(l) for l in logs)
+ return tuple(
+ Access.from_log(m) for m in matches
+ if (m is not None and interesting(m['resource']))
+ )
+
+
+Visit = List[Access]
+
+
+@dataclass
+class Visitor:
+ address: str
+ useragent: str
+ referrers: List[str]
+ visits: List[Visit]
+
+
+def useragent_kind(ua_string):
+ ua = user_agents.parse(ua_string)
+ if ua.is_pc:
+ return 'pc'
+ if ua.is_mobile:
+ return 'mobile'
+ if ua.is_tablet:
+ return 'tablet'
+ if ua.is_bot:
+ return 'bot'
+ warn(f'Unknown user agent kind: {ua_string}')
+ return 'n/a'
+
+
+def sort_visits(accesses):
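+    # Group accesses by (address, user agent).  Within a group, an access
+    # that follows the previous one by less than VISIT_MAX_DURATION joins
+    # the current visit; otherwise it starts a new visit.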
+    visitors: Dict[Tuple[str, str], Visitor] = {}
+
+ for a in accesses:
+ key = (a.address, a.useragent)
+
+ visitor = visitors.get(key)
+ if visitor is None:
+ visitor = Visitor(
+ a.address,
+ useragent_kind(a.useragent),
+                [a.referrer],
+ [[a]]
+ )
+ visitors[key] = visitor
+ continue
+
+ last_visit = visitor.visits[-1]
+ last_access = last_visit[-1].time
+ if a.time - last_access < VISIT_MAX_DURATION:
+ last_visit.append(a)
+ continue
+
+ visitor.visits.append([a])
+
+ return visitors
+
+
+def find_days(visits):
+ return {
+ v[0].time.replace(hour=0, minute=0, second=0)
+ for v in visits
+ }
+
+
+def daily_visitors(visitors, output_path):
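+    # Count, per day and user-agent kind, the visitors that made at least
+    # one visit that day; write the counts to a CSV file and echo them to
+    # stdout.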
+ days: Dict[datetime, Counter] = defaultdict(Counter)
+ columns = ('mobile', 'tablet', 'pc', 'bot', 'n/a')
+
+ print('Visitors:')
+
+ for v in visitors.values():
+ for day in find_days(v.visits):
+ days[day][v.useragent] += 1
+
+ with open(output_path, 'w') as f:
+ out = csv.writer(f)
+ out.writerow(('day', 'total', *columns))
+ print('day', 'total', *columns, sep='\t')
+
+ for day in sorted(days):
+ counter = days[day]
+ counters = tuple(counter[c] for c in columns)
+ values = (day.strftime('%F'), sum(counters), *counters)
+
+ out.writerow(values)
+ print(*values, sep='\t')
+
+
+def daily_stats(visitors, output_dir):
+ output_dir = Path(output_dir)
+ daily_visitors(visitors, output_dir.joinpath('dailyvisitors.csv'))
+ # daily_visits(visitors, output_dir.joinpath('dailyvisits.csv'))
+ # daily_pages_per_visit(visitors, output_dir.joinpath('dailypagespervisit.csv'))
+ # daily_page_hits(visitors, output_dir.joinpath('dailypagehits.csv'))
+ # daily_referrers(visitors, output_dir.joinpath('dailyreferrers.csv'))
+
+
+def global_stats(visitors, output_dir):
+ pass
+
+
+def main(logs_paths, output_dir):
+ accesses = parse(logs_paths)
+ visitors = sort_visits(accesses)
+ daily_stats(visitors, output_dir)
+ global_stats(visitors, output_dir)
+
+
+if __name__ == '__main__':
+ main(argv[1:-1], argv[-1])
diff --git a/admin/stats/old.py b/admin/stats/old.py
new file mode 100755
index 0000000..e3d46cc
--- /dev/null
+++ b/admin/stats/old.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
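+# Earlier one-shot analysis: print visitor and visit counts and page hits
+# for a single log file, and render each visit as a PDF graph.
+# Usage: old.py LOG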
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import re
+from os import remove
+from subprocess import run
+from sys import argv
+
+import user_agents
+
+
+ACCESS_RE = re.compile(' '.join((
+ r'(?P<address>\S+)',
+ r'\S+',
+ r'\S+',
+ r'(?P<date>\[\S+ \S+\])',
+ r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+ r'200 [0-9]+',
+ r'"(?P<referer>[^"]+)"',
+ r'"(?P<useragent>[^"]+)"'
+)))
+
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+VISIT_MAX_DURATION = timedelta(hours=1)
+
+
+@dataclass
+class Access:
+ address: str
+ useragent: str
+ referer: str
+ time: datetime
+ resource: str
+
+ @classmethod
+ def from_log(cls, info):
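+        # Normalize the resource path and strip the site's own scheme and
+        # host from the referer, so internal referrers become plain paths.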
+ resource = info['resource']
+ if resource == '/':
+ resource = '/index.html'
+
+ referer = re.sub(
+ r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
+ r'\1',
+ info['referer']
+ )
+ if referer == '/':
+ referer = '/index.html'
+
+ return cls(
+ info['address'], user_agents.parse(info['useragent']),
+ referer, datetime.strptime(info['date'], DATE_FMT),
+ resource
+ )
+
+def interesting(resource):
+ return (
+ resource.endswith('.html')
+ or resource == '/'
+ )
+
+def parse(logs_path):
+ with open(logs_path) as logs_file:
+ logs = logs_file.read().splitlines()
+
+ matches = (ACCESS_RE.match(l) for l in logs)
+ return tuple(
+ Access.from_log(m) for m in matches
+ if (m is not None
+ and interesting(m['resource'])
+ and 'klg.uber.space' not in m['referer'])
+ )
+
+def key(access):
+ return f'{access.address} / {access.useragent}'
+
+def visits(accesses):
+    # Map (IP, user agent) to a list of visits. A visit is a list of
+    # accesses. When processing an access, if the previous access for
+    # this (IP, user agent) happened less than VISIT_MAX_DURATION ago,
+    # we append it to the current visit; otherwise we start a new visit.
+ visits = defaultdict(list)
+
+ for access in accesses:
+ visitor = key(access)
+
+ if visitor in visits:
+ last_access = visits[visitor][-1][-1].time
+
+ if access.time - last_access < VISIT_MAX_DURATION:
+ visits[visitor][-1].append(access)
+ continue
+
+ visits[visitor].append([access])
+
+ return visits
+
+def order(grouped_visits):
+ # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
+ visits = {}
+
+ for i, i_visits in grouped_visits.items():
+ for v in i_visits:
+ visits[(i, v[0].time)] = v
+
+ return visits
+
+def visit_graph(accesses):
+ edges = (f' "{a.referer}" -> "{a.resource}";'
+ for a in accesses)
+ return '\n'.join((f'digraph visit {{', *edges, '}'))
+
+def graph(visits):
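+    # Render each visit as a Graphviz referer -> resource graph with dot,
+    # then concatenate the per-visit PDFs with qpdf into a single file
+    # named after the date of the first visit.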
+ date = visits[0][0].time.strftime('%F')
+
+ tempfiles = {
+ f'{date}-{i}.pdf': visit for i, visit in enumerate(visits)
+ }
+
+ for tempfile, visit in tempfiles.items():
+ vgraph = visit_graph(visit)
+
+ with open(tempfile, 'wb') as vfile:
+ vfile.write(
+ run(('dot', '-Tpdf'), text=False, check=True,
+ capture_output=True, input=vgraph.encode())
+ .stdout
+ )
+
+ run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
+ check=True)
+
+ for f in tempfiles:
+ remove(f)
+
+def analyze(logs_path):
+ accesses = parse(logs_path)
+ visits_by_visitor = visits(accesses)
+ visits_by_time = order(visits_by_visitor)
+
+    print('Visitors:', len(visits_by_visitor))
+    print('Visits:', len(visits_by_time))
+
+ pagehits = Counter(a.resource for a in accesses)
+ for page, hits in pagehits.most_common():
+ print(hits, page)
+
+ graph(tuple(visits_by_time.values()))
+
+if __name__ == '__main__':
+ analyze(argv[1])
diff --git a/admin/stats/renamelogs.py b/admin/stats/renamelogs.py
new file mode 100755
index 0000000..34fbccf
--- /dev/null
+++ b/admin/stats/renamelogs.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
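+# Rename each access log file given on the command line to the date
+# (YYYY-MM-DD) of its first matching line.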
+from datetime import datetime
+from os import rename
+from pathlib import Path
+import re
+from sys import argv
+
+
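+# Matches successful GET access lines; only the day part of the timestamp
+# is captured.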
+ACCESS_RE = re.compile(' '.join((
+ r'\S+',
+ r'\S+',
+ r'\S+',
+ r'\[(?P<date>[^:]+):\S+ \S+\]',
+ r'"GET [^ ?]+(\?\S+)? [^"]+"',
+ r'200 [0-9]+',
+ r'"[^"]+(\?\S+)?"',
+ r'"[^"]+"'
+)))
+
+DATE_FMT = '%d/%b/%Y'
+
+
+def main(paths):
+ for p in paths:
+ with open(p) as f:
+ date = ACCESS_RE.search(f.read()).group('date')
+
+ date = datetime.strptime(date, DATE_FMT)
+ new_path = Path(p).with_name(date.strftime('%F'))
+ rename(p, new_path)
+
+
+if __name__ == '__main__':
+ main(argv[1:])