From ecae0f066e83b596bc3a590baa1968fa1666ad19 Mon Sep 17 00:00:00 2001
From: Kévin Le Gouguec
Date: Sat, 27 Mar 2021 21:06:11 +0100
Subject: Add new script to process multiple log files

---
 admin/stats.py            | 153 ------------------------------------
 admin/stats/dump.py       | 191 ++++++++++++++++++++++++++++++++++++++++++++++
 admin/stats/old.py        | 153 +++++++++++++++++++++++++++++++++++++
 admin/stats/renamelogs.py |  35 +++++++++
 4 files changed, 379 insertions(+), 153 deletions(-)
 delete mode 100755 admin/stats.py
 create mode 100755 admin/stats/dump.py
 create mode 100755 admin/stats/old.py
 create mode 100755 admin/stats/renamelogs.py
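Notes, not part of the commit message:

All three scripts are built around the same access-log regex, which expects
the usual "combined" log format, restricted to GET requests that returned
200. For illustration, here is what ACCESS_RE pulls out of a log line. The
line below is made up (hypothetical IP, response size and user agent), but
it has the shape the regex expects:

    import re

    # Same ACCESS_RE as in dump.py and old.py (modulo the optional
    # query-string group after the referrer in dump.py).
    ACCESS_RE = re.compile(' '.join((
        r'(?P<address>\S+)',
        r'\S+',
        r'\S+',
        r'(?P<date>\[\S+ \S+\])',
        r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
        r'200 [0-9]+',
        r'"(?P<referer>[^"]+)"',
        r'"(?P<useragent>[^"]+)"'
    )))

    # Hypothetical log line in the "combined" format:
    line = ('203.0.113.7 - - [27/Mar/2021:21:06:11 +0100] '
            '"GET /index.html HTTP/1.1" 200 5124 '
            '"https://www.quatuorbellefeuille.fr/" "Mozilla/5.0"')

    m = ACCESS_RE.match(line)
    print(m['address'])    # 203.0.113.7
    print(m['date'])       # [27/Mar/2021:21:06:11 +0100]
    print(m['resource'])   # /index.html
    print(m['referer'])    # https://www.quatuorbellefeuille.fr/
    print(m['useragent'])  # Mozilla/5.0

Going by the argv handling, the intended workflow is to run renamelogs.py
over the raw log files first (each file is renamed after the date of its
first logged access), then dump.py with any number of log files followed by
an output directory for the CSV dumps.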
diff --git a/admin/stats.py b/admin/stats.py
deleted file mode 100755
index e3d46cc..0000000
--- a/admin/stats.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python3
-
-from collections import Counter, defaultdict
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-import re
-from os import remove
-from subprocess import run
-from sys import argv
-
-import user_agents
-
-
-ACCESS_RE = re.compile(' '.join((
-    r'(?P<address>\S+)',
-    r'\S+',
-    r'\S+',
-    r'(?P<date>\[\S+ \S+\])',
-    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
-    r'200 [0-9]+',
-    r'"(?P<referer>[^"]+)"',
-    r'"(?P<useragent>[^"]+)"'
-)))
-
-DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
-
-VISIT_MAX_DURATION = timedelta(hours=1)
-
-
-@dataclass
-class Access:
-    address: str
-    useragent: str
-    referer: str
-    time: datetime
-    resource: str
-
-    @classmethod
-    def from_log(cls, info):
-        resource = info['resource']
-        if resource == '/':
-            resource = '/index.html'
-
-        referer = re.sub(
-            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
-            r'\1',
-            info['referer']
-        )
-        if referer == '/':
-            referer = '/index.html'
-
-        return cls(
-            info['address'], user_agents.parse(info['useragent']),
-            referer, datetime.strptime(info['date'], DATE_FMT),
-            resource
-        )
-
-def interesting(resource):
-    return (
-        resource.endswith('.html')
-        or resource == '/'
-    )
-
-def parse(logs_path):
-    with open(logs_path) as logs_file:
-        logs = logs_file.read().splitlines()
-
-    matches = (ACCESS_RE.match(l) for l in logs)
-    return tuple(
-        Access.from_log(m) for m in matches
-        if (m is not None
-            and interesting(m['resource'])
-            and 'klg.uber.space' not in m['referer'])
-    )
-
-def key(access):
-    return f'{access.address} / {access.useragent}'
-
-def visits(accesses):
-    # Map (IP, user agent) to list of visits. A visit is a list of
-    # accesses. When processing an access, if the previous time for
-    # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
-    # ago, we aggregate it, otherwise, we start a new visit.
-    visits = defaultdict(list)
-
-    for access in accesses:
-        visitor = key(access)
-
-        if visitor in visits:
-            last_access = visits[visitor][-1][-1].time
-
-            if access.time - last_access < VISIT_MAX_DURATION:
-                visits[visitor][-1].append(access)
-                continue
-
-        visits[visitor].append([access])
-
-    return visits
-
-def order(grouped_visits):
-    # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
-    visits = {}
-
-    for i, i_visits in grouped_visits.items():
-        for v in i_visits:
-            visits[(i, v[0].time)] = v
-
-    return visits
-
-def visit_graph(accesses):
-    edges = (f' "{a.referer}" -> "{a.resource}";'
-             for a in accesses)
-    return '\n'.join((f'digraph visit {{', *edges, '}'))
-
-def graph(visits):
-    date = visits[0][0].time.strftime('%F')
-
-    tempfiles = {
-        f'{date}-{i}.pdf': visit for i, visit in enumerate(visits)
-    }
-
-    for tempfile, visit in tempfiles.items():
-        vgraph = visit_graph(visit)
-
-        with open(tempfile, 'wb') as vfile:
-            vfile.write(
-                run(('dot', '-Tpdf'), text=False, check=True,
-                    capture_output=True, input=vgraph.encode())
-                .stdout
-            )
-
-    run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
-        check=True)
-
-    for f in tempfiles:
-        remove(f)
-
-def analyze(logs_path):
-    accesses = parse(logs_path)
-    visits_by_visitor = visits(accesses)
-    visits_by_time = order(visits_by_visitor)
-
-    print('Visiteurs :', len(visits_by_visitor))
-    print('Visites :', len(visits_by_time))
-
-    pagehits = Counter(a.resource for a in accesses)
-    for page, hits in pagehits.most_common():
-        print(hits, page)
-
-    graph(tuple(visits_by_time.values()))
-
-if __name__ == '__main__':
-    analyze(argv[1])
diff --git a/admin/stats/dump.py b/admin/stats/dump.py
new file mode 100755
index 0000000..7a54acc
--- /dev/null
+++ b/admin/stats/dump.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+
+from collections import Counter, defaultdict
+import csv
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from pathlib import Path
+import re
+from sys import argv
+from urllib.parse import urlparse
+from typing import Dict, List, Tuple
+from warnings import warn
+
+import user_agents
+
+
+ACCESS_RE = re.compile(' '.join((
+    r'(?P<address>\S+)',
+    r'\S+',
+    r'\S+',
+    r'(?P<date>\[\S+ \S+\])',
+    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+    r'200 [0-9]+',
+    r'"(?P<referer>[^"]+)(\?\S+)?"',
+    r'"(?P<useragent>[^"]+)"'
+)))
+
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+VISIT_MAX_DURATION = timedelta(hours=1)
+
+DOMAINS = {
+    'quatuorbellefeuille.com',
+    'quatuorbellefeuille.fr',
+    'klg.uber.space'
+}
+
+
+def normalize_path(p):
+    if p == '/':
+        return '/index.html'
+    return p
+
+
+@dataclass
+class Access:
+    address: str
+    useragent: str
+    referrer: str
+    time: datetime
+    resource: str
+
+    @classmethod
+    def from_log(cls, info):
+        resource = normalize_path(info['resource'])
+
+        referrer = urlparse(info['referer'])
+        if referrer.netloc in DOMAINS:
+            ref = normalize_path(referrer.path)
+        else:
+            ref = referrer.netloc
+
+        return cls(
+            info['address'], info['useragent'], ref,
+            datetime.strptime(info['date'], DATE_FMT), resource
+        )
+
+
+def interesting(resource):
+    return resource.endswith('.html') or resource == '/'
+
+
+def parse(logs_paths):
+    logs = []
+    for lp in logs_paths:
+        with open(lp) as logs_file:
+            logs += logs_file.read().splitlines()
+
+    matches = (ACCESS_RE.match(l) for l in logs)
+    return tuple(
+        Access.from_log(m) for m in matches
+        if (m is not None and interesting(m['resource']))
+    )
+
+
+Visit = List[Access]
+
+
+@dataclass
+class Visitor:
+    address: str
+    useragent: str
+    referrers: List[str]
+    visits: List[Visit]
+
+
+def useragent_kind(ua_string):
+    ua = user_agents.parse(ua_string)
+    if ua.is_pc:
+        return 'pc'
+    if ua.is_mobile:
+        return 'mobile'
+    if ua.is_tablet:
+        return 'tablet'
+    if ua.is_bot:
+        return 'bot'
+    warn(f'Unknown user agent kind: {ua_string}')
+    return 'n/a'
+
+
+def sort_visits(accesses):
+    visitors: Dict[Tuple[str, str], Visitor] = {}
+
+    for a in accesses:
+        key = (a.address, a.useragent)
+
+        visitor = visitors.get(key)
+        if visitor is None:
+            visitor = Visitor(
+                a.address,
+                useragent_kind(a.useragent),
+                [a.referrer],
+                [[a]]
+            )
+            visitors[key] = visitor
+            continue
+
+        last_visit = visitor.visits[-1]
+        last_access = last_visit[-1].time
+        if a.time - last_access < VISIT_MAX_DURATION:
+            last_visit.append(a)
+            continue
+
+        visitor.visits.append([a])
+
+    return visitors
+
+
+def find_days(visits):
+    return {
+        v[0].time.replace(hour=0, minute=0, second=0)
+        for v in visits
+    }
+
+
+def daily_visitors(visitors, output_path):
+    days: Dict[datetime, Counter] = defaultdict(Counter)
+    columns = ('mobile', 'tablet', 'pc', 'bot', 'n/a')
+
+    print('Visitors:')
+
+    for v in visitors.values():
+        for day in find_days(v.visits):
+            days[day][v.useragent] += 1
+
+    with open(output_path, 'w') as f:
+        out = csv.writer(f)
+        out.writerow(('day', 'total', *columns))
+        print('day', 'total', *columns, sep='\t')
+
+        for day in sorted(days):
+            counter = days[day]
+            counters = tuple(counter[c] for c in columns)
+            values = (day.strftime('%F'), sum(counters), *counters)
+
+            out.writerow(values)
+            print(*values, sep='\t')
+
+
+def daily_stats(visitors, output_dir):
+    output_dir = Path(output_dir)
+    daily_visitors(visitors, output_dir.joinpath('dailyvisitors.csv'))
+    # daily_visits(visitors, output_dir.joinpath('dailyvisits.csv'))
+    # daily_pages_per_visit(visitors, output_dir.joinpath('dailypagespervisit.csv'))
+    # daily_page_hits(visitors, output_dir.joinpath('dailypagehits.csv'))
+    # daily_referrers(visitors, output_dir.joinpath('dailyreferrers.csv'))
+
+
+def global_stats(visitors, output_dir):
+    pass
+
+
+def main(logs_paths, output_dir):
+    accesses = parse(logs_paths)
+    visitors = sort_visits(accesses)
+    daily_stats(visitors, output_dir)
+    global_stats(visitors, output_dir)
+
+
+if __name__ == '__main__':
+    main(argv[1:-1], argv[-1])
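The visit-splitting rule in sort_visits() above is the same one the old
script used: successive hits from a given (address, user agent) pair belong
to the same visit as long as each one comes less than VISIT_MAX_DURATION
after the previous one. A minimal sketch of just that rule, with made-up
timestamps (the grouping below is illustrative, not taken from real logs):

    from datetime import datetime, timedelta

    VISIT_MAX_DURATION = timedelta(hours=1)

    # Three hypothetical hits from one (address, user agent) pair.
    times = [
        datetime(2021, 3, 27, 10, 0),
        datetime(2021, 3, 27, 10, 40),  # 40 min after the first hit
        datetime(2021, 3, 27, 12, 0),   # 80 min after the second hit
    ]

    visits = [[times[0]]]
    for t in times[1:]:
        if t - visits[-1][-1] < VISIT_MAX_DURATION:
            visits[-1].append(t)   # same visit: gap under one hour
        else:
            visits.append([t])     # new visit: gap of an hour or more

    print(len(visits))  # 2

With the one-hour threshold, the second hit joins the first visit and the
third one opens a new visit.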
diff --git a/admin/stats/old.py b/admin/stats/old.py
new file mode 100755
index 0000000..e3d46cc
--- /dev/null
+++ b/admin/stats/old.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import re
+from os import remove
+from subprocess import run
+from sys import argv
+
+import user_agents
+
+
+ACCESS_RE = re.compile(' '.join((
+    r'(?P<address>\S+)',
+    r'\S+',
+    r'\S+',
+    r'(?P<date>\[\S+ \S+\])',
+    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
+    r'200 [0-9]+',
+    r'"(?P<referer>[^"]+)"',
+    r'"(?P<useragent>[^"]+)"'
+)))
+
+DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
+
+VISIT_MAX_DURATION = timedelta(hours=1)
+
+
+@dataclass
+class Access:
+    address: str
+    useragent: str
+    referer: str
+    time: datetime
+    resource: str
+
+    @classmethod
+    def from_log(cls, info):
+        resource = info['resource']
+        if resource == '/':
+            resource = '/index.html'
+
+        referer = re.sub(
+            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
+            r'\1',
+            info['referer']
+        )
+        if referer == '/':
+            referer = '/index.html'
+
+        return cls(
+            info['address'], user_agents.parse(info['useragent']),
+            referer, datetime.strptime(info['date'], DATE_FMT),
+            resource
+        )
+
+def interesting(resource):
+    return (
+        resource.endswith('.html')
+        or resource == '/'
+    )
+
+def parse(logs_path):
+    with open(logs_path) as logs_file:
+        logs = logs_file.read().splitlines()
+
+    matches = (ACCESS_RE.match(l) for l in logs)
+    return tuple(
+        Access.from_log(m) for m in matches
+        if (m is not None
+            and interesting(m['resource'])
+            and 'klg.uber.space' not in m['referer'])
+    )
+
+def key(access):
+    return f'{access.address} / {access.useragent}'
+
+def visits(accesses):
+    # Map (IP, user agent) to list of visits. A visit is a list of
+    # accesses. When processing an access, if the previous time for
+    # this (IP, user agent) is less than VISIT_MAX_DURATION seconds
+    # ago, we aggregate it, otherwise, we start a new visit.
+    visits = defaultdict(list)
+
+    for access in accesses:
+        visitor = key(access)
+
+        if visitor in visits:
+            last_access = visits[visitor][-1][-1].time
+
+            if access.time - last_access < VISIT_MAX_DURATION:
+                visits[visitor][-1].append(access)
+                continue
+
+        visits[visitor].append([access])
+
+    return visits
+
+def order(grouped_visits):
+    # Flatten { (IP, UA) ↦ [visits] } to { (IP, UA, t0) ↦ accesses }.
+    visits = {}
+
+    for i, i_visits in grouped_visits.items():
+        for v in i_visits:
+            visits[(i, v[0].time)] = v
+
+    return visits
+
+def visit_graph(accesses):
+    edges = (f' "{a.referer}" -> "{a.resource}";'
+             for a in accesses)
+    return '\n'.join((f'digraph visit {{', *edges, '}'))
+
+def graph(visits):
+    date = visits[0][0].time.strftime('%F')
+
+    tempfiles = {
+        f'{date}-{i}.pdf': visit for i, visit in enumerate(visits)
+    }
+
+    for tempfile, visit in tempfiles.items():
+        vgraph = visit_graph(visit)
+
+        with open(tempfile, 'wb') as vfile:
+            vfile.write(
+                run(('dot', '-Tpdf'), text=False, check=True,
+                    capture_output=True, input=vgraph.encode())
+                .stdout
+            )
+
+    run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
+        check=True)
+
+    for f in tempfiles:
+        remove(f)
+
+def analyze(logs_path):
+    accesses = parse(logs_path)
+    visits_by_visitor = visits(accesses)
+    visits_by_time = order(visits_by_visitor)
+
+    print('Visiteurs :', len(visits_by_visitor))
+    print('Visites :', len(visits_by_time))
+
+    pagehits = Counter(a.resource for a in accesses)
+    for page, hits in pagehits.most_common():
+        print(hits, page)
+
+    graph(tuple(visits_by_time.values()))
+
+if __name__ == '__main__':
+    analyze(argv[1])
diff --git a/admin/stats/renamelogs.py b/admin/stats/renamelogs.py
new file mode 100755
index 0000000..34fbccf
--- /dev/null
+++ b/admin/stats/renamelogs.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+from datetime import datetime
+from os import rename
+from pathlib import Path
+import re
+from sys import argv
+
+
+ACCESS_RE = re.compile(' '.join((
+    r'\S+',
+    r'\S+',
+    r'\S+',
+    r'\[(?P<date>[^:]+):\S+ \S+\]',
+    r'"GET [^ ?]+(\?\S+)? [^"]+"',
+    r'200 [0-9]+',
+    r'"[^"]+(\?\S+)?"',
+    r'"[^"]+"'
+)))
+
+DATE_FMT = '%d/%b/%Y'
+
+
+def main(paths):
+    for p in paths:
+        with open(p) as f:
+            date = ACCESS_RE.search(f.read()).group('date')
+
+        date = datetime.strptime(date, DATE_FMT)
+        new_path = Path(p).with_name(date.strftime('%F'))
+        rename(p, new_path)
+
+
+if __name__ == '__main__':
+    main(argv[1:])
--
cgit v1.2.3