old.py (3926B)
#!/usr/bin/env python3
"""Analyze a web-server access log: count visitors, visits and page hits,
and render each visit as a Graphviz digraph merged into one PDF per day.

Usage: old.py ACCESS_LOG

External tools required at runtime: `dot` (Graphviz) and `qpdf`.
"""

from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
import re
from os import remove
from subprocess import run
from sys import argv

import user_agents


# Matches a combined-log-format line, but only for successful (HTTP 200)
# GET requests; the query string, if any, is split off the resource path.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)"',
    r'"(?P<useragent>[^"]+)"'
)))

# strptime format for the bracketed log timestamp, e.g. [10/Oct/2023:13:55:36 +0200].
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Two accesses by the same visitor further apart than this start a new visit.
VISIT_MAX_DURATION = timedelta(hours=1)


@dataclass
class Access:
    """One successful GET request extracted from the access log."""

    address: str        # client IP address
    useragent: str      # normalized (parsed) user-agent string
    referer: str        # referer, site-internal URLs reduced to their path
    time: datetime      # timezone-aware request time
    resource: str       # requested path, '/' canonicalized to '/index.html'

    @classmethod
    def from_log(cls, info):
        """Build an Access from a regex match (any mapping with the named groups).

        Canonicalizes '/' to '/index.html' for both resource and referer, and
        strips the site's own origin from internal referers, keeping the path.
        """
        resource = info['resource']
        if resource == '/':
            resource = '/index.html'

        referer = re.sub(
            r'https://(?:www\.)?quatuorbellefeuille\.(?:fr|com)(/[^?]*)(?:\?.+)?',
            r'\1',
            info['referer']
        )
        if referer == '/':
            referer = '/index.html'

        return cls(
            info['address'],
            # The field is annotated str, so store the parsed user agent's
            # readable string form rather than the UserAgent object itself
            # (it was only ever consumed through an f-string in key()).
            str(user_agents.parse(info['useragent'])),
            referer,
            datetime.strptime(info['date'], DATE_FMT),
            resource
        )


def interesting(resource):
    """Keep only page views (HTML documents), not assets like CSS or images."""
    return (
        resource.endswith('.html')
        or resource == '/'
    )


def parse(logs_path):
    """Parse the log file and return the interesting accesses as a tuple.

    Skips lines that do not match ACCESS_RE, non-HTML resources, and
    self-referrals from the staging host.
    """
    # Pin the encoding: access logs are ASCII/UTF-8, and the platform default
    # encoding is locale-dependent.
    with open(logs_path, encoding='utf-8') as logs_file:
        logs = logs_file.read().splitlines()

    matches = (ACCESS_RE.match(line) for line in logs)
    return tuple(
        Access.from_log(m) for m in matches
        if (m is not None
            and interesting(m['resource'])
            and 'klg.uber.space' not in m['referer'])
    )


def key(access):
    """Identify a visitor by the combination of IP address and user agent."""
    return f'{access.address} / {access.useragent}'


def visits(accesses):
    """Group accesses into visits per visitor.

    Returns a dict mapping the visitor key to a list of visits; a visit is a
    chronological list of accesses.  An access extends the visitor's current
    visit when it happens within VISIT_MAX_DURATION of that visit's last
    access, and starts a new visit otherwise.

    NOTE(review): assumes `accesses` is in chronological (log) order — verify
    if the log source ever interleaves rotated files.
    """
    grouped = defaultdict(list)

    for access in accesses:
        visitor = key(access)

        if visitor in grouped:
            last_access = grouped[visitor][-1][-1].time

            if access.time - last_access < VISIT_MAX_DURATION:
                grouped[visitor][-1].append(access)
                continue

        grouped[visitor].append([access])

    return grouped


def order(grouped_visits):
    """Flatten {visitor: [visits]} into {(visitor, start_time): accesses}.

    NOTE(review): despite the name, the result keeps dict insertion order
    (grouped by visitor), it is not sorted by start time.
    """
    flattened = {}

    for visitor, visitor_visits in grouped_visits.items():
        for visit in visitor_visits:
            flattened[(visitor, visit[0].time)] = visit

    return flattened


def visit_graph(accesses):
    """Render one visit as Graphviz source: one referer -> resource edge per access."""
    edges = (f'  "{a.referer}" -> "{a.resource}";'
             for a in accesses)
    return '\n'.join(('digraph visit {', *edges, '}'))


def graph(visits):
    """Render each visit to a PDF with `dot` and merge them with `qpdf`.

    Writes <date>.pdf, where <date> is the day of the first visit's first
    access; the per-visit temporary PDFs are removed afterwards.
    """
    if not visits:
        # Nothing to draw; also avoids IndexError below and an empty qpdf call.
        return

    # '%Y-%m-%d' is the portable spelling of '%F' (a glibc extension).
    date = visits[0][0].time.strftime('%Y-%m-%d')

    tempfiles = {
        f'{date}-{i}.pdf': visit for i, visit in enumerate(visits)
    }

    for tempfile, visit in tempfiles.items():
        vgraph = visit_graph(visit)

        with open(tempfile, 'wb') as vfile:
            vfile.write(
                run(('dot', '-Tpdf'), text=False, check=True,
                    capture_output=True, input=vgraph.encode())
                .stdout
            )

    run(('qpdf', '--empty', '--pages', *tempfiles, '--', f'{date}.pdf'),
        check=True)

    for f in tempfiles:
        remove(f)


def analyze(logs_path):
    """Print visitor/visit counts and per-page hit counts, then build the graphs."""
    accesses = parse(logs_path)
    visits_by_visitor = visits(accesses)
    visits_by_time = order(visits_by_visitor)

    print('Visiteurs :', len(visits_by_visitor))
    print('Visites :', len(visits_by_time))

    pagehits = Counter(a.resource for a in accesses)
    for page, hits in pagehits.most_common():
        print(hits, page)

    graph(tuple(visits_by_time.values()))


if __name__ == '__main__':
    analyze(argv[1])