dump.py (11000B)
#!/usr/bin/env python3
"""Dump visit statistics extracted from web server access logs.

Usage: dump.py LOG [LOG...] OUTPUT_DIR

Only log lines matching ACCESS_RE (GET requests answered with status 200)
whose resource is an HTML page are considered.  Accesses are grouped into
visitors and visits, then several CSV files are written to OUTPUT_DIR and
summaries are printed on standard output.
"""

from collections import Counter, defaultdict
import csv
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from logging import warning
from pathlib import Path
import re
from statistics import mean, median, stdev
from sys import argv
from urllib.parse import unquote, urlparse
from typing import Dict, Iterable, List, Tuple

import user_agents


# One access log line; fields are separated by single spaces.
ACCESS_RE = re.compile(' '.join((
    r'(?P<address>\S+)',
    r'\S+',
    r'\S+',
    r'(?P<date>\[\S+ \S+\])',
    r'"GET (?P<resource>[^ ?]+)(\?\S+)? [^"]+"',
    r'200 [0-9]+',
    r'"(?P<referer>[^"]+)(\?\S+)?"',
    r'"(?P<useragent>[^"]+)"'
)))

DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'

# Portable spelling of '%F', which is a platform-specific strftime extension.
DAY_FMT = '%Y-%m-%d'

# Two accesses by the same visitor separated by at least this much time
# belong to different visits.
VISIT_MAX_DURATION = timedelta(hours=1)

# Hosts whose pages count as internal referrers.
DOMAINS = (
    'quatuorbellefeuille.com',
    'quatuorbellefeuille.fr',
    'klg.uber.space'
)

# Column order of the per-user-agent-kind CSV files.
KIND_COLUMNS = ('mobile', 'tablet', 'pc', 'bot', 'n/a')

# Column order of the pages-per-visit summaries.
SUMMARY_COLUMNS = ('min', 'max', 'med', 'avg', 'dev')


def normalize_path(p):
    """Return the percent-decoded path, mapping '' and '/' to '/index.html'."""
    if p in ('', '/'):
        return '/index.html'
    return unquote(p)


def _is_own_host(host):
    """Tell whether host is one of DOMAINS or a subdomain of one of them."""
    # Match on a label boundary so that e.g. 'notquatuorbellefeuille.com'
    # is not mistaken for the site itself.
    return any(
        host == domain or host.endswith('.' + domain)
        for domain in DOMAINS
    )


def _classify_referrer(raw):
    """Reduce a raw Referer header to the value stored in Access.referrer.

    Returns the normalized path for pages of DOMAINS, the network location
    for other sites, and 'n/a' when there is no usable referrer.
    """
    try:
        parsed = urlparse(raw)
        # hostname is lower-cased and stripped of port and credentials.
        host = parsed.hostname or ''
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); one bad header must not abort the whole run.
        return 'n/a'

    if _is_own_host(host):
        return normalize_path(parsed.path)
    if parsed.netloc:
        return parsed.netloc
    return 'n/a'


@dataclass
class Access:
    """A single page access extracted from a log line."""

    # Content of the 'address' group of ACCESS_RE.
    address: str
    # Raw user agent string.
    useragent: str
    # Internal path, external network location, or 'n/a'.
    referrer: str
    # Timestamp parsed with DATE_FMT.
    time: datetime
    # Normalized path of the requested page.
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from a match of ACCESS_RE (or a similar mapping).

        info must provide the 'address', 'useragent', 'referer', 'date' and
        'resource' keys.
        """
        return cls(
            info['address'],
            info['useragent'],
            _classify_referrer(info['referer']),
            datetime.strptime(info['date'], DATE_FMT),
            normalize_path(info['resource'])
        )


def interesting(resource):
    """Tell whether a raw requested resource is a page worth counting."""
    return resource.endswith('.html') or resource == '/'


def parse(logs_paths):
    """Return a tuple of the interesting accesses found in the given logs.

    Files are read in the order given; lines that do not match ACCESS_RE
    are ignored.
    """
    accesses = []

    for lp in logs_paths:
        with open(lp) as logs_file:
            for line in logs_file.read().splitlines():
                m = ACCESS_RE.match(line)
                if m is not None and interesting(m['resource']):
                    accesses.append(Access.from_log(m))

    return tuple(accesses)


Visit = List[Access]


class UserAgentKind(Enum):
    """Broad category of a visitor, derived from its user agent string."""

    PC = 'pc'
    MOBILE = 'mobile'
    TABLET = 'tablet'
    BOT = 'bot'
    NA = 'n/a'

    @classmethod
    def from_useragent(cls, ua_string):
        """Classify a raw user agent string; warn and return NA if unknown."""
        ua = user_agents.parse(ua_string)
        # is_bot is not mutually exclusive with other is_* predicates.
        if ua.is_bot:
            return cls.BOT
        if ua.is_pc:
            return cls.PC
        if ua.is_mobile:
            return cls.MOBILE
        if ua.is_tablet:
            return cls.TABLET
        warning(f'Unknown user agent kind: {ua_string}')
        return cls.NA

    def is_human(self):
        """Tell whether this kind is PC, MOBILE or TABLET."""
        return self in {
            UserAgentKind.PC,
            UserAgentKind.MOBILE,
            UserAgentKind.TABLET
        }


@dataclass
class Visitor:
    """All the visits made from one address with one user agent."""

    address: str
    useragent: UserAgentKind
    visits: List[Visit]


# Visitors indexed by (address, raw user agent string).
Visitors = Dict[Tuple[str, str], Visitor]


def sort_visits(accesses):
    """Group accesses into visitors, and each visitor's accesses into visits.

    Accesses are processed in the order given.  An access is appended to
    the visitor's latest visit when it happens less than VISIT_MAX_DURATION
    after the last access of that visit; otherwise it starts a new visit.

    Returns a dict mapping (address, raw user agent string) to Visitor.
    """
    visitors: Dict[Tuple[str, str], Visitor] = {}

    for a in accesses:
        key = (a.address, a.useragent)
        visitor = visitors.get(key)

        if visitor is None:
            visitors[key] = Visitor(
                a.address,
                UserAgentKind.from_useragent(a.useragent),
                [[a]]
            )
            continue

        last_visit = visitor.visits[-1]
        if a.time - last_visit[-1].time < VISIT_MAX_DURATION:
            last_visit.append(a)
        else:
            visitor.visits.append([a])

    return visitors


def datetime_day(dt):
    """Return dt truncated to the start of its day (same timezone)."""
    return dt.replace(hour=0, minute=0, second=0, microsecond=0)


def find_days(visits):
    """Return the set of days on which the given visits started."""
    return {datetime_day(v[0].time) for v in visits}


def find_pages(visitors):
    """Return the sorted list of pages accessed by human visitors."""
    return sorted({
        access.resource
        for v in visitors
        if v.useragent.is_human()
        for visit in v.visits
        for access in visit
    })


def external_referrer(ref):
    """Tell whether ref names another site (not 'n/a', not a local path)."""
    return ref != 'n/a' and not ref.startswith('/')


def simplify_referrer(ref):
    """Return the second-to-last dot-separated part of a network location.

    A location without any dot (e.g. 'localhost') is returned unchanged.
    """
    parts = ref.split('.')
    if len(parts) < 2:
        return ref
    # Remove leading parts (www., l., m.…) and extension (.com, .fr…).
    return parts[-2]


def find_referrers(visitors):
    """Return the sorted list of simplified external referrers."""
    return sorted({
        simplify_referrer(access.referrer)
        for v in visitors
        for visit in v.visits
        for access in visit
        if external_referrer(access.referrer)
    })


def _summarize(counts):
    """Return the (min, max, median, mean, stdev) of a sequence of numbers.

    The deviation is 0.0 when there is a single sample, since stdev()
    requires at least two; every statistic is zero when there is no sample.
    """
    if not counts:
        return (0, 0, 0, 0, 0.0)
    spread = stdev(counts) if len(counts) > 1 else 0.0
    return (min(counts), max(counts), median(counts), mean(counts), spread)


def _write_kind_counts(days, path):
    """Write and print one row of per-kind counters for each day of days."""
    # newline='' lets the csv module control line endings, as its
    # documentation requires.
    with open(path, 'w', newline='') as f:
        out = csv.writer(f)
        out.writerow(('day', 'total', *KIND_COLUMNS))
        print('day', 'total', *KIND_COLUMNS, sep='\t')

        for day in sorted(days):
            counter = days[day]
            counters = tuple(counter[c] for c in KIND_COLUMNS)
            values = (day.strftime(DAY_FMT), sum(counters), *counters)

            out.writerow(values)
            print(*values, sep='\t')


def daily_visitors(visitors, output_dir):
    """Count the visitors of each kind per day into 'dailyvisitors.csv'.

    A visitor is counted once for every day on which one of its visits
    started.  Returns the per-day counters, keyed by user agent kind value.
    """
    days: Dict[datetime, Counter] = defaultdict(Counter)

    print('Visitors:')

    for v in visitors.values():
        for day in find_days(v.visits):
            days[day][v.useragent.value] += 1

    _write_kind_counts(days, Path(output_dir, 'dailyvisitors.csv'))
    return days


def daily_visits(visitors, output_dir):
    """Count the visits of each kind per day into 'dailyvisits.csv'.

    A visit belongs to the day of its first access.  Returns the per-day
    counters, keyed by user agent kind value.
    """
    days: Dict[datetime, Counter] = defaultdict(Counter)

    print('Visits:')

    for v in visitors.values():
        for visit in v.visits:
            days[datetime_day(visit[0].time)][v.useragent.value] += 1

    _write_kind_counts(days, Path(output_dir, 'dailyvisits.csv'))
    return days


def daily_pages_per_visit(visitors, output_dir):
    """Summarize pages per human visit per day in 'dailypagespervisit.csv'.

    Returns a dict mapping each day to the list of page counts of the
    human visits started on that day.
    """
    days: Dict[datetime, list] = defaultdict(list)

    print('Pages/visit:')

    for v in visitors.values():
        if not v.useragent.is_human():
            continue

        for visit in v.visits:
            days[datetime_day(visit[0].time)].append(len(visit))

    path = Path(output_dir, 'dailypagespervisit.csv')
    with open(path, 'w', newline='') as f:
        out = csv.writer(f)
        out.writerow(('day', *SUMMARY_COLUMNS))
        print('day', *SUMMARY_COLUMNS, sep='\t')

        for day in sorted(days):
            values = (day.strftime(DAY_FMT), *_summarize(days[day]))

            out.writerow(values)
            # Only the average and the deviation are rounded for display.
            print(*values[:4], *(f'{v:.2f}' for v in values[4:]), sep='\t')

    return days


def daily_page_hits(visitors, output_dir):
    """Count hits per page per day by humans into 'dailypagehits.csv'.

    Also prints the five most visited pages of each day.  Returns the
    per-day counters, keyed by page.
    """
    days: Dict[datetime, Counter] = defaultdict(Counter)
    columns = find_pages(visitors.values())

    print('Page hits:')

    for v in visitors.values():
        if not v.useragent.is_human():
            continue

        for visit in v.visits:
            day = datetime_day(visit[0].time)
            for access in visit:
                days[day][access.resource] += 1

    with open(Path(output_dir, 'dailypagehits.csv'), 'w', newline='') as f:
        out = csv.writer(f)
        out.writerow(('day', *columns))

        for day in sorted(days):
            page_hits = days[day]
            day_label = day.strftime(DAY_FMT)
            out.writerow((day_label, *(page_hits[page] for page in columns)))

            print(day_label)
            for page, hits in page_hits.most_common(5):
                print(hits, page, sep='\t')

    return days


def daily_referrers(visitors, output_dir):
    """Count external referrers per day into 'dailyreferrers.csv'.

    Accesses of every kind of visitor are counted.  Returns the per-day
    counters, keyed by simplified referrer.
    """
    days: Dict[datetime, Counter] = defaultdict(Counter)
    columns = find_referrers(visitors.values())

    print('Referrers:')

    for v in visitors.values():
        for visit in v.visits:
            day = datetime_day(visit[0].time)

            for access in visit:
                if external_referrer(access.referrer):
                    days[day][simplify_referrer(access.referrer)] += 1

    with open(Path(output_dir, 'dailyreferrers.csv'), 'w', newline='') as f:
        out = csv.writer(f)
        out.writerow(('day', *columns))
        print('day', *columns, sep='\t')

        for day in sorted(days):
            refcounts = days[day]
            values = (
                day.strftime(DAY_FMT),
                *(refcounts[ref] for ref in columns)
            )

            out.writerow(values)
            print(*values, sep='\t')

    return days


def dump_stats(visitors, output_dir):
    """Write every daily CSV file, then the 'global.csv' summary.

    visitors is the dict returned by sort_visits(); output_dir must be an
    existing directory.
    """
    output_dir = Path(output_dir)
    daily_visitors(visitors, output_dir)
    visits_pday = daily_visits(visitors, output_dir)
    pagespervisit_pday = daily_pages_per_visit(visitors, output_dir)
    pagehits_pday = daily_page_hits(visitors, output_dir)
    referrers_pday = daily_referrers(visitors, output_dir)

    ua_values = tuple(ua.value for ua in UserAgentKind)

    nb_visitors = {
        ua.value: sum(1 for v in visitors.values() if v.useragent == ua)
        for ua in UserAgentKind
    }
    nb_visits = {
        ua: sum(counter[ua] for counter in visits_pday.values())
        for ua in ua_values
    }
    pages_per_visit = tuple(
        nb for counts in pagespervisit_pday.values() for nb in counts
    )
    hits_per_page = {
        page: sum(counter[page] for counter in pagehits_pday.values())
        for page in find_pages(visitors.values())
    }
    referrers = {
        ref: sum(counter[ref] for counter in referrers_pday.values())
        for ref in find_referrers(visitors.values())
    }

    with open(Path(output_dir, 'global.csv'), 'w', newline='') as f:
        out = csv.writer(f)

        out.writerow(('#visitors',))
        out.writerows((ua, nb_visitors[ua]) for ua in ua_values)
        out.writerow(('total', sum(nb_visitors.values())))

        out.writerow(('#visits',))
        out.writerows((ua, nb_visits[ua]) for ua in ua_values)
        out.writerow(('total', sum(nb_visits.values())))

        out.writerow(('#pages/visit',))
        out.writerows(zip(SUMMARY_COLUMNS, _summarize(pages_per_visit)))

        # reversed(sorted(...)) is kept on purpose: sorted(reverse=True)
        # would order entries with equal counts differently.
        out.writerow(('#views/page',))
        lines = reversed(sorted(hits_per_page.items(), key=lambda kv: kv[1]))
        out.writerows(lines)

        out.writerow(('#referrers',))
        lines = reversed(sorted(referrers.items(), key=lambda kv: kv[1]))
        out.writerows(lines)


def main(logs_paths, output_dir):
    """Parse the given access logs and dump their statistics."""
    accesses = parse(logs_paths)
    visitors = sort_visits(accesses)
    dump_stats(visitors, output_dir)


if __name__ == '__main__':
    # At least one log file and the output directory are required;
    # otherwise argv[-1] would be the script itself.
    if len(argv) < 3:
        raise SystemExit(f'usage: {argv[0]} LOG [LOG...] OUTPUT_DIR')
    main(argv[1:-1], argv[-1])