1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
from dataclasses import dataclass
from datetime import datetime
from itertools import groupby
import re
from sys import argv
import user_agents
# Pattern for one access-log line, written as one fragment per
# space-separated field. Every fragment except the last ends with the
# single space that separates it from the next field.
# NOTE(review): the field layout looks like the Apache/nginx "combined"
# log format -- confirm against the server configuration.
ACCESS_RE = re.compile(
    r'(?P<address>\S+) '                 # first token, captured as "address"
    r'\S+ '                              # second token, ignored
    r'\S+ '                              # third token, ignored
    r'(?P<date>\[\S+ \S+\]) '            # bracketed two-token timestamp
    r'"GET (?P<resource>\S+) [^"]+" '    # quoted request line, GET only
    r'200 [0-9]+ '                       # status 200 followed by a number
    r'"(?P<referer>[^"]+)" '             # quoted, non-empty referer
    r'"(?P<useragent>[^"]+)"'            # quoted, non-empty user agent
)

# strptime() format for the text captured by the "date" group; the
# surrounding square brackets are part of the captured text.
DATE_FMT = '[%d/%b/%Y:%H:%M:%S %z]'
@dataclass
class Access:
    """One access-log entry that matched the log-line pattern.

    Instances are normally created through :meth:`from_log`.
    """

    # Text captured by the "address" group.
    address: str
    # Object returned by user_agents.parse(), not the raw header string.
    # Written as a string annotation so it is never evaluated at class
    # creation time.
    useragent: 'user_agents.parsers.UserAgent'
    # Text captured by the "referer" group.
    referer: str
    # Timestamp parsed from the "date" group with DATE_FMT.
    time: datetime
    # Text captured by the "resource" group.
    resource: str

    @classmethod
    def from_log(cls, info):
        """Build an Access from the named groups of a matched log line.

        Args:
            info: An object indexable by the names 'address',
                'useragent', 'referer', 'date' and 'resource', such as
                a regular-expression match with those named groups.

        Returns:
            A new instance with the user agent parsed by
            ``user_agents.parse`` and the date parsed with ``DATE_FMT``.

        Raises:
            ValueError: If the 'date' text does not fit ``DATE_FMT``.
        """
        # Keyword arguments keep each value tied to its field name rather
        # than to its position in the field list.
        return cls(
            address=info['address'],
            useragent=user_agents.parse(info['useragent']),
            referer=info['referer'],
            time=datetime.strptime(info['date'], DATE_FMT),
            resource=info['resource'],
        )
def interesting(resource):
    """Tell whether *resource* is one of the requests worth reporting.

    A resource qualifies when it is exactly '/', when it starts with
    '/?', or when it ends with '.html'.

    Args:
        resource: The requested path as a string.

    Returns:
        True if the resource qualifies, False otherwise.
    """
    if resource == '/':
        return True
    return resource.startswith('/?') or resource.endswith('.html')
def parse(logs_path):
    """Read an access log and return its interesting entries.

    Each line is matched against ``ACCESS_RE``. Lines that do not match,
    or whose resource is rejected by ``interesting()``, are skipped.

    Args:
        logs_path: Path of the log file to read.

    Returns:
        A tuple of ``Access`` objects in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # An explicit encoding keeps decoding independent of the platform
    # default, and errors='replace' stops one stray byte in a request
    # from aborting the whole run.
    with open(logs_path, encoding='utf-8', errors='replace') as logs_file:
        # Iterating the file streams it line by line instead of holding
        # a second full copy in memory. The trailing newline is harmless
        # because the pattern is matched from the start of the line and
        # is not anchored at the end.
        matches = (ACCESS_RE.match(line) for line in logs_file)
        # The tuple is built inside the with-block: the generator reads
        # from the file lazily, so the file must still be open.
        return tuple(
            Access.from_log(m) for m in matches
            if m is not None and interesting(m['resource'])
        )
def key(access):
    """Return the label used to group accesses by visitor.

    Args:
        access: An object with ``address`` and ``useragent`` attributes.

    Returns:
        The two attributes formatted as text and joined by ' / '.
    """
    return '{} / {}'.format(access.address, access.useragent)
def group(accesses):
    """Collect accesses into per-visitor tuples.

    Args:
        accesses: Iterable of objects accepted by ``key()`` that also
            have ``time``, ``resource`` and ``referer`` attributes.

    Returns:
        A dict mapping each ``key()`` label to a tuple of
        ``(time, resource, referer)`` triples. Labels are inserted in
        sorted order. Within a label the triples keep the order they had
        in *accesses*, because the sort is stable.
    """
    buckets = {}
    for access in sorted(accesses, key=key):
        visit = (access.time, access.resource, access.referer)
        buckets.setdefault(key(access), []).append(visit)
    return {label: tuple(visits) for label, visits in buckets.items()}
def span(visits):
    """Return when a run of visits started and how long it lasted.

    Args:
        visits: Non-empty sequence whose items carry their time at
            index 0.

    Returns:
        A pair ``(first_time, last_time - first_time)``, where "first"
        and "last" refer to position in *visits*.
    """
    first_seen = visits[0][0]
    last_seen = visits[-1][0]
    return first_seen, last_seen - first_seen
def dump(logs_path):
    """Print a per-visitor report of the accesses found in a log file.

    For every visitor label produced by ``group(parse(logs_path))`` this
    prints the label, then the start time and duration from ``span()``,
    then one "resource referer" line per access.

    Args:
        logs_path: Path of the log file to report on.
    """
    visitors = group(parse(logs_path))
    for label in visitors:
        visits = visitors[label]
        print(label)
        first_seen, elapsed = span(visits)
        print(first_seen, elapsed)
        for _when, resource, referer in visits:
            print(f'{resource} {referer}')
        # NOTE(review): the blank line is assumed to separate visitors,
        # one per group -- confirm against the intended output layout.
        print()
if __name__ == '__main__':
    # Without a path argument argv[1] would raise IndexError, so exit
    # with a usage message instead. Extra arguments are still ignored.
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} LOGFILE')
    dump(argv[1])
|