#!/usr/bin/env python
# demo about how to read the data files
# (source: hosted by CoCalc)

import csv
import gzip
import sys
from datetime import datetime

from dateutil.parser import parse as dt_parse

# february data
fn = 'smc_file_access_2016-02.csv.gz'

# The following demo computes the distinct active users by weekday
# and the distinct projects per day of month.
proj_per_day = {}         # day of month -> set of project ids
account_per_weekday = {}  # weekday (0 to 6) -> set of account ids

# Python 3's csv module needs a text stream, so the archive is opened in
# text mode there; Python 2's csv module consumes the byte stream directly.
if sys.version_info[0] >= 3:
    data_file = gzip.open(fn, 'rt', newline='')
else:
    data_file = gzip.open(fn, 'rb')

# The with-block closes the gzip handle even when a row fails to parse.
with data_file:
    file_handler = csv.reader(data_file, delimiter=";")

    for line_no, row in enumerate(file_handler):
        if line_no == 0:
            # the first row is the header
            # print("HEADER: %s" % row)
            continue

        if line_no % 100000 == 0:
            # progress report for large files
            print("at line %d" % line_no)
            # break  # uncomment, to break early while developing

        # Parse the row entries.  Not every column is used by the
        # aggregates below; they are all parsed to show the row layout.
        timestamp = dt_parse(row[0])
        account = int(row[1])
        project = int(row[2])
        filename = int(row[3])
        extension = row[4]

        # derived information
        weekday = timestamp.weekday()  # 0 to 6
        hour = timestamp.hour
        day = timestamp.day

        # Sets deduplicate, so each project/account counts once per bucket.
        proj_per_day.setdefault(day, set()).add(project)
        account_per_weekday.setdefault(weekday, set()).add(account)

# print("") emits a single blank line on both Python 2 and Python 3.
print("")
for day, proj in sorted(proj_per_day.items()):
    print("Day %3d: %5d projects active" % (day, len(proj)))
print("")

for weekday, accounts in sorted(account_per_weekday.items()):
    print("Weekday %2d: %5d accounts active" % (weekday, len(accounts)))
print("")