-
Notifications
You must be signed in to change notification settings - Fork 31
Expand file tree
/
Copy pathdata2user.py
More file actions
47 lines (31 loc) · 1.14 KB
/
data2user.py
File metadata and controls
47 lines (31 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
Tabulate user-level statistics
We want to know how many questions of
each type each user has.
Marco Lui, October 2012
"""
import argparse
import csv
import operator
from collections import Counter, defaultdict
from data2vw import norm, Timer, status
def count_user_posts(rows, status_map, progress=None, every=10000):
    """Count posts per user, broken down by status code.

    rows       -- iterable of mappings that provide the 'OwnerUserId' and
                  'OpenStatus' keys (e.g. the rows of a csv.DictReader).
    status_map -- mapping from a row's 'OpenStatus' value to the status
                  code used as the per-user counter key.
    progress   -- optional callable; invoked with the number of rows
                  processed so far after every `every` rows.
    every      -- progress reporting interval, in rows.

    Returns a `(user_posts, total)` pair, where `user_posts[uid][code]` is
    the number of rows seen for that user and status code, and `total` is
    the number of rows consumed (0 for an empty input).

    A row whose 'OpenStatus' is not a key of `status_map` propagates the
    lookup error (KeyError for a plain dict).
    """
    user_posts = defaultdict(lambda: defaultdict(int))
    # Bound up front so that an empty input reports 0 rows instead of
    # leaving the loop variable undefined.
    total = 0
    for total, row in enumerate(rows, 1):
        uid = row['OwnerUserId']
        post_status = status_map[row['OpenStatus']]
        user_posts[uid][post_status] += 1
        if progress is not None and total % every == 0:
            progress(total)
    return user_posts, total


def user_rows(user_posts, num_statuses=6):
    """Yield the output table: a header row, then one row per user.

    user_posts   -- mapping of user id -> mapping of status code -> count.
    num_statuses -- number of 'StatusN' columns to emit (N = 0..num_statuses-1).

    Each data row is `(uid, count_0, ..., count_{num_statuses-1})`; a status
    the user never had is reported as 0.
    """
    yield ('OwnerUserId',) + tuple(
        'Status{0}'.format(k) for k in range(num_statuses))
    for uid, counts in user_posts.items():
        # NOTE(review): columns are looked up by str(k), so this assumes the
        # status codes are the strings '0'..'5' -- confirm against
        # data2vw.status.
        # .get() is used so that reading a missing status does not insert
        # a new key into a defaultdict.
        yield (uid,) + tuple(
            counts.get(str(k), 0) for k in range(num_statuses))


if __name__ == "__main__":
    parser = argparse.ArgumentParser("user stats of SO data")
    parser.add_argument('input')
    parser.add_argument('output')
    args = parser.parse_args()

    # The input handle is managed by `with` so it is closed even if a row
    # fails to parse. Timer comes from data2vw and is used here only through
    # its `elapsed` attribute and `rate(n)` method.
    with open(args.input) as infile, Timer() as t:

        def report(count):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('{0} lines in {1}s ({2} l/s)'.format(
                count, t.elapsed, t.rate(count)))

        user_posts, total = count_user_posts(
            csv.DictReader(infile), status, progress=report)
        # Final summary, printed once after all rows have been read.
        report(total)

    # output stage
    with open(args.output, 'w') as f:
        csv.writer(f).writerows(user_rows(user_posts))