Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions Maintenance.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
Changes made in the programs:

-- Some changes due to Python 2 and Python 3 incompatibilities

Script:
-- Fix the year problem (checked out) already pulled in github
-- Fix the folder problem (add ldamodels)

-- create a new script prepareQuarter.sh (split XML files into quarters instead of months)

-- Folder users must be created under the topics folder (createUserEvolutionChain method/function)

TextProcessor.py
-- Adapted it to Python 3 (cPickle, string methods, stem bug)
-- Fix file name problem (line 54) open("data/" + str(date) + "-titleS-users.txt", "r") -- it was missing S in the name
-- Adapted it to consider all user's document as one (time sliced)

---- createDictionariesFromFiles: added the creation of file date+"monthly-tokenized_dict-perUser.pdict"

---- createGlobalDictionaryFromMonthly: added a new parameter mergeDocs=False (if False documents are split by post, otherwise by user)
------------- false: opens date+"monthly-tokenized_dict.pdict"
------------- true: opens date+"monthly-tokenized_dict-perUser.pdict"
------------- The output is the same (global dictionary)

---- createMonthsCorpus: added a new parameter mergeDocs=False
------------- false: input: date+"monthly-tokenized_dict.pdict" output: date+'-tokenized.mm'
------------- true: input: date+"monthly-tokenized_dict-perUser.pdict" output: date+'-tokenizedUser.mm'

---- performTFIDF: new parameter added mergeDocs=False
------------- false: input: date+'-tokenized.mm' output: date+'tfidf.mm' and date+"-tfidf.model"
------------- true: input: date+'-tokenizedUser.mm' output: date+'tfidfUser.mm' and date+"-tfidfUser.model"

---- performLDA: new parameter added mergeDocs=False
------------- false: input: "models/" + date +"-tfidf.mm" output: the same file
------------- true: input: "models/" + date +"-tfidfUser.mm" output: the same file

---- calculateEta: new parameter added: mergeDocs=False
------------- false: input: date+"monthly-tokenized_dict.pdict" output: the same
------------- true: input: date+"monthly-tokenized_dict-perUser.pdict" output: the same

-- lookupLDATopics: new parameter added: mergeDocs=False
---- false: input: date+"monthly-tokenized_dict.pdict" output: the same
---- true: input: date+"monthly-tokenized_dict-perUser.pdict" output: the same

-- performLDA
----- Fix bug when calling calculateEta: the parameter "vocabulary size" must be set to len(dictionary.keys()) instead of vocabsize
------------ (since the vocabulary size produced by the dictionary can be smaller than vocabsize)

UserComparator.py
-- summarizeTopicsPerUser received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same
----------- the key for tokenized_dict is userid instead of docid

-- lookupTopics received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same
----------- the key for tokenized_dict is userid instead of docid

TopicStats.py
-- docPerTopic received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same

-- countWords received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same


75 changes: 57 additions & 18 deletions SOParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,19 @@
import re, cgi, os, pickle, logging, time
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pdb

def main():
minposts = 50

quarter=True

years = [2013, 2014]

extractUsers(minposts, years)
extractComments(years)
extractComments(years,quarter)

def extractComments(years):
def extractComments(years,isQuarter=False):
quarters=['1stQ','2ndQ','3rdQ','4thQ']
users = set()
usersFile = open('rawdata/userposts.txt', 'r')
for userline in usersFile:
Expand All @@ -21,35 +26,56 @@ def extractComments(years):
usersFile.close()

for year in years:
print "Parsing year: " + str(year)
months = range(1,13)
print ("Parsing year: " + str(year))

if not isQuarter:
months = range(1,13)
else:
months = range(1,5) ## 4 quarters in a year
####
for month in months:
start = time.time()
yearmonth = str(year) + "-" + str(month).zfill(2)
if not isQuarter:
strmonth=str(month).zfill(2)
else:
strmonth=quarters[month-1]
#####
yearmonth = str(year) + "-" + strmonth
print(yearmonth)
#######
## Dealing with quarters instead of months vvvvv
#######
if month == 1:
lastmonth = str(year-1) + "-12"
if not isQuarter:
lastmonth = str(year-1) + "-12"
else:
lastmonth = str(year-1) + '-' + quarters[-1]
else:
lastmonth = str(year) + "-" + str(month-1).zfill(2)
if not isQuarter:
lastmonth = str(year) + "-" + str(month-1).zfill(2)
else:
lastmonth = str(year) + "-" + quarters[month-2]
###
lastmonthsquestiontitlesfile = "data/" + lastmonth + "-questiontitles.dict"
lastmonthsquestiontagsfile = "data/" + lastmonth + "-questiontags.dict"
if os.path.isfile(lastmonthsquestiontitlesfile):
logging.info('loading title dictionary: %s', lastmonthsquestiontitlesfile)
logging.info('loading tag dictionary: %s', lastmonthsquestiontagsfile)
questiontitles = {}
questiontags = {}
with open(lastmonthsquestiontitlesfile, 'r') as f:
with open(lastmonthsquestiontitlesfile, 'rb') as f: ## add b
questiontitles = pickle.load(f)
logging.info("Elements in questiontitles: %s", len(questiontitles))
with open(lastmonthsquestiontagsfile, 'r') as f:
with open(lastmonthsquestiontagsfile, 'rb') as f: ## add b
questiontags = pickle.load(f)
logging.info("Elements in questiontags: %s", len(questiontags))
else:
logging.info("creating new dictionaries")
questiontitles = {}
questiontags = {}

#######
## ^^^^^ End
#######
monthusers = set()
parsedpostsfile = open("data/"+ yearmonth + "-titles-tags-text.tsv","a")
rawpostsfile = open("rawdata/" + yearmonth + ".Posts.xml", 'r')
Expand All @@ -67,19 +93,31 @@ def extractComments(years):
creationDate = doc.get('CreationDate')
postTypeId = doc.get('PostTypeId')
score = doc.get('Score')
text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','')
#text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','')
text = doc.get('Body').replace('\r\n','').replace('\n','')
tagremove = re.compile(r'(<!--.*?-->|<[^>]*>)')
text = cgi.escape(tagremove.sub('', re.sub('<code>[^>]+</code>', '', text)))

parent = doc.get('ParentId')
if 'Title' in doc.keys():
title = doc.get('Title').encode('utf8')
#title = doc.get('Title').encode('utf8')
title = doc.get('Title')
if type(title) is bytes:
print('>>>>>>>> Byte')
title=title.decode('utf8')
else:
title = ''
if 'Tags' in doc.keys():
tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","")
#tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","")
tags = doc.get('Tags').replace("><", ",").replace("<","").replace(">","")
if type(tags) is bytes:
print('>>>>>>>> Byte')
tags=tags.decode('utf8')
else:
tags = ''
####
##pdb.set_trace()
####
if postTypeId == "1":
questiontags[rowID] = tags
questiontitles[rowID] = title
Expand All @@ -94,11 +132,12 @@ def extractComments(years):
parsedpostsfile.close()
rawpostsfile.close()

with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f:
#pdb.set_trace()
with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f:
f.write("\n".join(monthusers))
with open("data/" + yearmonth + "-questiontitles.dict", 'w') as f:
with open("data/" + yearmonth + "-questiontitles.dict", 'wb') as f: ## add b (binary mode)
pickle.dump(questiontitles, f, pickle.HIGHEST_PROTOCOL)
with open("data/" + yearmonth + "-questiontags.dict", 'w') as f:
with open("data/" + yearmonth + "-questiontags.dict", 'wb') as f: ## add b (binary mode)
pickle.dump(questiontags, f, pickle.HIGHEST_PROTOCOL)
end = time.time() - start
logging.info("Elapsed time (s): %s", end)
Expand All @@ -108,7 +147,7 @@ def extractComments(years):
def extractUsers(minPostCount, years):
users = {}
for year in years:
print "Parsing year: " +str(year)
print ("Parsing year: " +str(year))
posts = open("rawdata/"+str(year)+".Posts.xml", 'r')
for post in posts:
post = post.rstrip('\n')
Expand Down
Loading