diff --git a/Maintenance.txt b/Maintenance.txt
new file mode 100644
index 0000000..8f4e54d
--- /dev/null
+++ b/Maintenance.txt
@@ -0,0 +1,69 @@
+Changes made to the programs:
+
+-- Fixes for Python 2 / Python 3 incompatibilities
+
+Script:
+-- Fixed the year problem (verified); the fix is already merged on GitHub
+-- Fixed the missing folder problem (added the ldamodels folder)
+
+-- Created a new script, prepareQuarter.sh, which splits the XML files into quarters instead of months
+
+-- The folder users must be created under the topics folder (e.g. mkdir topics/users) before running the createUserEvolutionChain function
+
+TextProcessor.py
+-- Adapted it to Python 3 (cPickle import, string methods, PorterStemmer stem bug)
+-- Fixed a file name problem (line 54): open("data/" + str(date) + "-titles-users.txt", "r") -- the "s" in "titles" was missing
+-- Adapted it to treat all of a user's documents in a time slice as a single document (see the sketch at the end of this section)
+
+---- createDictionariesFromFiles: now also creates the file date+"-monthly-tokenized_dict-perUser.pdict"
+
+---- createGlobalDictionaryFromMonthly: added a new parameter mergeDocs=False (if False, documents are split by post; otherwise by user)
+------------- false: opens date+"-monthly-tokenized_dict.pdict"
+------------- true: opens date+"-monthly-tokenized_dict-perUser.pdict"
+------------- The output is the same (global dictionary)
+
+---- createMonthCorpuses: added a new parameter mergeDocs=False
+------------- false: input: date+"-monthly-tokenized_dict.pdict"  output: date+'-tokenized.mm'
+------------- true: input: date+"-monthly-tokenized_dict-perUser.pdict"  output: date+'-tokenizedUser.mm'
+
+---- performTFIDF: added a new parameter mergeDocs=False
+------------- false: input: date+'-tokenized.mm'  output: date+'-tfidf.mm' and date+"-tfidf.model"
+------------- true: input: date+'-tokenizedUser.mm'  output: date+'-tfidfUser.mm' and date+"-tfidfUser.model"
+
+---- performLDA: added a new parameter mergeDocs=False
+------------- false: input: "models/" + date + "-tfidf.mm"  output: unchanged
+------------- true: input: "models/" + date + "-tfidfUser.mm"  output: unchanged
+
+---- calculateEta: added a new parameter mergeDocs=False
+------------- false: input: priordate+"-monthly-tokenized_dict.pdict"  output: unchanged
+------------- true: input: priordate+"-monthly-tokenized_dict-perUser.pdict"  output: unchanged
+
+-- lookupLDATopics: added a new parameter mergeDocs=False
+---- false: input: "models/global-tokenized_dict.pdict"  output: unchanged
+---- true: input: "models/global-tokenized_dict-perUser.pdict"  output: unchanged
+
+-- performLDA
+----- Fixed a bug in the call to calculateEta: the "vocabulary size" parameter must be set to len(dictionary.keys()) instead of vocabsize
+------------ (since the vocabulary size produced by the dictionary can be smaller than vocabsize)
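+
+---- A minimal sketch (illustration only) of the per-user merge now done inside createDictionariesFromFiles,
+---- using the field order of the "-titles-tags-text.tsv" files:
+
+        per_post = {}   # docid  -> tokens  (one document per post)
+        per_user = {}   # userid -> tokens  (all of a user's posts merged into one document)
+        for line in open("data/" + date + "-titles-tags-text.tsv"):
+            docid, userid, creationdate, score, title, tags, text = line.rstrip("\n").split("\t")
+            tokens = tokenizeandstemline(title + " " + tags + " " + text)
+            per_post[docid] = tokens
+            per_user.setdefault(userid, []).extend(tokens)
+        # per_post is pickled to "models/"+date+"-monthly-tokenized_dict.pdict"
+        # per_user is pickled to "models/"+date+"-monthly-tokenized_dict-perUser.pdict"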
+
+UserComparator.py
+-- summarizeTopicsPerUser: received a new parameter (mergeDocs) to handle documents either per post or per user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict"  output: unchanged
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict"  output: unchanged
+----------- the key of tokenized_dict is userid instead of docid
+
+-- lookupTopics: received a new parameter (mergeDocs) to handle documents either per post or per user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict"  output: unchanged
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict"  output: unchanged
+----------- the key of tokenized_dict is userid instead of docid
+
+TopicStats.py
+-- docPerTopic: received a new parameter (mergeDocs) to handle documents either per post or per user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict"  output: unchanged
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict"  output: unchanged
+
+-- countWords: received a new parameter (mergeDocs) to handle documents either per post or per user (see the sketch below)
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict"  output: unchanged
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict"  output: unchanged
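+
+---- In all of the mergeDocs-aware functions above the flag is used the same way; a minimal sketch:
+
+        suffix = "-monthly-tokenized_dict-perUser.pdict" if mergeDocs else "-monthly-tokenized_dict.pdict"
+        with open("models/" + date + suffix, 'rb') as f:
+            tokenized_dict = cPickle.load(f)
+        key = userid if mergeDocs else docid   # per-user documents are keyed by userid
+        sentence = tokenized_dict[key]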
+
+
diff --git a/SOParser.py b/SOParser.py
index ebafd9d..3e663e1 100644
--- a/SOParser.py
+++ b/SOParser.py
@@ -5,14 +5,19 @@
import re, cgi, os, pickle, logging, time
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+import pdb
+
def main():
minposts = 50
-
+ quarter=True
+
years = [2013, 2014]
+
extractUsers(minposts, years)
- extractComments(years)
+ extractComments(years,quarter)
-def extractComments(years):
+def extractComments(years,isQuarter=False):
+ quarters=['1stQ','2ndQ','3rdQ','4thQ']
users = set()
usersFile = open('rawdata/userposts.txt', 'r')
for userline in usersFile:
@@ -21,17 +26,36 @@ def extractComments(years):
usersFile.close()
for year in years:
- print "Parsing year: " + str(year)
- months = range(1,13)
+ print ("Parsing year: " + str(year))
+ if not isQuarter:
+ months = range(1,13)
+ else:
+ months = range(1,5) ## 4 quarters in a year
+ ####
for month in months:
start = time.time()
- yearmonth = str(year) + "-" + str(month).zfill(2)
+ if not isQuarter:
+ strmonth=str(month).zfill(2)
+ else:
+ strmonth=quarters[month-1]
+ #####
+ yearmonth = str(year) + "-" + strmonth
print(yearmonth)
+ #######
+            ## Dealing with quarters instead of months vvvvv
+ #######
if month == 1:
- lastmonth = str(year-1) + "-12"
+ if not isQuarter:
+ lastmonth = str(year-1) + "-12"
+ else:
+ lastmonth = str(year-1) + '-' + quarters[-1]
else:
- lastmonth = str(year) + "-" + str(month-1).zfill(2)
+ if not isQuarter:
+ lastmonth = str(year) + "-" + str(month-1).zfill(2)
+ else:
+ lastmonth = str(year) + "-" + quarters[month-2]
+ ###
lastmonthsquestiontitlesfile = "data/" + lastmonth + "-questiontitles.dict"
lastmonthsquestiontagsfile = "data/" + lastmonth + "-questiontags.dict"
if os.path.isfile(lastmonthsquestiontitlesfile):
@@ -39,17 +63,19 @@ def extractComments(years):
logging.info('loading tag dictionary: %s', lastmonthsquestiontagsfile)
questiontitles = {}
questiontags = {}
- with open(lastmonthsquestiontitlesfile, 'r') as f:
+ with open(lastmonthsquestiontitlesfile, 'rb') as f: ## add b
questiontitles = pickle.load(f)
logging.info("Elements in questiontitles: %s", len(questiontitles))
- with open(lastmonthsquestiontagsfile, 'r') as f:
+ with open(lastmonthsquestiontagsfile, 'rb') as f: ## add b
questiontags = pickle.load(f)
logging.info("Elements in questiontags: %s", len(questiontags))
else:
logging.info("creating new dictionaries")
questiontitles = {}
questiontags = {}
-
+ #######
+ ## ^^^^^ End
+ #######
monthusers = set()
parsedpostsfile = open("data/"+ yearmonth + "-titles-tags-text.tsv","a")
rawpostsfile = open("rawdata/" + yearmonth + ".Posts.xml", 'r')
@@ -67,19 +93,31 @@ def extractComments(years):
creationDate = doc.get('CreationDate')
postTypeId = doc.get('PostTypeId')
score = doc.get('Score')
- text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','')
+ #text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','')
+ text = doc.get('Body').replace('\r\n','').replace('\n','')
tagremove = re.compile(r'(|<[^>]*>)')
text = cgi.escape(tagremove.sub('', re.sub('[^>]+', '', text)))
parent = doc.get('ParentId')
if 'Title' in doc.keys():
- title = doc.get('Title').encode('utf8')
+ #title = doc.get('Title').encode('utf8')
+ title = doc.get('Title')
+ if type(title) is bytes:
+ print('>>>>>>>> Byte')
+ title=title.decode('utf8')
else:
title = ''
if 'Tags' in doc.keys():
- tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","")
+ #tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","")
+ tags = doc.get('Tags').replace("><", ",").replace("<","").replace(">","")
+ if type(tags) is bytes:
+ print('>>>>>>>> Byte')
+ tags=tags.decode('utf8')
else:
tags = ''
+ ####
+ ##pdb.set_trace()
+ ####
if postTypeId == "1":
questiontags[rowID] = tags
questiontitles[rowID] = title
@@ -94,11 +132,12 @@ def extractComments(years):
parsedpostsfile.close()
rawpostsfile.close()
- with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f:
+ #pdb.set_trace()
+ with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f:
f.write("\n".join(monthusers))
- with open("data/" + yearmonth + "-questiontitles.dict", 'w') as f:
+ with open("data/" + yearmonth + "-questiontitles.dict", 'wb') as f: ## add b (binary mode)
pickle.dump(questiontitles, f, pickle.HIGHEST_PROTOCOL)
- with open("data/" + yearmonth + "-questiontags.dict", 'w') as f:
+ with open("data/" + yearmonth + "-questiontags.dict", 'wb') as f: ## add b (binary mode)
pickle.dump(questiontags, f, pickle.HIGHEST_PROTOCOL)
end = time.time() - start
logging.info("Elapsed time (s): %s", end)
@@ -108,7 +147,7 @@ def extractComments(years):
def extractUsers(minPostCount, years):
users = {}
for year in years:
- print "Parsing year: " +str(year)
+ print ("Parsing year: " +str(year))
posts = open("rawdata/"+str(year)+".Posts.xml", 'r')
for post in posts:
post = post.rstrip('\n')
diff --git a/TextProcessor.py b/TextProcessor.py
index 4f4e91a..cf5fc7e 100644
--- a/TextProcessor.py
+++ b/TextProcessor.py
@@ -1,36 +1,45 @@
from __future__ import print_function
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
-import logging, re, numpy, cPickle
+import logging, re, numpy
+import _pickle as cPickle ## Python 3 does not have cPickle
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
-
+import pdb
def main():
"""Main entry."""
global priorweight
- dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
- '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
+ dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
+ # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
- dates = ['2013-01', '2013-02', '2013-03']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12']
numtopics = 40
vocabsize = 2000
priorweight = 0.05
- workers = 3
- # filterUsers(dates)
+ workers = 1
+
+ merge=True
+ ########
+ filterUsers(dates)
createDictionariesFromFiles(dates)
- createGlobalDictionaryFromMonthly(dates, vocabsize)
- createMonthCorpuses(dates)
- #
- performTFIDF(dates)
- performLDA(dates, numtopics, vocabsize, workers)
- # lookupTopics(dates)
+ createGlobalDictionaryFromMonthly(dates, vocabsize,merge)
+ createMonthCorpuses(dates,merge)
+
+ performTFIDF(dates,merge)
+ #######
+ performLDA(dates, numtopics, vocabsize, workers,merge)
+ #######
+ #lookupTopics(dates)
+ #lookatdist(dates[1])
+ #######
def lookatdist(date):
@@ -45,7 +54,7 @@ def filterUsers(dates):
users = set()
for date in dates:
musers = set()
- for line in open("data/" + str(date) + "-title-users.txt", "r"):
+ for line in open("data/" + str(date) + "-titles-users.txt", "r"): #"-title-users.txt", "r"):
musers.add(line.strip("\n"))
if len(users) == 0:
users = musers
@@ -64,8 +73,12 @@ def readFile(date):
original_sentences[id] = text
return original_sentences
-def lookupLDATopics(date, docIDs, numTopics):
- tokenized_dictfile = "models/global-tokenized_dict.pdict"
+def lookupLDATopics(date, docIDs, numTopics, mergeDocs=False):
+ if not mergeDocs:
+ tokenized_dictfile="models/global-tokenized_dict.pdict"
+ else:
+ tokenized_dictfile="models/global-tokenized_dict-perUser.pdict"
+ #####
with open(tokenized_dictfile, 'rb') as f:
tokenized_dict = cPickle.load(f)
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
@@ -77,9 +90,13 @@ def lookupLDATopics(date, docIDs, numTopics):
topics_by_value = sorted(topics, key=lambda tup: tup[1], reverse=True)
return topics_by_value[:numTopics]
-def calculateEta(dates, date, numtopics, vocabsize):
+def calculateEta(dates, date, numtopics, vocabsize,mergeDocs=False):
priordate = dates[dates.index(date) - 1]
- tokenized_dictfile = "models/"+priordate+"-monthly-tokenized_dict.pdict"
+ if not mergeDocs:
+ suffix="-monthly-tokenized_dict.pdict"
+ else:
+ suffix="-monthly-tokenized_dict-perUser.pdict"
+ tokenized_dictfile = "models/"+priordate+suffix
with open(tokenized_dictfile, 'rb') as f:
tokenized_dict = cPickle.load(f)
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
@@ -132,22 +149,36 @@ def calculateEta2(dates, date, numtopics, vocabsize, minpriorvalue):
eta[topicid][index] = value
return eta
-def performTFIDF(dates):
+def performTFIDF(dates, mergeDocs=False):
for date in dates:
- corpus = corpora.MmCorpus("models/" + date + "-tokenized.mm")
+ if not mergeDocs:
+ suffix_tok="-tokenized.mm"
+ suffix_tfidf_model="-tfidf.model"
+ suffix_tfidf_corpus="-tfidf.mm"
+ else:
+ suffix_tok="-tokenizedUser.mm"
+ suffix_tfidf_model="-tfidfUser.model"
+ suffix_tfidf_corpus="-tfidfUser.mm"
+
+ corpus = corpora.MmCorpus("models/" + date + suffix_tok)
tfidf = models.TfidfModel(corpus)
- tfidf.save("models/"+date+"-tfidf.model")
+ tfidf.save("models/"+date+ suffix_tfidf_model)
tfidf_corpus = tfidf[corpus]
- corpora.MmCorpus.save_corpus("models/"+date+"-tfidf.mm", tfidf_corpus)
+ corpora.MmCorpus.save_corpus("models/"+date+ suffix_tfidf_corpus, tfidf_corpus)
-def performLDA(dates, numtopics, vocabsize, workers):
+def performLDA(dates, numtopics, vocabsize, workers,mergeDocs=False):
for date in dates:
+ if not mergeDocs:
+ suffix="-tfidf.mm"
+ else:
+ suffix="-tfidfUser.mm"
print("performing lda on " + str(date))
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
- corpus = corpora.MmCorpus("models/" + date + "-tfidf.mm")
+ corpus = corpora.MmCorpus("models/" + date + suffix)
if date != dates[0] and priorweight != 0:
logging.info("Calculating eta based on prior month")
- eta = calculateEta(dates, date, numtopics, vocabsize)
+ eta = calculateEta(dates, date, numtopics, len(dictionary.keys()),mergeDocs) ## vocabsize -> len(dictionary.keys()) SAFER!
+ # pdb.set_trace()
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=numtopics, workers=workers, eta=eta)
else:
logging.info("Eta weighting factor too low or no prior months")
@@ -159,18 +190,25 @@ def performLDA(dates, numtopics, vocabsize, workers):
def tokenizeandstemline(text):
stoplist = STOPWORDS
stemmer = PorterStemmer()
- tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text.decode('utf-8'), language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None]
+ ### Python 3 does not have str.decode, and the method PorterStemmer.stem() has a bug
+ #tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text.decode('utf-8'), language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None]
+ tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text, language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None]
+ ###
return tokenized_line
def writecpicklefile(content, filename):
with open(filename, 'wb') as f:
- cPickle.dump(content, f, cPickle.HIGHEST_PROTOCOL)
+ cPickle.dump(content, f, -1) #cPickle.HIGHEST_PROTOCOL) ## Python 3 does not have the macro HIGHEST_PROTOCOL
-def createGlobalDictionaryFromMonthly(dates, vocabsize):
+def createGlobalDictionaryFromMonthly(dates, vocabsize, mergeDocs=False):
global_tokenized_dict = {}
for date in dates:
- monthly_tokenized_dictfile = "models/" + date + "-monthly-tokenized_dict.pdict"
+ if not mergeDocs:
+ suffix="-monthly-tokenized_dict.pdict"
+ else:
+ suffix="-monthly-tokenized_dict-perUser.pdict"
+ monthly_tokenized_dictfile = "models/" + date + suffix
with open(monthly_tokenized_dictfile, 'rb') as f:
logging.info("Opening file %s", monthly_tokenized_dictfile)
global_tokenized_dict = merge_two_dicts(cPickle.load(f), global_tokenized_dict)
@@ -192,6 +230,9 @@ def createDictionariesFromFiles(dates):
for date in dates:
print("parsing month: " + date)
monthly_tokenized_dict = {}
+ ####
+ monthly_tokenized_byUser = {}
+ ####
monthly_original_dict = {}
docids = {}
for line in open("data/" + date + "-titles-tags-text.tsv"):
@@ -199,25 +240,44 @@ def createDictionariesFromFiles(dates):
docids[id] = (userid, score)
text = title + " " + tags + " " + text
tokenized_line = tokenizeandstemline(text)
- monthly_tokenized_dict[id] = tokenized_line
+ monthly_tokenized_dict[id] = tokenized_line.copy()
monthly_original_dict[id] = text
+ #### merge all user's documents
+ if userid in monthly_tokenized_byUser:
+ monthly_tokenized_byUser[userid].extend(tokenized_line.copy())
+ else:
+ monthly_tokenized_byUser[userid]=tokenized_line.copy()
+ ####
+ ### pdb.set_trace() ## just in case :)
monthly_docids_dictfile = "models/"+date+"-docids.pdict"
writecpicklefile(docids, monthly_docids_dictfile)
monthly_tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
writecpicklefile(monthly_tokenized_dict, monthly_tokenized_dictfile)
+ ####
+ monthly_tokenized_dictfile_perUser = "models/"+date+"-monthly-tokenized_dict-perUser.pdict"
+ writecpicklefile(monthly_tokenized_byUser, monthly_tokenized_dictfile_perUser)
+ ####
monthly_original_dictfile = "models/"+date+"-monthly-original_dict.pdict"
writecpicklefile(monthly_original_dict, monthly_original_dictfile)
-def createMonthCorpuses(dates):
+def createMonthCorpuses(dates,mergeDocs=False):
for date in dates:
logging.info("Parsing date: %s", date)
print("parsing month: " + date)
- monthly_dict_file = "models/" + date + "-monthly-tokenized_dict.pdict"
+ if not mergeDocs:
+ suffix_source="-monthly-tokenized_dict.pdict"
+ suffix_target='-tokenized.mm'
+ else:
+ suffix_source="-monthly-tokenized_dict-perUser.pdict"
+ suffix_target='-tokenizedUser.mm'
+
+ monthly_dict_file = "models/" + date + suffix_source
with open(monthly_dict_file, 'rb') as f:
tokenized_dict = cPickle.load(f)
dictionary = corpora.Dictionary.load('models/global-dictionary.dict')
corpus = [dictionary.doc2bow(sentence) for sentence in tokenized_dict.values()]
- corpora.MmCorpus.serialize('models/' + date + '-tokenized.mm', corpus)
+ corpora.MmCorpus.serialize('models/' + date + suffix_target, corpus)
+
if __name__ == '__main__':
main()
diff --git a/TopicComparator.py b/TopicComparator.py
index 81a207f..b57c897 100644
--- a/TopicComparator.py
+++ b/TopicComparator.py
@@ -4,13 +4,16 @@
from numpy.linalg import norm
from numpy import array
-
+import pdb
def main():
global numtopics, vocabsize
- dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
- '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
- # dates = ['2013-02', '2013-03'] #, '2013-03', '2013-03']
+ dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
+ # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
+ #dates = ['2013-01','2013-02', '2013-03'] #, '2013-03', '2013-03']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12']
+
numtopics = 40
vocabsize = 2000
# compareMonths(dates)
@@ -21,7 +24,7 @@ def main():
def compareMonths(dates):
i = 1
for month in dates:
- print month
+ print (month)
nextmonth = dates[i]
TVDBasedSimilarity(month, nextmonth)
KLDBasedSimilarity(month, nextmonth)
@@ -110,7 +113,9 @@ def printTopicWords(dates):
lda = models.LdaModel.load("ldamodels/" + month + "-lda.model")
topicfile = open("topics/"+month+"-topicwords.txt", "w")
ldalist = lda.show_topics(num_topics=numtopics, num_words=10, log=False, formatted=False)
- wordlists = { topic[0]: [wordvals[0].encode('utf-8') for wordvals in topic[1]] for topic in ldalist}
+ #pdb.set_trace()
+ # wordlists = { topic[0]: [wordvals[0].encode('utf-8') for wordvals in topic[1]] for topic in ldalist} ## delete encode('utf-8')
+ wordlists = { topic[0]: [wordvals[0] for wordvals in topic[1]] for topic in ldalist}
for topic in wordlists.keys():
line = str(topic) + "\t" + " ".join(wordlists[topic]) + "\n"
topicfile.write(line)
diff --git a/TopicStats.py b/TopicStats.py
index 777bbb5..e0a05f8 100644
--- a/TopicStats.py
+++ b/TopicStats.py
@@ -1,22 +1,26 @@
from gensim import corpora, models
-import logging, numpy, cPickle
+import logging, numpy #, cPickle
+import _pickle as cPickle ## Python 3 does not have cPickle
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
def main():
global topicthreshold
- dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
- '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
+ dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
+ # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
# dates = ['2013-01', '2013-02']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12']
+
topics = 40
topicthreshold = 0.3
+ merge=True
+ countWords(dates, topics,merge)
+ docPerTopic(dates,merge)
- # countWords(dates, topics)
- docPerTopic(dates)
-
-def docPerTopic(dates):
+def docPerTopic(dates,mergeDocs):
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
doctopics = {}
topicfile = open("stats/docpertopic.tsv", 'w')
@@ -25,7 +29,11 @@ def docPerTopic(dates):
date = str(date)
print(date)
- tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ if not mergeDocs:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ else:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict"
+ ####
with open(tokenized_dictfile, 'rb') as f:
tokenized_dict = cPickle.load(f)
@@ -35,8 +43,12 @@ def docPerTopic(dates):
for doc in documentfile:
[docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t")
-
- sentence = tokenized_dict[docid]
+ ######
+ if not mergeDocs:
+ sentence = tokenized_dict[docid]
+ else:
+ sentence = tokenized_dict[userid]
+ #######
bow = dictionary.doc2bow(sentence)
documenttopics = lda[bow]
for (topicid, topicvalue) in documenttopics:
@@ -53,7 +65,7 @@ def docPerTopic(dates):
doctopics[topicid][date] = 0
doctopics[topicid][date]+=1
- print doctopics
+ print (doctopics)
for topicid in doctopics.keys():
line = str(topicid)
for date in doctopics[topicid].keys():
@@ -65,7 +77,7 @@ def docPerTopic(dates):
-def countWords(dates, numtopics):
+def countWords(dates, numtopics, mergeDocs):
wordfile = open("stats/wordcounts.tsv", "w")
words = {} #each word counted once per doc
totalwords = {} #each word counted n times per n mentions in doc
@@ -76,7 +88,11 @@ def countWords(dates, numtopics):
words[date] = 0
uniquewords[date] = set()
totalwords[date] = 0
- tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ if not mergeDocs:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ else:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict"
+ ###
with open(tokenized_dictfile, 'rb') as f:
tokenized_dict = cPickle.load(f)
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
@@ -86,6 +102,7 @@ def countWords(dates, numtopics):
logging.info("Parsing date: %s", str(date))
[countedwordtopics.append(0) for i in range(numtopics)]
for docID in tokenized_dict.keys():
+            # check whether docID or userID should be used here when mergeDocs is True
doc = tokenized_dict[docID]
bow = dictionary.doc2bow(doc)
wordcount = len(bow)
diff --git a/UserComparator.py b/UserComparator.py
index 73675f3..3cf34ba 100644
--- a/UserComparator.py
+++ b/UserComparator.py
@@ -1,30 +1,39 @@
from gensim import corpora, models
-import cPickle, numpy, logging, scipy
+import numpy, logging, scipy
from numpy.linalg import norm
from scipy.stats import entropy
+import _pickle as cPickle ## Python 3 does not have cPickle
+
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+import pdb
+
def main():
global topicthreshold
- dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09',
- '2013-10', '2013-11', '2013-12',
- '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09',
- '2014-10', '2014-11', '2014-12']
- # dates = ['2013-01', '2013-02', '2013-03']
+ dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09',
+ # '2013-10', '2013-11', '2013-12',
+ # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09',
+ # '2014-10', '2014-11', '2014-12']
+ #dates = ['2013-01', '2013-02', '2013-03']
+ #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12']
+
numtopics = 40
vocabsize = 2000
topicthreshold = 0.3
+ merge=True
- # summarizeTopicsPerUser(dates)
- # compareMonths(dates)
- # lookupTopics(dates)
+ #summarizeTopicsPerUser(dates,merge)
+ #compareMonths(dates)
+ #lookupTopics(dates,merge)
+ #####
createUserEvolutionChain(dates)
def compareMonths(dates):
i = 1
for month in dates:
- print month
+ print (month)
nextmonth = dates[i]
# TVDBasedSimilarity(month, nextmonth)
KLDBasedSimilarity(month, nextmonth)
@@ -83,7 +92,7 @@ def JSD(P, Q):
-def summarizeTopicsPerUser(dates):
+def summarizeTopicsPerUser(dates,mergeDocs=False):
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
usersfile = "data/allusers.txt"
users = set(open(usersfile).read().split())
@@ -96,18 +105,25 @@ def summarizeTopicsPerUser(dates):
date = str(date)
print(date)
- tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ if not mergeDocs:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ else:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict"
+ ###
with open(tokenized_dictfile, 'rb') as f:
tokenized_dict = cPickle.load(f)
documentfile = open("data/" + date + "-titles-tags-text.tsv")
lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model")
-
+ ###pdb.set_trace()
for doc in documentfile:
[docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t")
document_users[docid] = userid
document_scores[docid] = score
- sentence = tokenized_dict[docid]
+ if not mergeDocs:
+ sentence = tokenized_dict[docid]
+ else:
+ sentence = tokenized_dict[userid]
bow = dictionary.doc2bow(sentence)
documenttopics = lda[bow]
for (topicid, topicvalue) in documenttopics:
@@ -148,7 +164,7 @@ def writecpicklefile(content, filename):
-def lookupTopics(dates):
+def lookupTopics(dates,mergeDocs=False):
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
document_users = {}
document_scores = {}
@@ -156,8 +172,12 @@ def lookupTopics(dates):
for date in dates:
date = str(date)
print(date)
-
- tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+
+ if not mergeDocs:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+ else:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict"
+ ####
with open(tokenized_dictfile, 'rb') as f:
tokenized_dict = cPickle.load(f)
@@ -179,7 +199,10 @@ def lookupTopics(dates):
continue
document_users[docid] = userid
document_scores[docid] = score
- sentence = tokenized_dict[docid]
+ if not mergeDocs:
+ sentence = tokenized_dict[docid]
+ else:
+ sentence = tokenized_dict[userid]
bow = dictionary.doc2bow(sentence)
documenttopics = lda[bow]
for (topicid, topicvalue) in documenttopics:
@@ -215,6 +238,7 @@ def lookupTopics(dates):
# resultline = str(topicid) + "\t" + str(userid) + "\t" + str(meantopicvalue) + "\n"
topicfile.write(resultline)
topicfile.close()
+ print('***** End (lookupTopics) ****')
def createUserEvolutionChain(dates):
topicscores={}
@@ -234,6 +258,7 @@ def createUserEvolutionChain(dates):
users.add("7585")
users.add("12579")
+ print('***** Begin (createUserEvolution) ****')
for date in dates:
topicfile = open("topics/" + date + "-topics.txt", 'r')
allwords[date] = {}
diff --git a/UserStatistics.py b/UserStatistics.py
new file mode 100644
index 0000000..0ee96c3
--- /dev/null
+++ b/UserStatistics.py
@@ -0,0 +1,74 @@
+### Standalone script that computes per-user topic statistics for a single time slice
+### (the body mirrors UserComparator.lookupTopics).
+from gensim import corpora, models
+import numpy
+import _pickle as cPickle ## Python 3 does not have cPickle
+
+mergeDocs=False
+dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ']
+date = dates[0] ## assumption: the time slice to analyse; change as needed
+topicthreshold = 0.3
+dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
+document_users = {}
+document_scores = {}
+users = set()
+
+if not mergeDocs:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict"
+else:
+ tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict"
+####
+with open(tokenized_dictfile, 'rb') as f:
+ tokenized_dict = cPickle.load(f)
+
+usertopics = {}
+userdoctopics = {}
+usertopicscores = {}
+documentfile = open("data/" + date + "-titles-tags-text.tsv")
+topicfile = open("topics/" + date + "-topics.txt", 'w')
+headerline = "UserID\ttopicID\tmeantopicvalue\tnumdocs\tmeantopicscore\ttopicword1\ttopicword2\ttopicword3\ttopicword4\ttopicword5\n"
+topicfile.write(headerline)
+lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model")
+
+for doc in documentfile:
+ [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t")
+ if date == dates[0]:
+ users.add(userid)
+ else:
+ if userid not in users:
+ continue
+ document_users[docid] = userid
+ document_scores[docid] = score
+ if not mergeDocs:
+ sentence = tokenized_dict[docid]
+ else:
+ sentence = tokenized_dict[userid]
+ bow = dictionary.doc2bow(sentence)
+ documenttopics = lda[bow]
+ for (topicid, topicvalue) in documenttopics:
+ if topicvalue >= topicthreshold:
+ try:
+ userdoctopics[userid]
+ except KeyError:
+ userdoctopics[userid] = {}
+ userdoctopics[userid][topicid] = []
+ usertopicscores[userid] = {}
+ usertopicscores[userid][topicid] = []
+ try:
+ userdoctopics[userid][topicid]
+ except KeyError:
+ userdoctopics[userid][topicid] = []
+ usertopicscores[userid][topicid] = []
+ userdoctopics[userid][topicid].append(topicvalue)
+ usertopicscores[userid][topicid].append(int(score))
+for userid in userdoctopics.keys():
+ usertopics[userid] = {}
+ for topicid in userdoctopics[userid].keys():
+ meantopicvalue = numpy.mean(userdoctopics[userid][topicid])
+ meantopicscore = numpy.mean(usertopicscores[userid][topicid])
+ numdocs = len(userdoctopics[userid][topicid])
+ if meantopicvalue < topicthreshold:
+ continue
+ usertopics[userid][topicid] = meantopicvalue
+ topicterms = lda.get_topic_terms(topicid, topn=5)
+ topicwords = ""
+ for term in topicterms:
+ topicwords += dictionary.get(term[0]).ljust(15) + "\t"
+ resultline = str(userid)+"\t"+str(topicid)+"\t"+ str(meantopicvalue) + "\t" + str(numdocs) + "\t" + str(meantopicscore) + "\t" + str(topicwords) + "\n"
+ # resultline = str(topicid) + "\t" + str(userid) + "\t" + str(meantopicvalue) + "\n"
+ topicfile.write(resultline)
+topicfile.close()
+print('***** End (UserStatistics) ****')
diff --git a/downloadAndPrepareData.sh b/downloadAndPrepareData.sh
index 6b4db83..dab73b2 100755
--- a/downloadAndPrepareData.sh
+++ b/downloadAndPrepareData.sh
@@ -7,6 +7,8 @@ mkdir data
mkdir models
mkdir topics
mkdir rawdata
+# the following folder was missing
+mkdir ldamodels
cd rawdata
if [ ! -f "stackoverflow.com-Posts.7z" ]; then
diff --git a/plotFunctionsDoc.py b/plotFunctionsDoc.py
new file mode 100644
index 0000000..01c82be
--- /dev/null
+++ b/plotFunctionsDoc.py
@@ -0,0 +1,220 @@
+### How to use it:
+### The dictionary docTop must be populated: docTop=buildDict(dates,'-doctopicdist.txt','topics/DocDistrib/')
+### in the example, dates are the dates of the files, '-doctopicdist.txt' is the suffix and 'topics/DocDistrib/' the folder (optional)
+### Available functions:
+### buildDocTopic(tpId, threshold) builds a list SxD (S: # of time slices, D: # of documents)
+### with the probabilities associated to documents
+### plotDocvsTop(threshold) plots the number of documents by topic in all time slices
+### oneVsoneDocs(statistic function, first time slice, data returned by buildDocTopic, arguments for statistic function)
+### returns a list of p-values of the samples (slice_i x slice_i+1, slice_i+1 x slice_i+2 ....)
+### and a list for labeling the X axis
+###
+### oneVsallDocs(statistic function, first time slice, data returned by buildDocTopic, arguments for statistic function)
+### returns a list of p-values of the samples (slice_first x slice_i+1, slice_first x slice_i+2 ....)
+### and a list for labeling the X axis
+### PlotStatsDocs(statistic function, start, label of the graph, call oneVsallDocs (True) or oneVsoneDocs (False), threshold, list of topics, arguments for function)
+### plots a line graph with the p-values of the samples being compared
+### calls buildDocTopic, and oneVsallDocs or oneVsoneDocs, for every topic and plots the result
+### Examples
+### t=buildDocTopic(10,.7) -> t is a list with 24 time slices and n document probabilities for topic 10
+### PlotStatsDocs(stats.ttest_ind,1,"Student's T test")
+### plots the p-values using the t-test for all topics, comparing the first slice with all others
+### PlotStatsDocs(stats.ttest_ind,1,"Student's T test",False)
+### plots the p-values using the t-test for all topics, comparing consecutive time slices from the first one
+### PlotStatsDocs(stats.ttest_ind,1,"Student's T test",False,0,[3,4,5])
+### The same as above, but it plots only topics 3, 4, and 5.
+
+import numpy as np
+from scipy import stats
+import matplotlib.pyplot as pl
+
+def main():
+ global docTop
+ global colors
+ global vslices
+ docTop={}
+ vslices=['Jan13','Feb13','Mar13','Apr13','May13','Jun13','Jul13','Aug13','Sep13','Oct13','Nov13','Dec13',
+ 'Jan14','Feb14','Mar14','Apr14','May14','Jun14','Jul14','Aug14','Sep14','Oct14','Nov14','Dec14']
+
+ dates=['2013-01','2013-02','2013-03','2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12',
+ '2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12']
+ #
+ colors=['yellow', 'orange', 'pink','black','blue','brown','coral','crimson','cyan','darkblue',
+ 'darkgreen','fuchsia','gold','green','grey','indigo','red','yellowgreen','navy','olive',
+ 'azure','orchid','beige','plum','purple','lavender','salmon','silver','violet','aqua','magenta']
+
+ docTop=buildDict(dates,'-doctopicdist.txt','topics/DocDistrib/')
+    plotDocvsTop(.5) ## plot the number of documents per topic per time slice
+    ## plot the p-value of Bartlett's test for topics 3, 5, 7, 9, 11,
+    ## comparing consecutive time slices
+ PlotStatsDocs(stats.bartlett,1,"Bartlett's test",False,.05,[3,5,7,9,11])
+
+### build a dictionary with all probabilities and documents
+### docTop size is 24 (24 slices of time)
+### doctop[n] is 20 (20 topics in slice n)
+### doctop[n][t] is the probabilities of documents to be associated to
+### topic t in slice n
+def buildDict(dates,filesuf,fileprex=''):
+ #### topics vs docs
+ docT={}
+ for i in range(len(dates)):
+ file=fileprex+dates[i]+filesuf
+ lines=[line.split() for line in open(file)]
+ matprob=[]
+ for j in range(len(lines)):
+ probs=[float(n) for n in lines[j]]
+ matprob.append(probs[1:])
+ matprob=np.array(matprob)
+ docT[i]=matprob
+ return docT
+### Build the doc probabilities by topic
+
+## plot the number of documents per topic in each slice of time
+## thr is the probability threshold
+def plotDocvsTop(thr=0.01):
+ plist=[]
+ for i in range(len(docTop[0])):
+ p=[]
+ for j in range(len(docTop)):
+ p.append(len([i for i in docTop[j][i] if i > thr]))
+ plist.append(pl.plot(p))
+ ltup=()
+ ctup=()
+ for i in range(len(plist)):
+ ntup=(plist[i][0],)
+ ltup=ltup+ntup
+ ntup=('T'+str(i),)
+ ctup=ctup+ntup
+ pl.legend(ltup,ctup)
+ pl.ylabel('# of documents')
+ pl.xticks(np.arange(0,24),np.arange(1,24))
+ pl.grid(True)
+ pl.title('Documents per topics\n Threshold: '+str(thr))
+ pl.show()
+
+
+### build a list of the document probabilities for a given topic (tId)
+### and a given threshold (thres)
+### the list is NxP, where N is the slice number and P the document
+### probabilities for tId in that slice (P is not fixed across slices)
+def buildDocTopic(tId,thres=0.001):
+ tn=[] ## stores the probabilities
+ for i in range(24):
+ tn.append([i for i in docTop[i][tId] if i > thres])
+ return tn
+
+def oneVsallDocsPlot(fct,start,data,tp,label,**args):
+ Xlabels=[]
+ start=start-1
+ for i in range(start+1,len(vslices)):
+ Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels
+
+ pval=[]
+ for i in range(1,len(Xlabels)+1):
+ if not args:
+ t,p=fct(data[start],data[start+i])
+ else:
+ key=list(args)[0]
+ val=args.get(key)
+ args={key:val}
+ t,p=fct(data[start],data[start+i],**args)
+ pval.append(p)
+ pl.plot(pval)
+ pl.xticks(np.arange(0,len(Xlabels)),Xlabels,rotation=90)
+ pl.title(label+"\nTopic: "+str(tp))
+ pl.show()
+
+
+
+### Returns the p-value for fct (stats.), comparing (1 is the first) to all following time slices
+### for the document probabilities in a topic defined by data
+### args is used as parameters for fct (if it is necessary)
+def oneVsallDocs(fct,start,data,**args):
+ Xlabels=[]
+ start=start-1
+ for i in range(start+1,len(vslices)):
+ Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels
+
+ pval=[]
+ for i in range(1,len(Xlabels)+1):
+ if not args:
+ t,p=fct(data[start],data[start+i])
+ else:
+ key=list(args)[0]
+ val=args.get(key)
+ args={key:val}
+ t,p=fct(data[start],data[start+i],**args)
+ pval.append(p)
+ return pval,Xlabels
+
+### Returns the p-value for fct (stats.), from (1 is the first) to all following time slices
+### comparing two slices in a row: jan13xfev13, fev13xmar13, and so on
+### for the document probabilities in a topic defined by data
+### args is used as parameters for fct (if it is necessary)
+def oneVsoneDocs(fct,start,data,**args):
+ Xlabels=[]
+ start=start-1
+ for i in range(start,len(vslices)-1):
+ Xlabels.append(vslices[i]+'x'+vslices[i+1]) ### labels
+
+ pval=[]
+ for i in range(start,len(Xlabels)+start):
+ if not args:
+ t,p=fct(data[i],data[i+1])
+ else:
+ key=list(args)[0]
+ val=args.get(key)
+ args={key:val}
+ t,p=fct(data[i],data[i+1],**args)
+ pval.append(p)
+ return pval,Xlabels
+
+### Plots all topics in slices starting from "start" (1 is the first), with the p-value
+### calculated by fct. label is the name of the statistical function to
+### appear in the title, topics is a list of topics to be plotted (None=all)
+### args is passed on to fct
+def PlotStatsDocs(fct,start,label,oneVsall=True,th=0,topics=None,**args):
+ t=[]
+ eTop=[]
+ if topics is None:
+ for i in range(20):
+ eTop.append(i)
+ else:
+ eTop=topics
+ for i in eTop:
+ t.append(buildDocTopic(i,th))
+ #
+ plist=[]
+ for i in range(len(t)):
+ if not args:
+ if oneVsall:
+ pv,xl=oneVsallDocs(fct,start,t[i])
+ else:
+ pv,xl=oneVsoneDocs(fct,start,t[i])
+ else:
+ if oneVsall:
+ pv,xl=oneVsallDocs(fct,start,t[i],**args)
+ else:
+ pv,xl=oneVsoneDocs(fct,start,t[i],**args)
+ plist.append(pl.plot(pv,color=colors[i]))
+ #
+ ltup=()
+ ctup=()
+ j=0
+ for i in range(len(plist)):
+ ntup=(plist[i][0],)
+ ltup=ltup+ntup
+ ntup=('T'+str(eTop[j]),)
+ j=j+1
+ ctup=ctup+ntup
+ #
+ pl.legend(ltup,ctup)
+ pl.ylabel('p-value')
+ pl.grid(True)
+ pl.xticks(np.arange(0,len(xl)),xl,rotation=90)
+ pl.title(label+'\nDocuments in Topics - Threshold: '+str(th))
+ pl.show()
+
+if __name__ == '__main__':
+ main()
diff --git a/plotFunctionsUser.py b/plotFunctionsUser.py
new file mode 100644
index 0000000..677ac7e
--- /dev/null
+++ b/plotFunctionsUser.py
@@ -0,0 +1,256 @@
+### How to use it:
+### The dictionary userTop must be populated: userTop=buildDict(dates,'-topicuserdist.txt','topics/')
+### in the example, dates are the dates of the files, '-topicuserdist.txt' is the suffix and 'topics/' the folder (optional)
+### Available functions:
+### buildTopic(tpId, threshold) builds a list SxU (S: # of time slices, U: # of users)
+### with the probabilities associated to users
+### plotUservsTop(threshold) plots the number of users by topic in all time slices
+### oneVsone(statistic function, first time slice, data returned by buildTopic, arguments for statistic function)
+### returns a list of p-values of the samples (slice_i x slice_i+1, slice_i+1 x slice_i+2 ....)
+### and a list for labeling the X axis
+###
+### oneVsall(statistic function, first time slice, data returned by buildTopic, arguments for statistic function)
+### returns a list of p-values of the samples (slice_first x slice_i+1, slice_first x slice_i+2 ....)
+### and a list for labeling the X axis
+### PlotStats(statistic function, start, label of the graph, call oneVsall (True) or oneVsone (False), threshold, list of topics, arguments for function)
+### plots a line graph with the p-values of the samples being compared
+### calls buildTopic, and oneVsall or oneVsone, for every topic and plots the result
+### Examples
+### t=buildTopic(10,.7) -> t is a list with 24 time slices and n user probabilities for topic 10
+### PlotStats(stats.ttest_ind,1,"Student's T test")
+### plots the p-values using the t-test for all topics, comparing the first slice with all others
+### PlotStats(stats.ttest_ind,1,"Student's T test",False)
+### plots the p-values using the t-test for all topics, comparing consecutive time slices from the first one
+### PlotStats(stats.ttest_ind,1,"Student's T test",False,0,[3,4,5])
+### The same as above, but it plots only topics 3, 4, and 5.
+import numpy as np
+from scipy import stats
+
+import matplotlib.pyplot as pl
+
+def main():
+ global userTop
+ global colors
+ global vslices
+ userTop={}
+ vslices=['Jan13','Feb13','Mar13','Apr13','May13','Jun13','Jul13','Aug13','Sep13','Oct13','Nov13','Dec13',
+ 'Jan14','Feb14','Mar14','Apr14','May14','Jun14','Jul14','Aug14','Sep14','Oct14','Nov14','Dec14']
+ #
+ dates=['2013-01','2013-02','2013-03','2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12',
+ '2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12']
+ #
+ colors=['yellow', 'orange', 'pink','black','blue','brown','coral','crimson','cyan','darkblue',
+ 'darkgreen','fuchsia','gold','green','grey','indigo','red','yellowgreen','navy','olive',
+ 'azure','orchid','beige','plum','purple','lavender','salmon','silver','violet','aqua','magenta']
+
+ userTop=buildDict(dates,'-topicuserdist.txt','topics/')
+    plotUservsTop(.5) ## plot the number of users per topic as time goes by
+    ## plot the p-value of Bartlett's test for all topics,
+    ## comparing consecutive time slices
+ PlotStats(stats.bartlett,1,"Bartlett's test",False)
+
+
+
+### Create a dict "userTop" with all the probabilities
+### of a user to be associated to topics in 24 slices of time
+### userTop[0] refers to the first time slice
+#### userTop[0][0] refers to users probabilities in the first topic in time slice 0
+####
+def buildDict(dates,filesuf,fileprex=''):
+ userT={}
+ for i in range(len(dates)):
+ file=fileprex+dates[i]+filesuf
+ lines=[line.split() for line in open(file)]
+ matprob=[]
+ for j in range(len(lines)):
+ probs=[float(n) for n in lines[j]]
+ matprob.append(probs)
+ matprob=np.array(matprob)
+ userT[i]=matprob
+ return userT
+###
+
+####################### USERS and TOPICS
+def plotUservsTop(trh=0.001):
+ uvst=[]
+ for i in range(20):
+ t=[]
+ for j in range(len(userTop)):
+ t.append(np.count_nonzero(userTop[j][i]>trh))
+ uvst.append(t)
+ plist=[]
+ for topId in range(20):
+ plist.append(pl.plot(uvst[topId],color=colors[topId]))
+ ltup=()
+ ctup=()
+ for i in range(20):
+ ntup=(plist[i][0],)
+ ltup=ltup+ntup
+ ntup=('T'+str(i),)
+ ctup=ctup+ntup
+ pl.legend(ltup,ctup)
+ pl.xticks(np.arange(0,len(userTop)),np.arange(1,len(userTop)+1))
+ pl.ylabel('# of Users')
+ pl.xlabel('Slices of time')
+ pl.grid(True)
+ pl.title('Threshold '+str(trh))
+ pl.show()
+
+
+### Extract the user probabilities of the given topic tId for all time slices (24)
+def buildTopic(tId,thres=0.001):
+ tn=[] ## stores the probabilities
+ for i in range(24):
+ tn.append(userTop[i][tId,userTop[i][tId]>=thres])
+ return tn
+
+### given a statistic function (fct), an initial time slice, a dataset, the label corresponding to fct, and a topic tp,
+### plot the p-value comparing the start slice to all later ones.
+def oneVsallPlot(fct,start,data,label,tp,**args):
+ Xlabels=[]
+ start=start-1
+ for i in range(start+1,len(vslices)):
+ Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels
+
+ pval=[]
+ for i in range(1,len(Xlabels)+1):
+ if not args:
+ t,p=fct(data[start],data[start+i])
+ else:
+ t,p=fct(data[start],data[start+i],alternative='two-sided')
+ pval.append(p)
+ pl.plot(pval)
+ pl.xticks(np.arange(0,len(Xlabels)),Xlabels,rotation=90)
+ pl.title(label+"\nTopic: "+str(tp))
+ pl.show()
+
+def oneVsall(fct,start,data,**args):
+ Xlabels=[]
+ start=start-1
+ for i in range(start+1,len(vslices)):
+ Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels
+
+ pval=[]
+ for i in range(1,len(Xlabels)+1):
+ if not args:
+ t,p=fct(data[start],data[start+i])
+ else:
+ key=list(args)[0]
+ val=args.get(key)
+ args={key:val}
+ t,p=fct(data[start],data[start+i],**args)
+ pval.append(p)
+ return pval,Xlabels
+
+def oneVsone(fct,start,data,**args):
+ Xlabels=[]
+ start=start-1
+ for i in range(start,len(vslices)-1):
+ Xlabels.append(vslices[i]+'x'+vslices[i+1]) ### labels
+
+ pval=[]
+ for i in range(start,len(Xlabels)+start):
+ if not args:
+ t,p=fct(data[i],data[i+1])
+ else:
+ key=list(args)[0]
+ val=args.get(key)
+ args={key:val}
+ t,p=fct(data[i],data[i+1],**args)
+ pval.append(p)
+ return pval,Xlabels
+
+
+def PlotStats(fct,start,label,oneXall=True,th=0,topics=None,**args):
+ t=[]
+ eTop=[]
+ if topics is None:
+ for i in range(20):
+ eTop.append(i)
+ else:
+ eTop=topics
+ for i in eTop:
+ t.append(buildTopic(i,th))
+ #
+ plist=[]
+ for i in range(len(t)):
+ if not args:
+ if oneXall:
+ pv,xl=oneVsall(fct,start,t[i])
+ else:
+ pv,xl=oneVsone(fct,start,t[i])
+ else:
+            if oneXall:
+ pv,xl=oneVsall(fct,start,t[i],**args)
+ else:
+ pv,xl=oneVsone(fct,start,t[i],**args)
+ plist.append(pl.plot(pv,color=colors[i]))
+ #
+ ltup=()
+ ctup=()
+ j=0
+ for i in range(len(plist)):
+ ntup=(plist[i][0],)
+ ltup=ltup+ntup
+ ntup=('T'+str(eTop[j]),)
+ j=j+1
+ ctup=ctup+ntup
+ #
+ pl.legend(ltup,ctup)
+ pl.ylabel('p-value')
+ pl.grid(True)
+ pl.xticks(np.arange(0,len(xl)),xl,rotation=90)
+ pl.title(label+'\nUsers in Topics - Threshold: '+str(th))
+ pl.show()
+
+
+if __name__ == '__main__':
+ main()
+
+### Paired test must be done in samples having the same shape
+
+######## KRUSKAL
+## The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal.
+## It is a non-parametric version of ANOVA. The test works on 2 or more independent samples, which may have different sizes.
+## Note that rejecting the null hypothesis does not indicate which of the groups differs.
+## Post-hoc comparisons between groups are required to determine which groups are different.
+#ToneVsall(stats.kruskal,15,'Kruskal')
+#ToneVsall(stats.kruskal,15,'Kruskal',.5,[9,11,13,15]) ## probability >=.5 and topics 9,11,13,15
+
+
+##### STUDENT'S T TEST
+## Calculates the T-test for the means of two independent samples of scores.
+## This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values.
+## This test assumes that the populations have identical variances by default.
+#ToneVsall(stats.ttest_ind,15,"Student's t test")
+
+##### MANN-WHITNEY U
+## The Mann-Whitney U test is a nonparametric test that allows two groups or conditions or treatments to be
+## compared without making the assumption that values are normally distributed.
+## So, for example, one might compare the speed at which two different groups of people can run 100 metres,
+## where one group has trained for six weeks and the other has not.
+#ToneVsall(stats.mannwhitneyu,15,'Mann Whitney',alternative='two-sided')
+#ToneVsall(stats.mannwhitneyu,15,'Mann Whitney',.5,[9,11,13,15],alternative='two-sided')
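+## A rough equivalent through the PlotStats function defined above (a sketch; the threshold and
+## topic list are illustrative values):
+#PlotStats(stats.mannwhitneyu,1,'Mann Whitney',False,.5,[9,11,13,15],alternative='two-sided')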
+
+##### KOLMOGOROV-SMIRNOV
+## The Kolmogorov-Smirnov test (KS-test) tries to determine if two datasets differ significantly.
+## The KS-test has the advantage of making no assumption about the distribution of data. (Technically speaking it is non-parametric and distribution free.)
+## Note however, that this generality comes at some cost: other tests (for example Student's t-test) may be more sensitive if the
+## data meet the requirements of the test.
+#ToneVsall(stats.ks_2samp,15,'Kolmogorov-Smirnov')
+
+
+
+### ToneVsall(stats.wilcoxon,15,'Wilcoxon') Not for samples with different sizes
+### ToneVsall(stats.ttest_rel,15,"T test on Two Related") Not for samples with different sizes
+
+### BARTLETT
+## Bartlett’s test tests the null hypothesis that all input samples are from populations with equal variances
+#ToneVsall(stats.bartlett,15,"Bartlett's test")
+
+### LEVENE
+## Perform Levene test for equal variances.
+## The Levene test tests the null hypothesis that all input samples are from populations with equal variances.
+## Levene’s test is an alternative to Bartlett’s test bartlett in the case where there are significant deviations from normality.
+#ToneVsall(stats.levene,15,"Levene's test")
diff --git a/prepareQuarter.sh b/prepareQuarter.sh
new file mode 100755
index 0000000..b316e3a
--- /dev/null
+++ b/prepareQuarter.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+echo "This script will prepare the stackoverflow data with time slices of one quarter"
+
+
+
+# Data must already be downloaded, and the folder rawdata must exist (as well as 2013.Posts.xml and 2014.Posts.xml)
+cd rawdata
+
+echo "splitting files 2013 in quarters"
+M20131st=`awk '/CreationDate=\"2013-01/ {print NR; exit}' 2013.Posts.xml`
+M20132nd=`awk '/CreationDate=\"2013-04/ {print NR; exit}' 2013.Posts.xml`
+LAST=`expr $M20132nd - 1`
+awk 'NR=='$M20131st', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-1stQ.Posts.xml
+echo "2013-1st quarter done"
+
+
+M20133rd=`awk '/CreationDate=\"2013-07/ {print NR; exit}' 2013.Posts.xml`
+LAST=`expr $M20133rd - 1`
+awk 'NR=='$M20132nd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-2ndQ.Posts.xml
+echo "2013-2nd Quarter done"
+
+M20134th=`awk '/CreationDate=\"2013-10/ {print NR; exit}' 2013.Posts.xml`
+LAST=`expr $M20134th - 1`
+awk 'NR=='$M20133rd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-3rdQ.Posts.xml
+echo "2013-3rd Quarter done"
+
+awk 'NR>='$M20134th 2013.Posts.xml > 2013-4thQ.Posts.xml
+echo "2013-4th Quarter done"
+
+####
+echo "splitting files 2014 in quarters"
+M20141st=`awk '/CreationDate=\"2014-01/ {print NR; exit}' 2014.Posts.xml`
+M20142nd=`awk '/CreationDate=\"2014-04/ {print NR; exit}' 2014.Posts.xml`
+LAST=`expr $M20142nd - 1`
+awk 'NR=='$M20141st', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-1stQ.Posts.xml
+echo "2014-1st quarter done"
+
+
+M20143rd=`awk '/CreationDate=\"2014-07/ {print NR; exit}' 2014.Posts.xml`
+LAST=`expr $M20143rd - 1`
+awk 'NR=='$M20142nd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-2ndQ.Posts.xml
+echo "2014-2nd Quarter done"
+
+M20144th=`awk '/CreationDate=\"2014-10/ {print NR; exit}' 2014.Posts.xml`
+LAST=`expr $M20144th - 1`
+awk 'NR=='$M20143rd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-3rdQ.Posts.xml
+echo "2014-3rd Quarter done"
+
+awk 'NR>='$M20144th 2014.Posts.xml > 2014-4thQ.Posts.xml
+echo "2014-4th Quarter done"
+
+cd ..