diff --git a/Maintenance.txt b/Maintenance.txt
new file mode 100644
index 0000000..8f4e54d
--- /dev/null
+++ b/Maintenance.txt
@@ -0,0 +1,69 @@
+Changes made in the programs:
+
+-- Several fixes for Python 2 / Python 3 incompatibilities
+
+Script:
+-- Fixed the year problem (checked); already pulled into GitHub
+-- Fixed the folder problem (added ldamodels)
+
+-- Created a new script prepareQuarter.sh (splits the xml files into quarters instead of months)
+
+-- The folder users must be created under the topics folder (used by the createUserEvolutionChain function)
+
+TextProcessor.py
+-- Adapted it to Python 3 (cPickle, string methods, stem bug)
+-- Fixed a file name problem (line 54): open("data/" + str(date) + "-titles-users.txt", "r") -- the "s" was missing from the name
+-- Adapted it to treat all of a user's documents as one (per time slice)
+
+---- createDictionariesFromFiles: added the creation of the file date+"-monthly-tokenized_dict-perUser.pdict"
+
+---- createGlobalDictionaryFromMonthly: added a new parameter mergeDocs=False (if False, documents are split by post; otherwise by user)
+------------- false: opens date+"-monthly-tokenized_dict.pdict"
+------------- true: opens date+"-monthly-tokenized_dict-perUser.pdict"
+------------- The output is the same (global dictionary)
+
+---- createMonthCorpuses: added a new parameter mergeDocs=False
+------------- false: input: date+"-monthly-tokenized_dict.pdict" output: date+'-tokenized.mm'
+------------- true: input: date+"-monthly-tokenized_dict-perUser.pdict" output: date+'-tokenizedUser.mm'
+
+---- performTFIDF: added a new parameter mergeDocs=False
+------------- false: input: date+'-tokenized.mm' output: date+'-tfidf.mm' and date+"-tfidf.model"
+------------- true: input: date+'-tokenizedUser.mm' output: date+'-tfidfUser.mm' and date+"-tfidfUser.model"
+
+---- performLDA: added a new parameter mergeDocs=False
+------------- false: input: "models/" + date +"-tfidf.mm" output: the same file
+------------- true: input: "models/" + date +"-tfidfUser.mm" output: the same file
+
+---- calculateEta: added a new parameter mergeDocs=False
+------------- false: input: date+"-monthly-tokenized_dict.pdict" output: the same
+------------- true: input: date+"-monthly-tokenized_dict-perUser.pdict" output: the same
+
+-- lookupLDATopics: added a new parameter mergeDocs=False
+---- false: input: "models/global-tokenized_dict.pdict" output: the same
+---- true: input: "models/global-tokenized_dict-perUser.pdict" output: the same
+
+-- performLDA
+----- Fixed a bug in the call to calculateEta: the "vocabulary size" argument must be set to len(dictionary.keys()) instead of vocabsize
+------------ (the vocabulary produced by the dictionary can be smaller than vocabsize)
+
+UserComparator.py
+-- summarizeTopicsPerUser received a new parameter to handle documents by post or by user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict" output: the same
+----------- the key for tokenized_dict is userid instead of docid
+
+-- lookupTopics received a new parameter to handle documents by post or by user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict" output: the same
+----------- the key for tokenized_dict is userid instead of docid
+
+TopicStats.py
+-- docPerTopic received a new parameter to handle documents by post or by user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
+---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same + +-- countWords received a new parameter to deal with document by post or by User +---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same +---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same + + diff --git a/SOParser.py b/SOParser.py index ebafd9d..3e663e1 100644 --- a/SOParser.py +++ b/SOParser.py @@ -5,14 +5,19 @@ import re, cgi, os, pickle, logging, time logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +import pdb + def main(): minposts = 50 - + quarter=True + years = [2013, 2014] + extractUsers(minposts, years) - extractComments(years) + extractComments(years,quarter) -def extractComments(years): +def extractComments(years,isQuarter=False): + quarters=['1stQ','2ndQ','3rdQ','4thQ'] users = set() usersFile = open('rawdata/userposts.txt', 'r') for userline in usersFile: @@ -21,17 +26,36 @@ def extractComments(years): usersFile.close() for year in years: - print "Parsing year: " + str(year) - months = range(1,13) + print ("Parsing year: " + str(year)) + if not isQuarter: + months = range(1,13) + else: + months = range(1,5) ## 4 quarters in a year + #### for month in months: start = time.time() - yearmonth = str(year) + "-" + str(month).zfill(2) + if not isQuarter: + strmonth=str(month).zfill(2) + else: + strmonth=quarters[month-1] + ##### + yearmonth = str(year) + "-" + strmonth print(yearmonth) + ####### + ## Dealing with qaurter instead of months vvvvv + ####### if month == 1: - lastmonth = str(year-1) + "-12" + if not isQuarter: + lastmonth = str(year-1) + "-12" + else: + lastmonth = str(year-1) + '-' + quarters[-1] else: - lastmonth = str(year) + "-" + str(month-1).zfill(2) + if not isQuarter: + lastmonth = str(year) + "-" + str(month-1).zfill(2) + else: + lastmonth = str(year) + "-" + quarters[month-2] + ### lastmonthsquestiontitlesfile = "data/" + lastmonth + "-questiontitles.dict" lastmonthsquestiontagsfile = "data/" + lastmonth + "-questiontags.dict" if os.path.isfile(lastmonthsquestiontitlesfile): @@ -39,17 +63,19 @@ def extractComments(years): logging.info('loading tag dictionary: %s', lastmonthsquestiontagsfile) questiontitles = {} questiontags = {} - with open(lastmonthsquestiontitlesfile, 'r') as f: + with open(lastmonthsquestiontitlesfile, 'rb') as f: ## add b questiontitles = pickle.load(f) logging.info("Elements in questiontitles: %s", len(questiontitles)) - with open(lastmonthsquestiontagsfile, 'r') as f: + with open(lastmonthsquestiontagsfile, 'rb') as f: ## add b questiontags = pickle.load(f) logging.info("Elements in questiontags: %s", len(questiontags)) else: logging.info("creating new dictionaries") questiontitles = {} questiontags = {} - + ####### + ## ^^^^^ End + ####### monthusers = set() parsedpostsfile = open("data/"+ yearmonth + "-titles-tags-text.tsv","a") rawpostsfile = open("rawdata/" + yearmonth + ".Posts.xml", 'r') @@ -67,19 +93,31 @@ def extractComments(years): creationDate = doc.get('CreationDate') postTypeId = doc.get('PostTypeId') score = doc.get('Score') - text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','') + #text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','') + text = doc.get('Body').replace('\r\n','').replace('\n','') tagremove = re.compile(r'(|<[^>]*>)') text = cgi.escape(tagremove.sub('', re.sub('[^>]+', '', text))) parent = doc.get('ParentId') if 'Title' in doc.keys(): - title = 
doc.get('Title').encode('utf8') + #title = doc.get('Title').encode('utf8') + title = doc.get('Title') + if type(title) is bytes: + print('>>>>>>>> Byte') + title=title.decode('utf8') else: title = '' if 'Tags' in doc.keys(): - tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","") + #tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","") + tags = doc.get('Tags').replace("><", ",").replace("<","").replace(">","") + if type(tags) is bytes: + print('>>>>>>>> Byte') + tags=tags.decode('utf8') else: tags = '' + #### + ##pdb.set_trace() + #### if postTypeId == "1": questiontags[rowID] = tags questiontitles[rowID] = title @@ -94,11 +132,12 @@ def extractComments(years): parsedpostsfile.close() rawpostsfile.close() - with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f: + #pdb.set_trace() + with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f: f.write("\n".join(monthusers)) - with open("data/" + yearmonth + "-questiontitles.dict", 'w') as f: + with open("data/" + yearmonth + "-questiontitles.dict", 'wb') as f: ## add b (binary mode) pickle.dump(questiontitles, f, pickle.HIGHEST_PROTOCOL) - with open("data/" + yearmonth + "-questiontags.dict", 'w') as f: + with open("data/" + yearmonth + "-questiontags.dict", 'wb') as f: ## add b (binary mode) pickle.dump(questiontags, f, pickle.HIGHEST_PROTOCOL) end = time.time() - start logging.info("Elapsed time (s): %s", end) @@ -108,7 +147,7 @@ def extractComments(years): def extractUsers(minPostCount, years): users = {} for year in years: - print "Parsing year: " +str(year) + print ("Parsing year: " +str(year)) posts = open("rawdata/"+str(year)+".Posts.xml", 'r') for post in posts: post = post.rstrip('\n') diff --git a/TextProcessor.py b/TextProcessor.py index 4f4e91a..cf5fc7e 100644 --- a/TextProcessor.py +++ b/TextProcessor.py @@ -1,36 +1,45 @@ from __future__ import print_function from gensim import corpora, models from gensim.parsing.preprocessing import STOPWORDS -import logging, re, numpy, cPickle +import logging, re, numpy +import _pickle as cPickle ## Python 3 does not have cPickle logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) from nltk.stem.porter import PorterStemmer from nltk.tokenize import word_tokenize - +import pdb def main(): """Main entry.""" global priorweight - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] - dates = ['2013-01', '2013-02', '2013-03'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] numtopics = 40 vocabsize = 2000 priorweight = 0.05 - workers = 3 - # filterUsers(dates) + workers = 1 + + merge=True + ######## + filterUsers(dates) createDictionariesFromFiles(dates) - createGlobalDictionaryFromMonthly(dates, vocabsize) - createMonthCorpuses(dates) - # - performTFIDF(dates) - performLDA(dates, 
numtopics, vocabsize, workers) - # lookupTopics(dates) + createGlobalDictionaryFromMonthly(dates, vocabsize,merge) + createMonthCorpuses(dates,merge) + + performTFIDF(dates,merge) + ####### + performLDA(dates, numtopics, vocabsize, workers,merge) + ####### + #lookupTopics(dates) + #lookatdist(dates[1]) + ####### def lookatdist(date): @@ -45,7 +54,7 @@ def filterUsers(dates): users = set() for date in dates: musers = set() - for line in open("data/" + str(date) + "-title-users.txt", "r"): + for line in open("data/" + str(date) + "-titles-users.txt", "r"): #"-title-users.txt", "r"): musers.add(line.strip("\n")) if len(users) == 0: users = musers @@ -64,8 +73,12 @@ def readFile(date): original_sentences[id] = text return original_sentences -def lookupLDATopics(date, docIDs, numTopics): - tokenized_dictfile = "models/global-tokenized_dict.pdict" +def lookupLDATopics(date, docIDs, numTopics, mergeDocs=False): + if not mergeDocs: + tokenized_dictfile="models/global-tokenized_dict.pdict" + else: + tokenized_dictfile="models/global-tokenized_dict-perUser.pdict" + ##### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") @@ -77,9 +90,13 @@ def lookupLDATopics(date, docIDs, numTopics): topics_by_value = sorted(topics, key=lambda tup: tup[1], reverse=True) return topics_by_value[:numTopics] -def calculateEta(dates, date, numtopics, vocabsize): +def calculateEta(dates, date, numtopics, vocabsize,mergeDocs=False): priordate = dates[dates.index(date) - 1] - tokenized_dictfile = "models/"+priordate+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + suffix="-monthly-tokenized_dict.pdict" + else: + suffix="-monthly-tokenized_dict-perUser.pdict" + tokenized_dictfile = "models/"+priordate+suffix with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") @@ -132,22 +149,36 @@ def calculateEta2(dates, date, numtopics, vocabsize, minpriorvalue): eta[topicid][index] = value return eta -def performTFIDF(dates): +def performTFIDF(dates, mergeDocs=False): for date in dates: - corpus = corpora.MmCorpus("models/" + date + "-tokenized.mm") + if not mergeDocs: + suffix_tok="-tokenized.mm" + suffix_tfidf_model="-tfidf.model" + suffix_tfidf_corpus="-tfidf.mm" + else: + suffix_tok="-tokenizedUser.mm" + suffix_tfidf_model="-tfidfUser.model" + suffix_tfidf_corpus="-tfidfUser.mm" + + corpus = corpora.MmCorpus("models/" + date + suffix_tok) tfidf = models.TfidfModel(corpus) - tfidf.save("models/"+date+"-tfidf.model") + tfidf.save("models/"+date+ suffix_tfidf_model) tfidf_corpus = tfidf[corpus] - corpora.MmCorpus.save_corpus("models/"+date+"-tfidf.mm", tfidf_corpus) + corpora.MmCorpus.save_corpus("models/"+date+ suffix_tfidf_corpus, tfidf_corpus) -def performLDA(dates, numtopics, vocabsize, workers): +def performLDA(dates, numtopics, vocabsize, workers,mergeDocs=False): for date in dates: + if not mergeDocs: + suffix="-tfidf.mm" + else: + suffix="-tfidfUser.mm" print("performing lda on " + str(date)) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") - corpus = corpora.MmCorpus("models/" + date + "-tfidf.mm") + corpus = corpora.MmCorpus("models/" + date + suffix) if date != dates[0] and priorweight != 0: logging.info("Calculating eta based on prior month") - eta = calculateEta(dates, date, numtopics, vocabsize) + eta = calculateEta(dates, date, numtopics, len(dictionary.keys()),mergeDocs) ## vocabsize -> len(dictionary.keys()) 
SAFER! + # pdb.set_trace() lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=numtopics, workers=workers, eta=eta) else: logging.info("Eta weighting factor too low or no prior months") @@ -159,18 +190,25 @@ def performLDA(dates, numtopics, vocabsize, workers): def tokenizeandstemline(text): stoplist = STOPWORDS stemmer = PorterStemmer() - tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text.decode('utf-8'), language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None] + ### Python 3 does not have str.decode, and the method PorterStemmer.stem() has a bug + #tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text.decode('utf-8'), language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None] + tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text, language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None] + ### return tokenized_line def writecpicklefile(content, filename): with open(filename, 'wb') as f: - cPickle.dump(content, f, cPickle.HIGHEST_PROTOCOL) + cPickle.dump(content, f, -1) #cPickle.HIGHEST_PROTOCOL) ## Python 3 does not have the macro HIGHEST_PROTOCOL -def createGlobalDictionaryFromMonthly(dates, vocabsize): +def createGlobalDictionaryFromMonthly(dates, vocabsize, mergeDocs=False): global_tokenized_dict = {} for date in dates: - monthly_tokenized_dictfile = "models/" + date + "-monthly-tokenized_dict.pdict" + if not mergeDocs: + suffix="-monthly-tokenized_dict.pdict" + else: + suffix="-monthly-tokenized_dict-perUser.pdict" + monthly_tokenized_dictfile = "models/" + date + suffix with open(monthly_tokenized_dictfile, 'rb') as f: logging.info("Opening file %s", monthly_tokenized_dictfile) global_tokenized_dict = merge_two_dicts(cPickle.load(f), global_tokenized_dict) @@ -192,6 +230,9 @@ def createDictionariesFromFiles(dates): for date in dates: print("parsing month: " + date) monthly_tokenized_dict = {} + #### + monthly_tokenized_byUser = {} + #### monthly_original_dict = {} docids = {} for line in open("data/" + date + "-titles-tags-text.tsv"): @@ -199,25 +240,44 @@ def createDictionariesFromFiles(dates): docids[id] = (userid, score) text = title + " " + tags + " " + text tokenized_line = tokenizeandstemline(text) - monthly_tokenized_dict[id] = tokenized_line + monthly_tokenized_dict[id] = tokenized_line.copy() monthly_original_dict[id] = text + #### merge all user's documents + if userid in monthly_tokenized_byUser: + monthly_tokenized_byUser[userid].extend(tokenized_line.copy()) + else: + monthly_tokenized_byUser[userid]=tokenized_line.copy() + #### + ### pdb.set_trace() ## just in case :) monthly_docids_dictfile = "models/"+date+"-docids.pdict" writecpicklefile(docids, monthly_docids_dictfile) monthly_tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" writecpicklefile(monthly_tokenized_dict, monthly_tokenized_dictfile) + #### + monthly_tokenized_dictfile_perUser = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + writecpicklefile(monthly_tokenized_byUser, monthly_tokenized_dictfile_perUser) + #### monthly_original_dictfile = "models/"+date+"-monthly-original_dict.pdict" writecpicklefile(monthly_original_dict, monthly_original_dictfile) -def createMonthCorpuses(dates): +def createMonthCorpuses(dates,mergeDocs=False): for date in dates: logging.info("Parsing date: %s", date) print("parsing month: " + date) - monthly_dict_file = "models/" + date 
+ "-monthly-tokenized_dict.pdict" + if not mergeDocs: + suffix_source="-monthly-tokenized_dict.pdict" + suffix_target='-tokenized.mm' + else: + suffix_source="-monthly-tokenized_dict-perUser.pdict" + suffix_target='-tokenizedUser.mm' + + monthly_dict_file = "models/" + date + suffix_source with open(monthly_dict_file, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load('models/global-dictionary.dict') corpus = [dictionary.doc2bow(sentence) for sentence in tokenized_dict.values()] - corpora.MmCorpus.serialize('models/' + date + '-tokenized.mm', corpus) + corpora.MmCorpus.serialize('models/' + date + suffix_target, corpus) + if __name__ == '__main__': main() diff --git a/TopicComparator.py b/TopicComparator.py index 81a207f..b57c897 100644 --- a/TopicComparator.py +++ b/TopicComparator.py @@ -4,13 +4,16 @@ from numpy.linalg import norm from numpy import array - +import pdb def main(): global numtopics, vocabsize - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] - # dates = ['2013-02', '2013-03'] #, '2013-03', '2013-03'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] + #dates = ['2013-01','2013-02', '2013-03'] #, '2013-03', '2013-03'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] + numtopics = 40 vocabsize = 2000 # compareMonths(dates) @@ -21,7 +24,7 @@ def main(): def compareMonths(dates): i = 1 for month in dates: - print month + print (month) nextmonth = dates[i] TVDBasedSimilarity(month, nextmonth) KLDBasedSimilarity(month, nextmonth) @@ -110,7 +113,9 @@ def printTopicWords(dates): lda = models.LdaModel.load("ldamodels/" + month + "-lda.model") topicfile = open("topics/"+month+"-topicwords.txt", "w") ldalist = lda.show_topics(num_topics=numtopics, num_words=10, log=False, formatted=False) - wordlists = { topic[0]: [wordvals[0].encode('utf-8') for wordvals in topic[1]] for topic in ldalist} + #pdb.set_trace() + # wordlists = { topic[0]: [wordvals[0].encode('utf-8') for wordvals in topic[1]] for topic in ldalist} ## delete encode('utf-8') + wordlists = { topic[0]: [wordvals[0] for wordvals in topic[1]] for topic in ldalist} for topic in wordlists.keys(): line = str(topic) + "\t" + " ".join(wordlists[topic]) + "\n" topicfile.write(line) diff --git a/TopicStats.py b/TopicStats.py index 777bbb5..e0a05f8 100644 --- a/TopicStats.py +++ b/TopicStats.py @@ -1,22 +1,26 @@ from gensim import corpora, models -import logging, numpy, cPickle +import logging, numpy #, cPickle +import _pickle as cPickle ## Python 3 does not have cPickle logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) def main(): global topicthreshold - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', 
'2014-12'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] # dates = ['2013-01', '2013-02'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] + topics = 40 topicthreshold = 0.3 + merge=True + countWords(dates, topics,merge) + docPerTopic(dates,merge) - # countWords(dates, topics) - docPerTopic(dates) - -def docPerTopic(dates): +def docPerTopic(dates,mergeDocs): dictionary = corpora.Dictionary.load("models/global-dictionary.dict") doctopics = {} topicfile = open("stats/docpertopic.tsv", 'w') @@ -25,7 +29,11 @@ def docPerTopic(dates): date = str(date) print(date) - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + #### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) @@ -35,8 +43,12 @@ def docPerTopic(dates): for doc in documentfile: [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t") - - sentence = tokenized_dict[docid] + ###### + if not mergeDocs: + sentence = tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] + ####### bow = dictionary.doc2bow(sentence) documenttopics = lda[bow] for (topicid, topicvalue) in documenttopics: @@ -53,7 +65,7 @@ def docPerTopic(dates): doctopics[topicid][date] = 0 doctopics[topicid][date]+=1 - print doctopics + print (doctopics) for topicid in doctopics.keys(): line = str(topicid) for date in doctopics[topicid].keys(): @@ -65,7 +77,7 @@ def docPerTopic(dates): -def countWords(dates, numtopics): +def countWords(dates, numtopics, mergeDocs): wordfile = open("stats/wordcounts.tsv", "w") words = {} #each word counted once per doc totalwords = {} #each word counted n times per n mentions in doc @@ -76,7 +88,11 @@ def countWords(dates, numtopics): words[date] = 0 uniquewords[date] = set() totalwords[date] = 0 - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + ### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") @@ -86,6 +102,7 @@ def countWords(dates, numtopics): logging.info("Parsing date: %s", str(date)) [countedwordtopics.append(0) for i in range(numtopics)] for docID in tokenized_dict.keys(): + # check if it is necessary to choose between docID or UserID (when mergeDoc is True) doc = tokenized_dict[docID] bow = dictionary.doc2bow(doc) wordcount = len(bow) diff --git a/UserComparator.py b/UserComparator.py index 73675f3..3cf34ba 100644 --- a/UserComparator.py +++ b/UserComparator.py @@ -1,30 +1,39 @@ from gensim import corpora, models -import cPickle, numpy, logging, scipy +import numpy, logging, scipy from numpy.linalg import norm from scipy.stats import entropy +import _pickle as cPickle ## Python 3 does not have cPickle + logging.basicConfig(format='%(asctime)s : %(levelname)s : 
%(message)s', level=logging.DEBUG) +import pdb + def main(): global topicthreshold - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', - '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', - '2014-10', '2014-11', '2014-12'] - # dates = ['2013-01', '2013-02', '2013-03'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', + # '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', + # '2014-10', '2014-11', '2014-12'] + #dates = ['2013-01', '2013-02', '2013-03'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] + numtopics = 40 vocabsize = 2000 topicthreshold = 0.3 + merge=True - # summarizeTopicsPerUser(dates) - # compareMonths(dates) - # lookupTopics(dates) + #summarizeTopicsPerUser(dates,merge) + #compareMonths(dates) + #lookupTopics(dates,merge) + ##### createUserEvolutionChain(dates) def compareMonths(dates): i = 1 for month in dates: - print month + print (month) nextmonth = dates[i] # TVDBasedSimilarity(month, nextmonth) KLDBasedSimilarity(month, nextmonth) @@ -83,7 +92,7 @@ def JSD(P, Q): -def summarizeTopicsPerUser(dates): +def summarizeTopicsPerUser(dates,mergeDocs=False): dictionary = corpora.Dictionary.load("models/global-dictionary.dict") usersfile = "data/allusers.txt" users = set(open(usersfile).read().split()) @@ -96,18 +105,25 @@ def summarizeTopicsPerUser(dates): date = str(date) print(date) - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + ### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) documentfile = open("data/" + date + "-titles-tags-text.tsv") lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model") - + ###pdb.set_trace() for doc in documentfile: [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t") document_users[docid] = userid document_scores[docid] = score - sentence = tokenized_dict[docid] + if not mergeDocs: + sentence = tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] bow = dictionary.doc2bow(sentence) documenttopics = lda[bow] for (topicid, topicvalue) in documenttopics: @@ -148,7 +164,7 @@ def writecpicklefile(content, filename): -def lookupTopics(dates): +def lookupTopics(dates,mergeDocs=False): dictionary = corpora.Dictionary.load("models/global-dictionary.dict") document_users = {} document_scores = {} @@ -156,8 +172,12 @@ def lookupTopics(dates): for date in dates: date = str(date) print(date) - - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + #### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) @@ -179,7 +199,10 @@ def lookupTopics(dates): continue document_users[docid] = userid document_scores[docid] = score - sentence = tokenized_dict[docid] + if not mergeDocs: + sentence = 
tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] bow = dictionary.doc2bow(sentence) documenttopics = lda[bow] for (topicid, topicvalue) in documenttopics: @@ -215,6 +238,7 @@ def lookupTopics(dates): # resultline = str(topicid) + "\t" + str(userid) + "\t" + str(meantopicvalue) + "\n" topicfile.write(resultline) topicfile.close() + print('***** End (lookupTopics) ****') def createUserEvolutionChain(dates): topicscores={} @@ -234,6 +258,7 @@ def createUserEvolutionChain(dates): users.add("7585") users.add("12579") + print('***** Begin (createUserEvolution) ****') for date in dates: topicfile = open("topics/" + date + "-topics.txt", 'r') allwords[date] = {} diff --git a/UserStatistics.py b/UserStatistics.py new file mode 100644 index 0000000..0ee96c3 --- /dev/null +++ b/UserStatistics.py @@ -0,0 +1,74 @@ +mergeDocs=False +dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] +topicthreshold = 0.3 +dictionary = corpora.Dictionary.load("models/global-dictionary.dict") +document_users = {} +document_scores = {} +users = set() + +if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" +else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" +#### +with open(tokenized_dictfile, 'rb') as f: + tokenized_dict = cPickle.load(f) + +usertopics = {} +userdoctopics = {} +usertopicscores = {} +documentfile = open("data/" + date + "-titles-tags-text.tsv") +topicfile = open("topics/" + date + "-topics.txt", 'w') +headerline = "UserID\ttopicID\tmeantopicvalue\tnumdocs\tmeantopicscore\ttopicword1\ttopicword2\ttopicword3\ttopicword4\ttopicword5\n" +topicfile.write(headerline) +lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model") + +for doc in documentfile: + [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t") + if date == dates[0]: + users.add(userid) + else: + if userid not in users: + continue + document_users[docid] = userid + document_scores[docid] = score + if not mergeDocs: + sentence = tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] + bow = dictionary.doc2bow(sentence) + documenttopics = lda[bow] + for (topicid, topicvalue) in documenttopics: + if topicvalue >= topicthreshold: + try: + userdoctopics[userid] + except KeyError: + userdoctopics[userid] = {} + userdoctopics[userid][topicid] = [] + usertopicscores[userid] = {} + usertopicscores[userid][topicid] = [] + try: + userdoctopics[userid][topicid] + except KeyError: + userdoctopics[userid][topicid] = [] + usertopicscores[userid][topicid] = [] + userdoctopics[userid][topicid].append(topicvalue) + usertopicscores[userid][topicid].append(int(score)) +for userid in userdoctopics.keys(): + usertopics[userid] = {} + for topicid in userdoctopics[userid].keys(): + meantopicvalue = numpy.mean(userdoctopics[userid][topicid]) + meantopicscore = numpy.mean(usertopicscores[userid][topicid]) + numdocs = len(userdoctopics[userid][topicid]) + if meantopicvalue < topicthreshold: + continue + usertopics[userid][topicid] = meantopicvalue + topicterms = lda.get_topic_terms(topicid, topn=5) + topicwords = "" + for term in topicterms: + topicwords += dictionary.get(term[0]).ljust(15) + "\t" + resultline = str(userid)+"\t"+str(topicid)+"\t"+ str(meantopicvalue) + "\t" + str(numdocs) + "\t" + str(meantopicscore) + "\t" + str(topicwords) + "\n" + # resultline = str(topicid) + "\t" + str(userid) + "\t" + str(meantopicvalue) + "\n" + topicfile.write(resultline) +topicfile.close() 
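+# NOTE: this standalone script mirrors one iteration of UserComparator.lookupTopics();
+# it assumes the same imports (gensim's corpora and models, numpy, _pickle as cPickle)
+# and that `date` is bound to one entry of `dates` before the block above runs.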
+print('***** End (lookupTopics) ****') diff --git a/downloadAndPrepareData.sh b/downloadAndPrepareData.sh index 6b4db83..dab73b2 100755 --- a/downloadAndPrepareData.sh +++ b/downloadAndPrepareData.sh @@ -7,6 +7,8 @@ mkdir data mkdir models mkdir topics mkdir rawdata +# the following folder was missing +mkdir ldamodels cd rawdata if [ ! -f "stackoverflow.com-Posts.7z" ]; then diff --git a/plotFunctionsDoc.py b/plotFunctionsDoc.py new file mode 100644 index 0000000..01c82be --- /dev/null +++ b/plotFunctionsDoc.py @@ -0,0 +1,220 @@ +### How to use it: +### The dictionary userTop must be populated: docTop=buildDict(dates,'-doctopicdist.txt','topics/DocDistrib/') +### in the example dates are the dates of the files, '-doctopicdist.txt' are the suffix and 'topics/DocDistrib/' the folder (optional) +### Available functions: +### buildDocTopic(tpId, threshold) builds a list SxD (S: # of slice time, D: # of documents) +### with the probabilities associated to documents +### plotDocrvsTop (threshold), plots the number of documents by topic in all time slices +### oneVsoneDocs (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_i x slice_i+1, slice_i+1 x slice_i+2 ....) +### and a list for labeling X axis +### +### oneVsallDocs (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_first x slice_i+1, slice_first x slice_i+2 ....) +### and a list for labeling X axis +### PlotStatsDocs(statisc function, start, label of the graph, call oneVsAll (True) or oneVsone (False), threshold, list of topics, arguments for function) +### plots a line graph with p-values of the samples being compared +### calls buildTop, oneVsall or oneVsone for every topic and plots the result +### Examples +### t=buildTopic(10,.7) -> t is a list with 24 slices of time and n document probabilities for topic 10 +### PlotStatsDocs(stats.ttest_ind,1,"Student's T test") +### plots the p-values using t-test for all topics comparing the first slice with all others +### PlotStatsDocs(stats.ttest_ind,1,"Student's T test",False) +### plots the p-values using t-test for all topics comparing the two consective time slices from the fisrt one +### PlotStatsDocs(stats.ttest_ind,1,"Student's T test",False,[3,4,5]) +### The same above but it plots only for topics 3,4, and 5. 
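+### A short worked example, assuming files named <date>-doctopicdist.txt under
+### topics/DocDistrib/ where each row is an id followed by one probability per topic
+### (buildDict drops the first column via probs[1:]; that layout is an assumption):
+###   from scipy import stats
+###   docTop = buildDict(dates, '-doctopicdist.txt', 'topics/DocDistrib/')
+###   t3 = buildDocTopic(3, .5)                     # topic 3, probabilities above 0.5
+###   pv, xl = oneVsoneDocs(stats.bartlett, 1, t3)  # p-values for consecutive slices
+###   PlotStatsDocs(stats.bartlett, 1, "Bartlett's test", False, .05, [3, 5, 7])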
+ +import numpy as np +from scipy import stats +import matplotlib.pyplot as pl + +def main(): + global docTop + global colors + global vslices + docTop={} + vslices=['Jan13','Feb13','Mar13','Apr13','May13','Jun13','Jul13','Aug13','Sep13','Oct13','Nov13','Dec13', + 'Jan14','Feb14','Mar14','Apr14','May14','Jun14','Jul14','Aug14','Sep14','Oct14','Nov14','Dec14'] + + dates=['2013-01','2013-02','2013-03','2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12', + '2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12'] + # + colors=['yellow', 'orange', 'pink','black','blue','brown','coral','crimson','cyan','darkblue', + 'darkgreen','fuchsia','gold','green','grey','indigo','red','yellowgreen','navy','olive', + 'azure','orchid','beige','plum','purple','lavender','salmon','silver','violet','aqua','magenta'] + + docTop=buildDict(dates,'-doctopicdist.txt','topics/DocDistrib/') + plotDocvsTop(.5) ## plot the number of topics per topic per time slice + ## plot the p-value from bartlett test for all topics from the first + ## time slice + PlotStatsDocs(stats.bartlett,1,"Bartlett's test",False,.05,[3,5,7,9,11]) + +### build a dictionary with all probabilities and documents +### docTop size is 24 (24 slices of time) +### doctop[n] is 20 (20 topics in slice n) +### doctop[n][t] is the probabilities of documents to be associated to +### topic t in slice n +def buildDict(dates,filesuf,fileprex=''): + #### topics vs docs + docT={} + for i in range(len(dates)): + file=fileprex+dates[i]+filesuf + lines=[line.split() for line in open(file)] + matprob=[] + for j in range(len(lines)): + probs=[float(n) for n in lines[j]] + matprob.append(probs[1:]) + matprob=np.array(matprob) + docT[i]=matprob + return docT +### Build the doc probabilities by topic + +## plot the number of documents per topic in each slice of time +## thr is the probability threshold +def plotDocvsTop(thr=0.01): + plist=[] + for i in range(len(docTop[0])): + p=[] + for j in range(len(docTop)): + p.append(len([i for i in docTop[j][i] if i > thr])) + plist.append(pl.plot(p)) + ltup=() + ctup=() + for i in range(len(plist)): + ntup=(plist[i][0],) + ltup=ltup+ntup + ntup=('T'+str(i),) + ctup=ctup+ntup + pl.legend(ltup,ctup) + pl.ylabel('# of documents') + pl.xticks(np.arange(0,24),np.arange(1,24)) + pl.grid(True) + pl.title('Documents per topics\n Threshold: '+str(thr)) + pl.show() + + +### build list for the documents probabilities in a given topic (tId) +### and a given threshol (thres) +### the list NxP, where N is the slice number and P the documents +### probabilities for tId in that slice (P is not fixed in the slices) +def buildDocTopic(tId,thres=0.001): + tn=[] ## stores the probabilities + for i in range(24): + tn.append([i for i in docTop[i][tId] if i > thres]) + return tn + +def oneVsallDocsPlot(fct,start,data,tp,label,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[start],data[start+i],**args) + pval.append(p) + pl.plot(pval) + pl.xticks(np.arange(0,len(Xlabels)),Xlabels,rotation=90) + pl.title(label+"\nTopic: "+str(tp)) + pl.show() + + + +### Returns the p-value for fct (stats.), comparing (1 is the first) to all following time slices +### for the document probabilities in a 
topic defined by data +### args is used as parameters for fct (if it is necessary) +def oneVsallDocs(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[start],data[start+i],**args) + pval.append(p) + return pval,Xlabels + +### Returns the p-value for fct (stats.), from (1 is the first) to all following time slices +### comparing two slices in a row: jan13xfev13, fev13xmar13, and so on +### for the document probabilities in a topic defined by data +### args is used as parameters for fct (if it is necessary) +def oneVsoneDocs(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start,len(vslices)-1): + Xlabels.append(vslices[i]+'x'+vslices[i+1]) ### labels + + pval=[] + for i in range(start,len(Xlabels)+start): + if not args: + t,p=fct(data[i],data[i+1]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[i],data[i+1],**args) + pval.append(p) + return pval,Xlabels + +### Plots all topics in slices from (1 is the first), with p-value +### calculate by fct. label is the name of the statistical function to +### appear in the title, topics is a list of topics to be plot (none=all) +### args is to be passed to fct +def PlotStatsDocs(fct,start,label,oneVsall=True,th=0,topics=None,**args): + t=[] + eTop=[] + if topics is None: + for i in range(20): + eTop.append(i) + else: + eTop=topics + for i in eTop: + t.append(buildDocTopic(i,th)) + # + plist=[] + for i in range(len(t)): + if not args: + if oneVsall: + pv,xl=oneVsallDocs(fct,start,t[i]) + else: + pv,xl=oneVsoneDocs(fct,start,t[i]) + else: + if oneVsall: + pv,xl=oneVsallDocs(fct,start,t[i],**args) + else: + pv,xl=oneVsoneDocs(fct,start,t[i],**args) + plist.append(pl.plot(pv,color=colors[i])) + # + ltup=() + ctup=() + j=0 + for i in range(len(plist)): + ntup=(plist[i][0],) + ltup=ltup+ntup + eTop[j] + ntup=('T'+str(eTop[j]),) + j=j+1 + ctup=ctup+ntup + # + pl.legend(ltup,ctup) + pl.ylabel('p-value') + pl.grid(True) + pl.xticks(np.arange(0,len(xl)),xl,rotation=90) + pl.title(label+'\nDocuments in Topics - Threshold: '+str(th)) + pl.show() + +if __name__ == '__main__': + main() diff --git a/plotFunctionsUser.py b/plotFunctionsUser.py new file mode 100644 index 0000000..677ac7e --- /dev/null +++ b/plotFunctionsUser.py @@ -0,0 +1,256 @@ +### How to use it: +### The dictionary userTop must be populated: userTop=buildDict(dates,'-topicuserdist.txt','topics/') +### in the example dates are the dates of the files, '-topicuserdist.txt' are the suffix and 'topics/' the folder (optional) +### Available functions: +### buildTopic(tpId, threshold) builds a list SxU (S: # of slice time, U: # of users) +## with the probabilities associated to users +### plotUservsTop (threshold), plots the number of user by topic in all time slices +### oneVsone (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_i x slice_i+1, slice_i+1 x slice_i+2 ....) +### and a list for labeling X axis +### +### oneVsall (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_first x slice_i+1, slice_first x slice_i+2 ....) 
+### and a list for labeling X axis +### PlotStats(statisc function, start, label of the graph, call oneVsAll (True) or oneVsone (False), threshold, list of topics, arguments for function) +### plots a line graph with p-values of the samples being compared +### calls buildTop, oneVsall or oneVsone for every topic and plots the result +### Examples +### t=buildTopic(10,.7) -> t is a list with 24 slices of time and n user probabilities for topic 10 +### PlotStats(stats.ttest_ind,1,"Student's T test") +### plots the p-values using t-test for all topics comparing the first slice with all others +### PlotStats(stats.ttest_ind,1,"Student's T test",False) +### plots the p-values using t-test for all topics comparing the two consective time slices from the fisrt one +### PlotStats(stats.ttest_ind,1,"Student's T test",False,[3,4,5]) +### The same above but it plots only for topics 3,4, and 5. +import numpy as np +from scipy import stats + +import matplotlib.pyplot as pl + +def main(): + global userTop + global colors + global vslices + userTop={} + vslices=['Jan13','Feb13','Mar13','Apr13','May13','Jun13','Jul13','Aug13','Sep13','Oct13','Nov13','Dec13', + 'Jan14','Feb14','Mar14','Apr14','May14','Jun14','Jul14','Aug14','Sep14','Oct14','Nov14','Dec14'] + # + dates=['2013-01','2013-02','2013-03','2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12', + '2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12'] + # + colors=['yellow', 'orange', 'pink','black','blue','brown','coral','crimson','cyan','darkblue', + 'darkgreen','fuchsia','gold','green','grey','indigo','red','yellowgreen','navy','olive', + 'azure','orchid','beige','plum','purple','lavender','salmon','silver','violet','aqua','magenta'] + + userTop=buildDict(dates,'-topicuserdist.txt','topics/') + plotUservsTop(.5) ## plot the number of topics per user as time goes by + ## plot the p-value from bartlett test for all topics from the first + ## time slice + PlotStats(stats.bartlett,1,"Bartlett's test",False) + + + +### Create a dict "userTop" with all the probabilities +### of a user to be associated to topics in 24 slices of time +### userTop[0] refers to the first time slice +#### userTop[0][0] refers to users probabilities in the first topic in time slice 0 +#### +def buildDict(dates,filesuf,fileprex=''): + userT={} + for i in range(len(dates)): + file=fileprex+dates[i]+filesuf + lines=[line.split() for line in open(file)] + matprob=[] + for j in range(len(lines)): + probs=[float(n) for n in lines[j]] + matprob.append(probs) + matprob=np.array(matprob) + userT[i]=matprob + return userT +### + +####################### USERS and TOPICS +def plotUservsTop(trh=0.001): + uvst=[] + for i in range(20): + t=[] + for j in range(len(userTop)): + t.append(np.count_nonzero(userTop[j][i]>trh)) + uvst.append(t) + plist=[] + for topId in range(20): + plist.append(pl.plot(uvst[topId],color=colors[topId])) + ltup=() + ctup=() + for i in range(20): + ntup=(plist[i][0],) + ltup=ltup+ntup + ntup=('T'+str(i),) + ctup=ctup+ntup + pl.legend(ltup,ctup) + pl.xticks(np.arange(0,len(userTop)),np.arange(1,len(userTop)+1)) + pl.ylabel('# of Users') + pl.xlabel('Slices of time') + pl.grid(True) + pl.title('Threshold '+str(trh)) + pl.show() + + +### Extract the user probabilities of the given topic tId for all time slices (24) +def buildTopic(tId,thres=0.001): + tn=[] ## stores the probabilities + for i in range(24): + tn.append(userTop[i][tId,userTop[i][tId]>=thres]) + return 
tn + +### given a stastic function (fct), a initial time slice, a dataset, the label correspondig to fct, and a topic tp +### plot the p-value from the start to point to all older ones. +def oneVsallPlot(fct,start,data,label,tp,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + t,p=fct(data[start],data[start+i],alternative='two-sided') + pval.append(p) + pl.plot(pval) + pl.xticks(np.arange(0,len(Xlabels)),Xlabels,rotation=90) + pl.title(label+"\nTopic: "+str(tp)) + pl.show() + +def oneVsall(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[start],data[start+i],**args) + pval.append(p) + return pval,Xlabels + +def oneVsone(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start,len(vslices)-1): + Xlabels.append(vslices[i]+'x'+vslices[i+1]) ### labels + + pval=[] + for i in range(start,len(Xlabels)+start): + if not args: + t,p=fct(data[i],data[i+1]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[i],data[i+1],**args) + pval.append(p) + return pval,Xlabels + + +def PlotStats(fct,start,label,oneXall=True,th=0,topics=None,**args): + t=[] + eTop=[] + if topics is None: + for i in range(20): + eTop.append(i) + else: + eTop=topics + for i in eTop: + t.append(buildTopic(i,th)) + # + plist=[] + for i in range(len(t)): + if not args: + if oneXall: + pv,xl=oneVsall(fct,start,t[i]) + else: + pv,xl=oneVsone(fct,start,t[i]) + else: + if oneVsall: + pv,xl=oneVsall(fct,start,t[i],**args) + else: + pv,xl=oneVsone(fct,start,t[i],**args) + plist.append(pl.plot(pv,color=colors[i])) + # + ltup=() + ctup=() + j=0 + for i in range(len(plist)): + ntup=(plist[i][0],) + ltup=ltup+ntup + eTop[j] + ntup=('T'+str(eTop[j]),) + j=j+1 + ctup=ctup+ntup + # + pl.legend(ltup,ctup) + pl.ylabel('p-value') + pl.grid(True) + pl.xticks(np.arange(0,len(xl)),xl,rotation=90) + pl.title(label+'\nUsers in Topics - Threshold: '+str(th)) + pl.show() + + +if __name__ == '__main__': + main() + +### Paired test must be done in samples having the same shape + +######## KRUSKAL +## The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal. +## It is a non-parametric version of ANOVA. The test works on 2 or more independent samples, which may have different sizes. +## Note that rejecting the null hypothesis does not indicate which of the groups differs. +## Post-hoc comparisons between groups are required to determine which groups are different. +#ToneVsall(stats.kruskal,15,'Kruskal') +#ToneVsall(stats.kruskal,15,'Kruskal',.5,[9,11,13,15]) ## probability >=.5 and topics 9,11,13,15 + + +##### STUDENT'S T TEST +## Calculates the T-test for the means of two independent samples of scores. +## This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values. +## This test assumes that the populations have identical variances by default. 
+#ToneVsall(stats.ttest_ind,15,"Student's t test") + +##### MANN-WHITNEY U +## The Mann-Whitney U test is a nonparametric test that allows two groups or conditions or treatments to be +## compared without making the assumption that values are normally distributed. +## So, for example, one might compare the speed at which two different groups of people can run 100 metres, +## where one group has trained for six weeks and the other has not. +#ToneVsall(stats.mannwhitneyu,15,'Mann Whitney',alternative='two-sided') +#ToneVsall(stats.mannwhitneyu,15,'Mann Whitney',.5,[9,11,13,15],alternative='two-sided') + +##### KOLMOGOROV-SMIRNOV +## The Kolmogorov-Smirnov test (KS-test) tries to determine if two datasets differ significantly. +## The KS-test has the advantage of making no assumption about the distribution of data. (Technically speaking it is non-parametric and distribution free.) +## Note however, that this generality comes at some cost: other tests (for example Student's t-test) may be more sensitive if the +## data meet the requirements of the test. +#ToneVsall(stats.ks_2samp,15,'Kolmogorov-Smirnov') + + + +### ToneVsall(stats.wilcoxon,15,'Wilcoxon') Not for samples with different sizes +### ToneVsall(stats.ttest_rel,15,"T test on Two Related") Not for samples with different sizes + +### BARTLETT +## Bartlett’s test tests the null hypothesis that all input samples are from populations with equal variances +#ToneVsall(stats.bartlett,15,"Bartlett's test") + +### LEVENE +## Perform Levene test for equal variances. +## The Levene test tests the null hypothesis that all input samples are from populations with equal variances. +## Levene’s test is an alternative to Bartlett’s test bartlett in the case where there are significant deviations from normality. 
+#ToneVsall(stats.levene,15,"Levene's test") diff --git a/prepareQuarter.sh b/prepareQuarter.sh new file mode 100755 index 0000000..b316e3a --- /dev/null +++ b/prepareQuarter.sh @@ -0,0 +1,52 @@ +#!/bin/bash +echo "This script will prepare the stackoverflow data with time slice equals a quarter" + + + +# Data must already be download, and folder rawdata must exist (as well 2013.Posts.xml and 2014.Posts.xml) +cd rawdata + +echo "splitting files 2013 in quarters" +M20131st=`awk '/CreationDate=\"2013-01/ {print NR; exit}' 2013.Posts.xml` +M20132nd=`awk '/CreationDate=\"2013-04/ {print NR; exit}' 2013.Posts.xml` +LAST=`expr $M20132nd - 1` +awk 'NR=='$M20131st', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-1stQ.Posts.xml +echo "2013-1st quarter done" + + +M20133rd=`awk '/CreationDate=\"2013-07/ {print NR; exit}' 2013.Posts.xml` +LAST=`expr $M20133rd - 1` +awk 'NR=='$M20132nd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-2ndQ.Posts.xml +echo "2013-2nd Quarter done" + +M20134th=`awk '/CreationDate=\"2013-10/ {print NR; exit}' 2013.Posts.xml` +LAST=`expr $M20134th - 1` +awk 'NR=='$M20133rd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-3rdQ.Posts.xml +echo "2013-3rd Quarter done" + +awk 'NR>='$M20134th 2013.Posts.xml > 2013-4thQ.Posts.xml +echo "2013-4th Quarter done" + +#### +echo "splitting files 2014 in quarters" +M20141st=`awk '/CreationDate=\"2014-01/ {print NR; exit}' 2014.Posts.xml` +M20142nd=`awk '/CreationDate=\"2014-04/ {print NR; exit}' 2014.Posts.xml` +LAST=`expr $M20142nd - 1` +awk 'NR=='$M20141st', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-1stQ.Posts.xml +echo "2014-1st quarter done" + + +M20143rd=`awk '/CreationDate=\"2014-07/ {print NR; exit}' 2014.Posts.xml` +LAST=`expr $M20143rd - 1` +awk 'NR=='$M20142nd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-2ndQ.Posts.xml +echo "2014-2nd Quarter done" + +M20144th=`awk '/CreationDate=\"2014-10/ {print NR; exit}' 2014.Posts.xml` +LAST=`expr $M20144th - 1` +awk 'NR=='$M20143rd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-3rdQ.Posts.xml +echo "2014-3rd Quarter done" + +awk 'NR>='$M20144th 2014.Posts.xml > 2014-4thQ.Posts.xml +echo "2014-4th Quarter done" + +cd ..
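
Note on prepareQuarter.sh: the awk commands above locate the first row of each quarter in <year>.Posts.xml and rely on rows being ordered by CreationDate. A rough, hypothetical Python equivalent (a sketch, not part of the pipeline; the file names and one-row-per-line layout follow the same assumptions SOParser.py makes) would route each row by its CreationDate month instead:

    import re

    QUARTERS = ['1stQ', '2ndQ', '3rdQ', '4thQ']

    def split_year_into_quarters(year, rawdir='rawdata'):
        # One output file per quarter, e.g. rawdata/2013-1stQ.Posts.xml
        outfiles = {q: open("%s/%s-%s.Posts.xml" % (rawdir, year, q), 'w') for q in QUARTERS}
        datepat = re.compile(r'CreationDate="(\d{4})-(\d{2})')
        with open("%s/%s.Posts.xml" % (rawdir, year)) as posts:
            for line in posts:
                m = datepat.search(line)
                if m is None:
                    continue  # skip header/footer lines without a CreationDate
                month = int(m.group(2))
                outfiles[QUARTERS[(month - 1) // 3]].write(line)
        for f in outfiles.values():
            f.close()

    # Usage under those assumptions:
    # split_year_into_quarters(2013); split_year_into_quarters(2014)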