Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions Maintenance.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
Changes made in the programs:

-- Some changes due to Python 2 and Python 3 incompatibilities

Script:
-- Fix the year problem (checked out) already pulled in github
-- Fix the folder problem (add ldamodels)

-- create a new script prepareQuarter.sh (split XML files into quarters instead of months)

-- Folder users must be created under the topics folder (createUserEvolutionChain method/function)

TextProcessor.py
-- Adapted it to Python 3 (cPickle, string methods, stem bug)
-- Fix file name problem (line 54) open("data/" + str(date) + "-titleS-users.txt", "r") -- it was missing S in the name
-- Adapted it to consider all user's document as one (time sliced)

---- createDictionariesFromFiles: added the creation of file date+"monthly-tokenized_dict-perUser.pdict"

---- createGlobalDictionaryFromMonthly: added a new parameter mergeDocs=False (if False documents are split by post, otherwise by user)
------------- false: opens date+"monthly-tokenized_dict.pdict"
------------- true: opens date+"monthly-tokenized_dict-perUser.pdict"
------------- The output is the same (global dictionary)

---- createMonthsCorpus: added a new parameter mergeDocs=False
------------- false: input: date+"monthly-tokenized_dict.pdict" output: date+'-tokenized.mm'
------------- true: input: date+"monthly-tokenized_dict-perUser.pdict" output: date+'-tokenizedUser.mm'

---- performTFIDF: new parameter added mergeDocs=False
------------- false: input: date+'-tokenized.mm' output: date+'tfidf.mm' and date+"-tfidf.model"
------------- true: input: date+'-tokenizedUser.mm' output: date+'tfidfUser.mm' and date+"-tfidfUser.model"

---- performLDA: new parameter added mergeDocs=False
------------- false: input: "models/" + date +"-tfidf.mm" output: the same file
------------- true: input: "models/" + date +"-tfidfUser.mm" output: the same file

---- calculateEta: new parameter added: mergeDocs=False
------------- false: input: date+"monthly-tokenized_dict.pdict" output: the same
------------- true: input: date+"monthly-tokenized_dict-perUser.pdict" output: the same

-- lookupLDATopics: new parameter added: mergeDocs=False
---- false: input: date+"monthly-tokenized_dict.pdict" output: the same
---- true: input: date+"monthly-tokenized_dict-perUser.pdict" output: the same

-- performLDA
----- Fix bug when calling calculateEta: the parameter "vocabulary size" must be set to len(dictionary.keys()) instead of vocabsize
------------ (since the vocabulary size produced by the dictionary can be smaller than vocabsize)

UserComparator.py
-- summarizeTopicsPerUser received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same
----------- the key for tokenized_dict is userid instead of docid

-- lookupTopics received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same
----------- the key for tokenized_dict is userid instead of docid

TopicStats.py
-- docPerTopic received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same

-- countWords received a new parameter to deal with document by post or by User
---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same


75 changes: 57 additions & 18 deletions SOParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,19 @@
import re, cgi, os, pickle, logging, time
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pdb

def main():
minposts = 50

quarter=True

years = [2013, 2014]

extractUsers(minposts, years)
extractComments(years)
extractComments(years,quarter)

def extractComments(years):
def extractComments(years,isQuarter=False):
quarters=['1stQ','2ndQ','3rdQ','4thQ']
users = set()
usersFile = open('rawdata/userposts.txt', 'r')
for userline in usersFile:
Expand All @@ -21,35 +26,56 @@ def extractComments(years):
usersFile.close()

for year in years:
print "Parsing year: " + str(year)
months = range(1,13)
print ("Parsing year: " + str(year))

if not isQuarter:
months = range(1,13)
else:
months = range(1,5) ## 4 quarters in a year
####
for month in months:
start = time.time()
yearmonth = str(year) + "-" + str(month).zfill(2)
if not isQuarter:
strmonth=str(month).zfill(2)
else:
strmonth=quarters[month-1]
#####
yearmonth = str(year) + "-" + strmonth
print(yearmonth)
#######
## Dealing with quarters instead of months vvvvv
#######
if month == 1:
lastmonth = str(year-1) + "-12"
if not isQuarter:
lastmonth = str(year-1) + "-12"
else:
lastmonth = str(year-1) + '-' + quarters[-1]
else:
lastmonth = str(year) + "-" + str(month-1).zfill(2)
if not isQuarter:
lastmonth = str(year) + "-" + str(month-1).zfill(2)
else:
lastmonth = str(year) + "-" + quarters[month-2]
###
lastmonthsquestiontitlesfile = "data/" + lastmonth + "-questiontitles.dict"
lastmonthsquestiontagsfile = "data/" + lastmonth + "-questiontags.dict"
if os.path.isfile(lastmonthsquestiontitlesfile):
logging.info('loading title dictionary: %s', lastmonthsquestiontitlesfile)
logging.info('loading tag dictionary: %s', lastmonthsquestiontagsfile)
questiontitles = {}
questiontags = {}
with open(lastmonthsquestiontitlesfile, 'r') as f:
with open(lastmonthsquestiontitlesfile, 'rb') as f: ## add b
questiontitles = pickle.load(f)
logging.info("Elements in questiontitles: %s", len(questiontitles))
with open(lastmonthsquestiontagsfile, 'r') as f:
with open(lastmonthsquestiontagsfile, 'rb') as f: ## add b
questiontags = pickle.load(f)
logging.info("Elements in questiontags: %s", len(questiontags))
else:
logging.info("creating new dictionaries")
questiontitles = {}
questiontags = {}

#######
## ^^^^^ End
#######
monthusers = set()
parsedpostsfile = open("data/"+ yearmonth + "-titles-tags-text.tsv","a")
rawpostsfile = open("rawdata/" + yearmonth + ".Posts.xml", 'r')
Expand All @@ -67,19 +93,31 @@ def extractComments(years):
creationDate = doc.get('CreationDate')
postTypeId = doc.get('PostTypeId')
score = doc.get('Score')
text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','')
#text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','')
text = doc.get('Body').replace('\r\n','').replace('\n','')
tagremove = re.compile(r'(<!--.*?-->|<[^>]*>)')
text = cgi.escape(tagremove.sub('', re.sub('<code>[^>]+</code>', '', text)))

parent = doc.get('ParentId')
if 'Title' in doc.keys():
title = doc.get('Title').encode('utf8')
#title = doc.get('Title').encode('utf8')
title = doc.get('Title')
if type(title) is bytes:
print('>>>>>>>> Byte')
title=title.decode('utf8')
else:
title = ''
if 'Tags' in doc.keys():
tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","")
#tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","")
tags = doc.get('Tags').replace("><", ",").replace("<","").replace(">","")
if type(tags) is bytes:
print('>>>>>>>> Byte')
tags=tags.decode('utf8')
else:
tags = ''
####
##pdb.set_trace()
####
if postTypeId == "1":
questiontags[rowID] = tags
questiontitles[rowID] = title
Expand All @@ -94,11 +132,12 @@ def extractComments(years):
parsedpostsfile.close()
rawpostsfile.close()

with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f:
#pdb.set_trace()
with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f:
f.write("\n".join(monthusers))
with open("data/" + yearmonth + "-questiontitles.dict", 'w') as f:
with open("data/" + yearmonth + "-questiontitles.dict", 'wb') as f: ## add b (binary mode)
pickle.dump(questiontitles, f, pickle.HIGHEST_PROTOCOL)
with open("data/" + yearmonth + "-questiontags.dict", 'w') as f:
with open("data/" + yearmonth + "-questiontags.dict", 'wb') as f: ## add b (binary mode)
pickle.dump(questiontags, f, pickle.HIGHEST_PROTOCOL)
end = time.time() - start
logging.info("Elapsed time (s): %s", end)
Expand All @@ -108,7 +147,7 @@ def extractComments(years):
def extractUsers(minPostCount, years):
users = {}
for year in years:
print "Parsing year: " +str(year)
print ("Parsing year: " +str(year))
posts = open("rawdata/"+str(year)+".Posts.xml", 'r')
for post in posts:
post = post.rstrip('\n')
Expand Down
Loading