diff --git a/Maintenance.txt b/Maintenance.txt
new file mode 100644
index 0000000..8f4e54d
--- /dev/null
+++ b/Maintenance.txt
@@ -0,0 +1,69 @@
+Changes made in the programs:
+
+-- Several fixes for Python 2 / Python 3 incompatibilities
+
+Script:
+-- Fixed the year problem (checked); already pulled into GitHub
+-- Fixed the folder problem (added ldamodels)
+
+-- Created a new script prepareQuarter.sh (splits the xml files into quarters instead of months)
+
+-- The folder users must be created under the topics folder (used by the createUserEvolutionChain function)
+
+TextProcessor.py
+-- Adapted it to Python 3 (cPickle, string methods, stem bug)
+-- Fixed a file name problem (line 54): open("data/" + str(date) + "-titles-users.txt", "r") -- the "s" was missing from the name
+-- Adapted it to treat all of a user's documents as one (per time slice)
+
+---- createDictionariesFromFiles: added the creation of the file date+"-monthly-tokenized_dict-perUser.pdict"
+
+---- createGlobalDictionaryFromMonthly: added a new parameter mergeDocs=False (if False, documents are split by post; otherwise by user)
+------------- false: opens date+"-monthly-tokenized_dict.pdict"
+------------- true: opens date+"-monthly-tokenized_dict-perUser.pdict"
+------------- The output is the same (global dictionary)
+
+---- createMonthCorpuses: added a new parameter mergeDocs=False
+------------- false: input: date+"-monthly-tokenized_dict.pdict" output: date+'-tokenized.mm'
+------------- true: input: date+"-monthly-tokenized_dict-perUser.pdict" output: date+'-tokenizedUser.mm'
+
+---- performTFIDF: added a new parameter mergeDocs=False
+------------- false: input: date+'-tokenized.mm' output: date+'-tfidf.mm' and date+"-tfidf.model"
+------------- true: input: date+'-tokenizedUser.mm' output: date+'-tfidfUser.mm' and date+"-tfidfUser.model"
+
+---- performLDA: added a new parameter mergeDocs=False
+------------- false: input: "models/" + date +"-tfidf.mm" output: the same file
+------------- true: input: "models/" + date +"-tfidfUser.mm" output: the same file
+
+---- calculateEta: added a new parameter mergeDocs=False
+------------- false: input: date+"-monthly-tokenized_dict.pdict" output: the same
+------------- true: input: date+"-monthly-tokenized_dict-perUser.pdict" output: the same
+
+-- lookupLDATopics: added a new parameter mergeDocs=False
+---- false: input: "models/global-tokenized_dict.pdict" output: the same
+---- true: input: "models/global-tokenized_dict-perUser.pdict" output: the same
+
+-- performLDA
+----- Fixed a bug in the call to calculateEta: the "vocabulary size" argument must be set to len(dictionary.keys()) instead of vocabsize
+------------ (the vocabulary produced by the dictionary can be smaller than vocabsize)
+
+UserComparator.py
+-- summarizeTopicsPerUser received a new parameter to handle documents by post or by user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict" output: the same
+----------- the key for tokenized_dict is userid instead of docid
+
+-- lookupTopics received a new parameter to handle documents by post or by user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
+---- true: input: "models/"+date+"-monthly-tokenized_dict-perUser.pdict" output: the same
+----------- the key for tokenized_dict is userid instead of docid
+
+TopicStats.py
+-- docPerTopic received a new parameter to handle documents by post or by user
+---- false: input: "models/"+date+"-monthly-tokenized_dict.pdict" output: the same
+---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same + +-- countWords received a new parameter to deal with document by post or by User +---- false: input "models/"+date+"-monthly-tokenized_dict.pdict" output: the same +---- true: input: "models/"+date+"-monthly-tokenized_dict.pdict-perUser" output: the same + + diff --git a/SOParser.py b/SOParser.py index ebafd9d..3e663e1 100644 --- a/SOParser.py +++ b/SOParser.py @@ -5,14 +5,19 @@ import re, cgi, os, pickle, logging, time logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +import pdb + def main(): minposts = 50 - + quarter=True + years = [2013, 2014] + extractUsers(minposts, years) - extractComments(years) + extractComments(years,quarter) -def extractComments(years): +def extractComments(years,isQuarter=False): + quarters=['1stQ','2ndQ','3rdQ','4thQ'] users = set() usersFile = open('rawdata/userposts.txt', 'r') for userline in usersFile: @@ -21,17 +26,36 @@ def extractComments(years): usersFile.close() for year in years: - print "Parsing year: " + str(year) - months = range(1,13) + print ("Parsing year: " + str(year)) + if not isQuarter: + months = range(1,13) + else: + months = range(1,5) ## 4 quarters in a year + #### for month in months: start = time.time() - yearmonth = str(year) + "-" + str(month).zfill(2) + if not isQuarter: + strmonth=str(month).zfill(2) + else: + strmonth=quarters[month-1] + ##### + yearmonth = str(year) + "-" + strmonth print(yearmonth) + ####### + ## Dealing with qaurter instead of months vvvvv + ####### if month == 1: - lastmonth = str(year-1) + "-12" + if not isQuarter: + lastmonth = str(year-1) + "-12" + else: + lastmonth = str(year-1) + '-' + quarters[-1] else: - lastmonth = str(year) + "-" + str(month-1).zfill(2) + if not isQuarter: + lastmonth = str(year) + "-" + str(month-1).zfill(2) + else: + lastmonth = str(year) + "-" + quarters[month-2] + ### lastmonthsquestiontitlesfile = "data/" + lastmonth + "-questiontitles.dict" lastmonthsquestiontagsfile = "data/" + lastmonth + "-questiontags.dict" if os.path.isfile(lastmonthsquestiontitlesfile): @@ -39,17 +63,19 @@ def extractComments(years): logging.info('loading tag dictionary: %s', lastmonthsquestiontagsfile) questiontitles = {} questiontags = {} - with open(lastmonthsquestiontitlesfile, 'r') as f: + with open(lastmonthsquestiontitlesfile, 'rb') as f: ## add b questiontitles = pickle.load(f) logging.info("Elements in questiontitles: %s", len(questiontitles)) - with open(lastmonthsquestiontagsfile, 'r') as f: + with open(lastmonthsquestiontagsfile, 'rb') as f: ## add b questiontags = pickle.load(f) logging.info("Elements in questiontags: %s", len(questiontags)) else: logging.info("creating new dictionaries") questiontitles = {} questiontags = {} - + ####### + ## ^^^^^ End + ####### monthusers = set() parsedpostsfile = open("data/"+ yearmonth + "-titles-tags-text.tsv","a") rawpostsfile = open("rawdata/" + yearmonth + ".Posts.xml", 'r') @@ -67,19 +93,31 @@ def extractComments(years): creationDate = doc.get('CreationDate') postTypeId = doc.get('PostTypeId') score = doc.get('Score') - text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','') + #text = doc.get('Body').encode('utf8').replace('\r\n','').replace('\n','') + text = doc.get('Body').replace('\r\n','').replace('\n','') tagremove = re.compile(r'(|<[^>]*>)') text = cgi.escape(tagremove.sub('', re.sub('[^>]+', '', text))) parent = doc.get('ParentId') if 'Title' in doc.keys(): - title = 
doc.get('Title').encode('utf8') + #title = doc.get('Title').encode('utf8') + title = doc.get('Title') + if type(title) is bytes: + print('>>>>>>>> Byte') + title=title.decode('utf8') else: title = '' if 'Tags' in doc.keys(): - tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","") + #tags = doc.get('Tags').encode('utf8').replace("><", ",").replace("<","").replace(">","") + tags = doc.get('Tags').replace("><", ",").replace("<","").replace(">","") + if type(tags) is bytes: + print('>>>>>>>> Byte') + tags=tags.decode('utf8') else: tags = '' + #### + ##pdb.set_trace() + #### if postTypeId == "1": questiontags[rowID] = tags questiontitles[rowID] = title @@ -94,11 +132,12 @@ def extractComments(years): parsedpostsfile.close() rawpostsfile.close() - with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f: + #pdb.set_trace() + with open("data/"+ yearmonth + "-titles-users.txt", 'w') as f: f.write("\n".join(monthusers)) - with open("data/" + yearmonth + "-questiontitles.dict", 'w') as f: + with open("data/" + yearmonth + "-questiontitles.dict", 'wb') as f: ## add b (binary mode) pickle.dump(questiontitles, f, pickle.HIGHEST_PROTOCOL) - with open("data/" + yearmonth + "-questiontags.dict", 'w') as f: + with open("data/" + yearmonth + "-questiontags.dict", 'wb') as f: ## add b (binary mode) pickle.dump(questiontags, f, pickle.HIGHEST_PROTOCOL) end = time.time() - start logging.info("Elapsed time (s): %s", end) @@ -108,7 +147,7 @@ def extractComments(years): def extractUsers(minPostCount, years): users = {} for year in years: - print "Parsing year: " +str(year) + print ("Parsing year: " +str(year)) posts = open("rawdata/"+str(year)+".Posts.xml", 'r') for post in posts: post = post.rstrip('\n') diff --git a/TextProcessor.py b/TextProcessor.py index 4f4e91a..cf5fc7e 100644 --- a/TextProcessor.py +++ b/TextProcessor.py @@ -1,36 +1,45 @@ from __future__ import print_function from gensim import corpora, models from gensim.parsing.preprocessing import STOPWORDS -import logging, re, numpy, cPickle +import logging, re, numpy +import _pickle as cPickle ## Python 3 does not have cPickle logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) from nltk.stem.porter import PorterStemmer from nltk.tokenize import word_tokenize - +import pdb def main(): """Main entry.""" global priorweight - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] - dates = ['2013-01', '2013-02', '2013-03'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] numtopics = 40 vocabsize = 2000 priorweight = 0.05 - workers = 3 - # filterUsers(dates) + workers = 1 + + merge=True + ######## + filterUsers(dates) createDictionariesFromFiles(dates) - createGlobalDictionaryFromMonthly(dates, vocabsize) - createMonthCorpuses(dates) - # - performTFIDF(dates) - performLDA(dates, 
numtopics, vocabsize, workers) - # lookupTopics(dates) + createGlobalDictionaryFromMonthly(dates, vocabsize,merge) + createMonthCorpuses(dates,merge) + + performTFIDF(dates,merge) + ####### + performLDA(dates, numtopics, vocabsize, workers,merge) + ####### + #lookupTopics(dates) + #lookatdist(dates[1]) + ####### def lookatdist(date): @@ -45,7 +54,7 @@ def filterUsers(dates): users = set() for date in dates: musers = set() - for line in open("data/" + str(date) + "-title-users.txt", "r"): + for line in open("data/" + str(date) + "-titles-users.txt", "r"): #"-title-users.txt", "r"): musers.add(line.strip("\n")) if len(users) == 0: users = musers @@ -64,8 +73,12 @@ def readFile(date): original_sentences[id] = text return original_sentences -def lookupLDATopics(date, docIDs, numTopics): - tokenized_dictfile = "models/global-tokenized_dict.pdict" +def lookupLDATopics(date, docIDs, numTopics, mergeDocs=False): + if not mergeDocs: + tokenized_dictfile="models/global-tokenized_dict.pdict" + else: + tokenized_dictfile="models/global-tokenized_dict-perUser.pdict" + ##### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") @@ -77,9 +90,13 @@ def lookupLDATopics(date, docIDs, numTopics): topics_by_value = sorted(topics, key=lambda tup: tup[1], reverse=True) return topics_by_value[:numTopics] -def calculateEta(dates, date, numtopics, vocabsize): +def calculateEta(dates, date, numtopics, vocabsize,mergeDocs=False): priordate = dates[dates.index(date) - 1] - tokenized_dictfile = "models/"+priordate+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + suffix="-monthly-tokenized_dict.pdict" + else: + suffix="-monthly-tokenized_dict-perUser.pdict" + tokenized_dictfile = "models/"+priordate+suffix with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") @@ -132,22 +149,36 @@ def calculateEta2(dates, date, numtopics, vocabsize, minpriorvalue): eta[topicid][index] = value return eta -def performTFIDF(dates): +def performTFIDF(dates, mergeDocs=False): for date in dates: - corpus = corpora.MmCorpus("models/" + date + "-tokenized.mm") + if not mergeDocs: + suffix_tok="-tokenized.mm" + suffix_tfidf_model="-tfidf.model" + suffix_tfidf_corpus="-tfidf.mm" + else: + suffix_tok="-tokenizedUser.mm" + suffix_tfidf_model="-tfidfUser.model" + suffix_tfidf_corpus="-tfidfUser.mm" + + corpus = corpora.MmCorpus("models/" + date + suffix_tok) tfidf = models.TfidfModel(corpus) - tfidf.save("models/"+date+"-tfidf.model") + tfidf.save("models/"+date+ suffix_tfidf_model) tfidf_corpus = tfidf[corpus] - corpora.MmCorpus.save_corpus("models/"+date+"-tfidf.mm", tfidf_corpus) + corpora.MmCorpus.save_corpus("models/"+date+ suffix_tfidf_corpus, tfidf_corpus) -def performLDA(dates, numtopics, vocabsize, workers): +def performLDA(dates, numtopics, vocabsize, workers,mergeDocs=False): for date in dates: + if not mergeDocs: + suffix="-tfidf.mm" + else: + suffix="-tfidfUser.mm" print("performing lda on " + str(date)) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") - corpus = corpora.MmCorpus("models/" + date + "-tfidf.mm") + corpus = corpora.MmCorpus("models/" + date + suffix) if date != dates[0] and priorweight != 0: logging.info("Calculating eta based on prior month") - eta = calculateEta(dates, date, numtopics, vocabsize) + eta = calculateEta(dates, date, numtopics, len(dictionary.keys()),mergeDocs) ## vocabsize -> len(dictionary.keys()) 
SAFER! + # pdb.set_trace() lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=numtopics, workers=workers, eta=eta) else: logging.info("Eta weighting factor too low or no prior months") @@ -159,18 +190,25 @@ def performLDA(dates, numtopics, vocabsize, workers): def tokenizeandstemline(text): stoplist = STOPWORDS stemmer = PorterStemmer() - tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text.decode('utf-8'), language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None] + ### Python 3 does not have str.decode, and the method PorterStemmer.stem() has a bug + #tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text.decode('utf-8'), language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None] + tokenized_line = [stemmer.stem(word.lower()) for word in word_tokenize(text, language='english') if word not in stoplist and len(word) > 3 and re.match('^[\w-]+$', word) is not None] + ### return tokenized_line def writecpicklefile(content, filename): with open(filename, 'wb') as f: - cPickle.dump(content, f, cPickle.HIGHEST_PROTOCOL) + cPickle.dump(content, f, -1) #cPickle.HIGHEST_PROTOCOL) ## Python 3 does not have the macro HIGHEST_PROTOCOL -def createGlobalDictionaryFromMonthly(dates, vocabsize): +def createGlobalDictionaryFromMonthly(dates, vocabsize, mergeDocs=False): global_tokenized_dict = {} for date in dates: - monthly_tokenized_dictfile = "models/" + date + "-monthly-tokenized_dict.pdict" + if not mergeDocs: + suffix="-monthly-tokenized_dict.pdict" + else: + suffix="-monthly-tokenized_dict-perUser.pdict" + monthly_tokenized_dictfile = "models/" + date + suffix with open(monthly_tokenized_dictfile, 'rb') as f: logging.info("Opening file %s", monthly_tokenized_dictfile) global_tokenized_dict = merge_two_dicts(cPickle.load(f), global_tokenized_dict) @@ -192,6 +230,9 @@ def createDictionariesFromFiles(dates): for date in dates: print("parsing month: " + date) monthly_tokenized_dict = {} + #### + monthly_tokenized_byUser = {} + #### monthly_original_dict = {} docids = {} for line in open("data/" + date + "-titles-tags-text.tsv"): @@ -199,25 +240,44 @@ def createDictionariesFromFiles(dates): docids[id] = (userid, score) text = title + " " + tags + " " + text tokenized_line = tokenizeandstemline(text) - monthly_tokenized_dict[id] = tokenized_line + monthly_tokenized_dict[id] = tokenized_line.copy() monthly_original_dict[id] = text + #### merge all user's documents + if userid in monthly_tokenized_byUser: + monthly_tokenized_byUser[userid].extend(tokenized_line.copy()) + else: + monthly_tokenized_byUser[userid]=tokenized_line.copy() + #### + ### pdb.set_trace() ## just in case :) monthly_docids_dictfile = "models/"+date+"-docids.pdict" writecpicklefile(docids, monthly_docids_dictfile) monthly_tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" writecpicklefile(monthly_tokenized_dict, monthly_tokenized_dictfile) + #### + monthly_tokenized_dictfile_perUser = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + writecpicklefile(monthly_tokenized_byUser, monthly_tokenized_dictfile_perUser) + #### monthly_original_dictfile = "models/"+date+"-monthly-original_dict.pdict" writecpicklefile(monthly_original_dict, monthly_original_dictfile) -def createMonthCorpuses(dates): +def createMonthCorpuses(dates,mergeDocs=False): for date in dates: logging.info("Parsing date: %s", date) print("parsing month: " + date) - monthly_dict_file = "models/" + date 
+ "-monthly-tokenized_dict.pdict" + if not mergeDocs: + suffix_source="-monthly-tokenized_dict.pdict" + suffix_target='-tokenized.mm' + else: + suffix_source="-monthly-tokenized_dict-perUser.pdict" + suffix_target='-tokenizedUser.mm' + + monthly_dict_file = "models/" + date + suffix_source with open(monthly_dict_file, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load('models/global-dictionary.dict') corpus = [dictionary.doc2bow(sentence) for sentence in tokenized_dict.values()] - corpora.MmCorpus.serialize('models/' + date + '-tokenized.mm', corpus) + corpora.MmCorpus.serialize('models/' + date + suffix_target, corpus) + if __name__ == '__main__': main() diff --git a/TopicComparator.py b/TopicComparator.py index 81a207f..b57c897 100644 --- a/TopicComparator.py +++ b/TopicComparator.py @@ -4,13 +4,16 @@ from numpy.linalg import norm from numpy import array - +import pdb def main(): global numtopics, vocabsize - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] - # dates = ['2013-02', '2013-03'] #, '2013-03', '2013-03'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] + #dates = ['2013-01','2013-02', '2013-03'] #, '2013-03', '2013-03'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] + numtopics = 40 vocabsize = 2000 # compareMonths(dates) @@ -21,7 +24,7 @@ def main(): def compareMonths(dates): i = 1 for month in dates: - print month + print (month) nextmonth = dates[i] TVDBasedSimilarity(month, nextmonth) KLDBasedSimilarity(month, nextmonth) @@ -110,7 +113,9 @@ def printTopicWords(dates): lda = models.LdaModel.load("ldamodels/" + month + "-lda.model") topicfile = open("topics/"+month+"-topicwords.txt", "w") ldalist = lda.show_topics(num_topics=numtopics, num_words=10, log=False, formatted=False) - wordlists = { topic[0]: [wordvals[0].encode('utf-8') for wordvals in topic[1]] for topic in ldalist} + #pdb.set_trace() + # wordlists = { topic[0]: [wordvals[0].encode('utf-8') for wordvals in topic[1]] for topic in ldalist} ## delete encode('utf-8') + wordlists = { topic[0]: [wordvals[0] for wordvals in topic[1]] for topic in ldalist} for topic in wordlists.keys(): line = str(topic) + "\t" + " ".join(wordlists[topic]) + "\n" topicfile.write(line) diff --git a/TopicStats.py b/TopicStats.py index 777bbb5..e0a05f8 100644 --- a/TopicStats.py +++ b/TopicStats.py @@ -1,22 +1,26 @@ from gensim import corpora, models -import logging, numpy, cPickle +import logging, numpy #, cPickle +import _pickle as cPickle ## Python 3 does not have cPickle logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) def main(): global topicthreshold - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', 
'2014-12'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'] # dates = ['2013-01', '2013-02'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] + topics = 40 topicthreshold = 0.3 + merge=True + countWords(dates, topics,merge) + docPerTopic(dates,merge) - # countWords(dates, topics) - docPerTopic(dates) - -def docPerTopic(dates): +def docPerTopic(dates,mergeDocs): dictionary = corpora.Dictionary.load("models/global-dictionary.dict") doctopics = {} topicfile = open("stats/docpertopic.tsv", 'w') @@ -25,7 +29,11 @@ def docPerTopic(dates): date = str(date) print(date) - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + #### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) @@ -35,8 +43,12 @@ def docPerTopic(dates): for doc in documentfile: [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t") - - sentence = tokenized_dict[docid] + ###### + if not mergeDocs: + sentence = tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] + ####### bow = dictionary.doc2bow(sentence) documenttopics = lda[bow] for (topicid, topicvalue) in documenttopics: @@ -53,7 +65,7 @@ def docPerTopic(dates): doctopics[topicid][date] = 0 doctopics[topicid][date]+=1 - print doctopics + print (doctopics) for topicid in doctopics.keys(): line = str(topicid) for date in doctopics[topicid].keys(): @@ -65,7 +77,7 @@ def docPerTopic(dates): -def countWords(dates, numtopics): +def countWords(dates, numtopics, mergeDocs): wordfile = open("stats/wordcounts.tsv", "w") words = {} #each word counted once per doc totalwords = {} #each word counted n times per n mentions in doc @@ -76,7 +88,11 @@ def countWords(dates, numtopics): words[date] = 0 uniquewords[date] = set() totalwords[date] = 0 - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + ### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) dictionary = corpora.Dictionary.load("models/global-dictionary.dict") @@ -86,6 +102,7 @@ def countWords(dates, numtopics): logging.info("Parsing date: %s", str(date)) [countedwordtopics.append(0) for i in range(numtopics)] for docID in tokenized_dict.keys(): + # check if it is necessary to choose between docID or UserID (when mergeDoc is True) doc = tokenized_dict[docID] bow = dictionary.doc2bow(doc) wordcount = len(bow) diff --git a/UserComparator.py b/UserComparator.py index 73675f3..3cf34ba 100644 --- a/UserComparator.py +++ b/UserComparator.py @@ -1,30 +1,39 @@ from gensim import corpora, models -import cPickle, numpy, logging, scipy +import numpy, logging, scipy from numpy.linalg import norm from scipy.stats import entropy +import _pickle as cPickle ## Python 3 does not have cPickle + logging.basicConfig(format='%(asctime)s : %(levelname)s : 
%(message)s', level=logging.DEBUG) +import pdb + def main(): global topicthreshold - dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', - '2013-10', '2013-11', '2013-12', - '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', - '2014-10', '2014-11', '2014-12'] - # dates = ['2013-01', '2013-02', '2013-03'] + dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', + # '2013-10', '2013-11', '2013-12', + # '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', + # '2014-10', '2014-11', '2014-12'] + #dates = ['2013-01', '2013-02', '2013-03'] + #dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'] + numtopics = 40 vocabsize = 2000 topicthreshold = 0.3 + merge=True - # summarizeTopicsPerUser(dates) - # compareMonths(dates) - # lookupTopics(dates) + #summarizeTopicsPerUser(dates,merge) + #compareMonths(dates) + #lookupTopics(dates,merge) + ##### createUserEvolutionChain(dates) def compareMonths(dates): i = 1 for month in dates: - print month + print (month) nextmonth = dates[i] # TVDBasedSimilarity(month, nextmonth) KLDBasedSimilarity(month, nextmonth) @@ -83,7 +92,7 @@ def JSD(P, Q): -def summarizeTopicsPerUser(dates): +def summarizeTopicsPerUser(dates,mergeDocs=False): dictionary = corpora.Dictionary.load("models/global-dictionary.dict") usersfile = "data/allusers.txt" users = set(open(usersfile).read().split()) @@ -96,18 +105,25 @@ def summarizeTopicsPerUser(dates): date = str(date) print(date) - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + ### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) documentfile = open("data/" + date + "-titles-tags-text.tsv") lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model") - + ###pdb.set_trace() for doc in documentfile: [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t") document_users[docid] = userid document_scores[docid] = score - sentence = tokenized_dict[docid] + if not mergeDocs: + sentence = tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] bow = dictionary.doc2bow(sentence) documenttopics = lda[bow] for (topicid, topicvalue) in documenttopics: @@ -148,7 +164,7 @@ def writecpicklefile(content, filename): -def lookupTopics(dates): +def lookupTopics(dates,mergeDocs=False): dictionary = corpora.Dictionary.load("models/global-dictionary.dict") document_users = {} document_scores = {} @@ -156,8 +172,12 @@ def lookupTopics(dates): for date in dates: date = str(date) print(date) - - tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + + if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" + else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" + #### with open(tokenized_dictfile, 'rb') as f: tokenized_dict = cPickle.load(f) @@ -179,7 +199,10 @@ def lookupTopics(dates): continue document_users[docid] = userid document_scores[docid] = score - sentence = tokenized_dict[docid] + if not mergeDocs: + sentence = 
tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] bow = dictionary.doc2bow(sentence) documenttopics = lda[bow] for (topicid, topicvalue) in documenttopics: @@ -215,6 +238,7 @@ def lookupTopics(dates): # resultline = str(topicid) + "\t" + str(userid) + "\t" + str(meantopicvalue) + "\n" topicfile.write(resultline) topicfile.close() + print('***** End (lookupTopics) ****') def createUserEvolutionChain(dates): topicscores={} @@ -234,6 +258,7 @@ def createUserEvolutionChain(dates): users.add("7585") users.add("12579") + print('***** Begin (createUserEvolution) ****') for date in dates: topicfile = open("topics/" + date + "-topics.txt", 'r') allwords[date] = {} diff --git a/UserStatistics.py b/UserStatistics.py new file mode 100644 index 0000000..0ee96c3 --- /dev/null +++ b/UserStatistics.py @@ -0,0 +1,74 @@ +mergeDocs=False +dates = ['2013-1stQ','2013-2ndQ','2013-3rdQ','2013-4thQ','2014-1stQ','2014-2ndQ','2014-3rdQ','2014-4thQ'] +topicthreshold = 0.3 +dictionary = corpora.Dictionary.load("models/global-dictionary.dict") +document_users = {} +document_scores = {} +users = set() + +if not mergeDocs: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict.pdict" +else: + tokenized_dictfile = "models/"+date+"-monthly-tokenized_dict-perUser.pdict" +#### +with open(tokenized_dictfile, 'rb') as f: + tokenized_dict = cPickle.load(f) + +usertopics = {} +userdoctopics = {} +usertopicscores = {} +documentfile = open("data/" + date + "-titles-tags-text.tsv") +topicfile = open("topics/" + date + "-topics.txt", 'w') +headerline = "UserID\ttopicID\tmeantopicvalue\tnumdocs\tmeantopicscore\ttopicword1\ttopicword2\ttopicword3\ttopicword4\ttopicword5\n" +topicfile.write(headerline) +lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model") + +for doc in documentfile: + [docid, userid, creationdate, score, title, tags, text] = doc.rstrip("\n").split("\t") + if date == dates[0]: + users.add(userid) + else: + if userid not in users: + continue + document_users[docid] = userid + document_scores[docid] = score + if not mergeDocs: + sentence = tokenized_dict[docid] + else: + sentence = tokenized_dict[userid] + bow = dictionary.doc2bow(sentence) + documenttopics = lda[bow] + for (topicid, topicvalue) in documenttopics: + if topicvalue >= topicthreshold: + try: + userdoctopics[userid] + except KeyError: + userdoctopics[userid] = {} + userdoctopics[userid][topicid] = [] + usertopicscores[userid] = {} + usertopicscores[userid][topicid] = [] + try: + userdoctopics[userid][topicid] + except KeyError: + userdoctopics[userid][topicid] = [] + usertopicscores[userid][topicid] = [] + userdoctopics[userid][topicid].append(topicvalue) + usertopicscores[userid][topicid].append(int(score)) +for userid in userdoctopics.keys(): + usertopics[userid] = {} + for topicid in userdoctopics[userid].keys(): + meantopicvalue = numpy.mean(userdoctopics[userid][topicid]) + meantopicscore = numpy.mean(usertopicscores[userid][topicid]) + numdocs = len(userdoctopics[userid][topicid]) + if meantopicvalue < topicthreshold: + continue + usertopics[userid][topicid] = meantopicvalue + topicterms = lda.get_topic_terms(topicid, topn=5) + topicwords = "" + for term in topicterms: + topicwords += dictionary.get(term[0]).ljust(15) + "\t" + resultline = str(userid)+"\t"+str(topicid)+"\t"+ str(meantopicvalue) + "\t" + str(numdocs) + "\t" + str(meantopicscore) + "\t" + str(topicwords) + "\n" + # resultline = str(topicid) + "\t" + str(userid) + "\t" + str(meantopicvalue) + "\n" + topicfile.write(resultline) +topicfile.close() 
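+# NOTE: this standalone script mirrors one iteration of UserComparator.lookupTopics();
+# it assumes the same imports (gensim's corpora and models, numpy, _pickle as cPickle)
+# and that `date` is bound to one entry of `dates` before the block above runs.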
+print('***** End (lookupTopics) ****') diff --git a/downloadAndPrepareData.sh b/downloadAndPrepareData.sh index 6b4db83..dab73b2 100755 --- a/downloadAndPrepareData.sh +++ b/downloadAndPrepareData.sh @@ -7,6 +7,8 @@ mkdir data mkdir models mkdir topics mkdir rawdata +# the following folder was missing +mkdir ldamodels cd rawdata if [ ! -f "stackoverflow.com-Posts.7z" ]; then diff --git a/plotFunctionsDoc.py b/plotFunctionsDoc.py new file mode 100644 index 0000000..01c82be --- /dev/null +++ b/plotFunctionsDoc.py @@ -0,0 +1,220 @@ +### How to use it: +### The dictionary userTop must be populated: docTop=buildDict(dates,'-doctopicdist.txt','topics/DocDistrib/') +### in the example dates are the dates of the files, '-doctopicdist.txt' are the suffix and 'topics/DocDistrib/' the folder (optional) +### Available functions: +### buildDocTopic(tpId, threshold) builds a list SxD (S: # of slice time, D: # of documents) +### with the probabilities associated to documents +### plotDocrvsTop (threshold), plots the number of documents by topic in all time slices +### oneVsoneDocs (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_i x slice_i+1, slice_i+1 x slice_i+2 ....) +### and a list for labeling X axis +### +### oneVsallDocs (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_first x slice_i+1, slice_first x slice_i+2 ....) +### and a list for labeling X axis +### PlotStatsDocs(statisc function, start, label of the graph, call oneVsAll (True) or oneVsone (False), threshold, list of topics, arguments for function) +### plots a line graph with p-values of the samples being compared +### calls buildTop, oneVsall or oneVsone for every topic and plots the result +### Examples +### t=buildTopic(10,.7) -> t is a list with 24 slices of time and n document probabilities for topic 10 +### PlotStatsDocs(stats.ttest_ind,1,"Student's T test") +### plots the p-values using t-test for all topics comparing the first slice with all others +### PlotStatsDocs(stats.ttest_ind,1,"Student's T test",False) +### plots the p-values using t-test for all topics comparing the two consective time slices from the fisrt one +### PlotStatsDocs(stats.ttest_ind,1,"Student's T test",False,[3,4,5]) +### The same above but it plots only for topics 3,4, and 5. 
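+### A short worked example, assuming files named <date>-doctopicdist.txt under
+### topics/DocDistrib/ where each row is an id followed by one probability per topic
+### (buildDict drops the first column via probs[1:]; that layout is an assumption):
+###   from scipy import stats
+###   docTop = buildDict(dates, '-doctopicdist.txt', 'topics/DocDistrib/')
+###   t3 = buildDocTopic(3, .5)                     # topic 3, probabilities above 0.5
+###   pv, xl = oneVsoneDocs(stats.bartlett, 1, t3)  # p-values for consecutive slices
+###   PlotStatsDocs(stats.bartlett, 1, "Bartlett's test", False, .05, [3, 5, 7])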
+ +import numpy as np +from scipy import stats +import matplotlib.pyplot as pl + +def main(): + global docTop + global colors + global vslices + docTop={} + vslices=['Jan13','Feb13','Mar13','Apr13','May13','Jun13','Jul13','Aug13','Sep13','Oct13','Nov13','Dec13', + 'Jan14','Feb14','Mar14','Apr14','May14','Jun14','Jul14','Aug14','Sep14','Oct14','Nov14','Dec14'] + + dates=['2013-01','2013-02','2013-03','2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12', + '2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12'] + # + colors=['yellow', 'orange', 'pink','black','blue','brown','coral','crimson','cyan','darkblue', + 'darkgreen','fuchsia','gold','green','grey','indigo','red','yellowgreen','navy','olive', + 'azure','orchid','beige','plum','purple','lavender','salmon','silver','violet','aqua','magenta'] + + docTop=buildDict(dates,'-doctopicdist.txt','topics/DocDistrib/') + plotDocvsTop(.5) ## plot the number of topics per topic per time slice + ## plot the p-value from bartlett test for all topics from the first + ## time slice + PlotStatsDocs(stats.bartlett,1,"Bartlett's test",False,.05,[3,5,7,9,11]) + +### build a dictionary with all probabilities and documents +### docTop size is 24 (24 slices of time) +### doctop[n] is 20 (20 topics in slice n) +### doctop[n][t] is the probabilities of documents to be associated to +### topic t in slice n +def buildDict(dates,filesuf,fileprex=''): + #### topics vs docs + docT={} + for i in range(len(dates)): + file=fileprex+dates[i]+filesuf + lines=[line.split() for line in open(file)] + matprob=[] + for j in range(len(lines)): + probs=[float(n) for n in lines[j]] + matprob.append(probs[1:]) + matprob=np.array(matprob) + docT[i]=matprob + return docT +### Build the doc probabilities by topic + +## plot the number of documents per topic in each slice of time +## thr is the probability threshold +def plotDocvsTop(thr=0.01): + plist=[] + for i in range(len(docTop[0])): + p=[] + for j in range(len(docTop)): + p.append(len([i for i in docTop[j][i] if i > thr])) + plist.append(pl.plot(p)) + ltup=() + ctup=() + for i in range(len(plist)): + ntup=(plist[i][0],) + ltup=ltup+ntup + ntup=('T'+str(i),) + ctup=ctup+ntup + pl.legend(ltup,ctup) + pl.ylabel('# of documents') + pl.xticks(np.arange(0,24),np.arange(1,24)) + pl.grid(True) + pl.title('Documents per topics\n Threshold: '+str(thr)) + pl.show() + + +### build list for the documents probabilities in a given topic (tId) +### and a given threshol (thres) +### the list NxP, where N is the slice number and P the documents +### probabilities for tId in that slice (P is not fixed in the slices) +def buildDocTopic(tId,thres=0.001): + tn=[] ## stores the probabilities + for i in range(24): + tn.append([i for i in docTop[i][tId] if i > thres]) + return tn + +def oneVsallDocsPlot(fct,start,data,tp,label,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[start],data[start+i],**args) + pval.append(p) + pl.plot(pval) + pl.xticks(np.arange(0,len(Xlabels)),Xlabels,rotation=90) + pl.title(label+"\nTopic: "+str(tp)) + pl.show() + + + +### Returns the p-value for fct (stats.), comparing (1 is the first) to all following time slices +### for the document probabilities in a 
topic defined by data +### args is used as parameters for fct (if it is necessary) +def oneVsallDocs(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[start],data[start+i],**args) + pval.append(p) + return pval,Xlabels + +### Returns the p-value for fct (stats.), from (1 is the first) to all following time slices +### comparing two slices in a row: jan13xfev13, fev13xmar13, and so on +### for the document probabilities in a topic defined by data +### args is used as parameters for fct (if it is necessary) +def oneVsoneDocs(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start,len(vslices)-1): + Xlabels.append(vslices[i]+'x'+vslices[i+1]) ### labels + + pval=[] + for i in range(start,len(Xlabels)+start): + if not args: + t,p=fct(data[i],data[i+1]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[i],data[i+1],**args) + pval.append(p) + return pval,Xlabels + +### Plots all topics in slices from (1 is the first), with p-value +### calculate by fct. label is the name of the statistical function to +### appear in the title, topics is a list of topics to be plot (none=all) +### args is to be passed to fct +def PlotStatsDocs(fct,start,label,oneVsall=True,th=0,topics=None,**args): + t=[] + eTop=[] + if topics is None: + for i in range(20): + eTop.append(i) + else: + eTop=topics + for i in eTop: + t.append(buildDocTopic(i,th)) + # + plist=[] + for i in range(len(t)): + if not args: + if oneVsall: + pv,xl=oneVsallDocs(fct,start,t[i]) + else: + pv,xl=oneVsoneDocs(fct,start,t[i]) + else: + if oneVsall: + pv,xl=oneVsallDocs(fct,start,t[i],**args) + else: + pv,xl=oneVsoneDocs(fct,start,t[i],**args) + plist.append(pl.plot(pv,color=colors[i])) + # + ltup=() + ctup=() + j=0 + for i in range(len(plist)): + ntup=(plist[i][0],) + ltup=ltup+ntup + eTop[j] + ntup=('T'+str(eTop[j]),) + j=j+1 + ctup=ctup+ntup + # + pl.legend(ltup,ctup) + pl.ylabel('p-value') + pl.grid(True) + pl.xticks(np.arange(0,len(xl)),xl,rotation=90) + pl.title(label+'\nDocuments in Topics - Threshold: '+str(th)) + pl.show() + +if __name__ == '__main__': + main() diff --git a/plotFunctionsUser.py b/plotFunctionsUser.py new file mode 100644 index 0000000..677ac7e --- /dev/null +++ b/plotFunctionsUser.py @@ -0,0 +1,256 @@ +### How to use it: +### The dictionary userTop must be populated: userTop=buildDict(dates,'-topicuserdist.txt','topics/') +### in the example dates are the dates of the files, '-topicuserdist.txt' are the suffix and 'topics/' the folder (optional) +### Available functions: +### buildTopic(tpId, threshold) builds a list SxU (S: # of slice time, U: # of users) +## with the probabilities associated to users +### plotUservsTop (threshold), plots the number of user by topic in all time slices +### oneVsone (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_i x slice_i+1, slice_i+1 x slice_i+2 ....) +### and a list for labeling X axis +### +### oneVsall (statistic function, first time slice ,data returned by buildTopic,arguments for statistic function) +### returns a list of p-values of the samples (slice_first x slice_i+1, slice_first x slice_i+2 ....) 
+### and a list for labeling X axis +### PlotStats(statisc function, start, label of the graph, call oneVsAll (True) or oneVsone (False), threshold, list of topics, arguments for function) +### plots a line graph with p-values of the samples being compared +### calls buildTop, oneVsall or oneVsone for every topic and plots the result +### Examples +### t=buildTopic(10,.7) -> t is a list with 24 slices of time and n user probabilities for topic 10 +### PlotStats(stats.ttest_ind,1,"Student's T test") +### plots the p-values using t-test for all topics comparing the first slice with all others +### PlotStats(stats.ttest_ind,1,"Student's T test",False) +### plots the p-values using t-test for all topics comparing the two consective time slices from the fisrt one +### PlotStats(stats.ttest_ind,1,"Student's T test",False,[3,4,5]) +### The same above but it plots only for topics 3,4, and 5. +import numpy as np +from scipy import stats + +import matplotlib.pyplot as pl + +def main(): + global userTop + global colors + global vslices + userTop={} + vslices=['Jan13','Feb13','Mar13','Apr13','May13','Jun13','Jul13','Aug13','Sep13','Oct13','Nov13','Dec13', + 'Jan14','Feb14','Mar14','Apr14','May14','Jun14','Jul14','Aug14','Sep14','Oct14','Nov14','Dec14'] + # + dates=['2013-01','2013-02','2013-03','2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12', + '2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12'] + # + colors=['yellow', 'orange', 'pink','black','blue','brown','coral','crimson','cyan','darkblue', + 'darkgreen','fuchsia','gold','green','grey','indigo','red','yellowgreen','navy','olive', + 'azure','orchid','beige','plum','purple','lavender','salmon','silver','violet','aqua','magenta'] + + userTop=buildDict(dates,'-topicuserdist.txt','topics/') + plotUservsTop(.5) ## plot the number of topics per user as time goes by + ## plot the p-value from bartlett test for all topics from the first + ## time slice + PlotStats(stats.bartlett,1,"Bartlett's test",False) + + + +### Create a dict "userTop" with all the probabilities +### of a user to be associated to topics in 24 slices of time +### userTop[0] refers to the first time slice +#### userTop[0][0] refers to users probabilities in the first topic in time slice 0 +#### +def buildDict(dates,filesuf,fileprex=''): + userT={} + for i in range(len(dates)): + file=fileprex+dates[i]+filesuf + lines=[line.split() for line in open(file)] + matprob=[] + for j in range(len(lines)): + probs=[float(n) for n in lines[j]] + matprob.append(probs) + matprob=np.array(matprob) + userT[i]=matprob + return userT +### + +####################### USERS and TOPICS +def plotUservsTop(trh=0.001): + uvst=[] + for i in range(20): + t=[] + for j in range(len(userTop)): + t.append(np.count_nonzero(userTop[j][i]>trh)) + uvst.append(t) + plist=[] + for topId in range(20): + plist.append(pl.plot(uvst[topId],color=colors[topId])) + ltup=() + ctup=() + for i in range(20): + ntup=(plist[i][0],) + ltup=ltup+ntup + ntup=('T'+str(i),) + ctup=ctup+ntup + pl.legend(ltup,ctup) + pl.xticks(np.arange(0,len(userTop)),np.arange(1,len(userTop)+1)) + pl.ylabel('# of Users') + pl.xlabel('Slices of time') + pl.grid(True) + pl.title('Threshold '+str(trh)) + pl.show() + + +### Extract the user probabilities of the given topic tId for all time slices (24) +def buildTopic(tId,thres=0.001): + tn=[] ## stores the probabilities + for i in range(24): + tn.append(userTop[i][tId,userTop[i][tId]>=thres]) + return 
tn + +### given a stastic function (fct), a initial time slice, a dataset, the label correspondig to fct, and a topic tp +### plot the p-value from the start to point to all older ones. +def oneVsallPlot(fct,start,data,label,tp,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + t,p=fct(data[start],data[start+i],alternative='two-sided') + pval.append(p) + pl.plot(pval) + pl.xticks(np.arange(0,len(Xlabels)),Xlabels,rotation=90) + pl.title(label+"\nTopic: "+str(tp)) + pl.show() + +def oneVsall(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start+1,len(vslices)): + Xlabels.append(vslices[start]+'x'+vslices[i]) ### labels + + pval=[] + for i in range(1,len(Xlabels)+1): + if not args: + t,p=fct(data[start],data[start+i]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[start],data[start+i],**args) + pval.append(p) + return pval,Xlabels + +def oneVsone(fct,start,data,**args): + Xlabels=[] + start=start-1 + for i in range(start,len(vslices)-1): + Xlabels.append(vslices[i]+'x'+vslices[i+1]) ### labels + + pval=[] + for i in range(start,len(Xlabels)+start): + if not args: + t,p=fct(data[i],data[i+1]) + else: + key=list(args)[0] + val=args.get(key) + args={key:val} + t,p=fct(data[i],data[i+1],**args) + pval.append(p) + return pval,Xlabels + + +def PlotStats(fct,start,label,oneXall=True,th=0,topics=None,**args): + t=[] + eTop=[] + if topics is None: + for i in range(20): + eTop.append(i) + else: + eTop=topics + for i in eTop: + t.append(buildTopic(i,th)) + # + plist=[] + for i in range(len(t)): + if not args: + if oneXall: + pv,xl=oneVsall(fct,start,t[i]) + else: + pv,xl=oneVsone(fct,start,t[i]) + else: + if oneVsall: + pv,xl=oneVsall(fct,start,t[i],**args) + else: + pv,xl=oneVsone(fct,start,t[i],**args) + plist.append(pl.plot(pv,color=colors[i])) + # + ltup=() + ctup=() + j=0 + for i in range(len(plist)): + ntup=(plist[i][0],) + ltup=ltup+ntup + eTop[j] + ntup=('T'+str(eTop[j]),) + j=j+1 + ctup=ctup+ntup + # + pl.legend(ltup,ctup) + pl.ylabel('p-value') + pl.grid(True) + pl.xticks(np.arange(0,len(xl)),xl,rotation=90) + pl.title(label+'\nUsers in Topics - Threshold: '+str(th)) + pl.show() + + +if __name__ == '__main__': + main() + +### Paired test must be done in samples having the same shape + +######## KRUSKAL +## The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal. +## It is a non-parametric version of ANOVA. The test works on 2 or more independent samples, which may have different sizes. +## Note that rejecting the null hypothesis does not indicate which of the groups differs. +## Post-hoc comparisons between groups are required to determine which groups are different. +#ToneVsall(stats.kruskal,15,'Kruskal') +#ToneVsall(stats.kruskal,15,'Kruskal',.5,[9,11,13,15]) ## probability >=.5 and topics 9,11,13,15 + + +##### STUDENT'S T TEST +## Calculates the T-test for the means of two independent samples of scores. +## This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values. +## This test assumes that the populations have identical variances by default. 
+#ToneVsall(stats.ttest_ind,15,"Student's t test") + +##### MANN-WHITNEY U +## The Mann-Whitney U test is a nonparametric test that allows two groups or conditions or treatments to be +## compared without making the assumption that values are normally distributed. +## So, for example, one might compare the speed at which two different groups of people can run 100 metres, +## where one group has trained for six weeks and the other has not. +#ToneVsall(stats.mannwhitneyu,15,'Mann Whitney',alternative='two-sided') +#ToneVsall(stats.mannwhitneyu,15,'Mann Whitney',.5,[9,11,13,15],alternative='two-sided') + +##### KOLMOGOROV-SMIRNOV +## The Kolmogorov-Smirnov test (KS-test) tries to determine if two datasets differ significantly. +## The KS-test has the advantage of making no assumption about the distribution of data. (Technically speaking it is non-parametric and distribution free.) +## Note however, that this generality comes at some cost: other tests (for example Student's t-test) may be more sensitive if the +## data meet the requirements of the test. +#ToneVsall(stats.ks_2samp,15,'Kolmogorov-Smirnov') + + + +### ToneVsall(stats.wilcoxon,15,'Wilcoxon') Not for samples with different sizes +### ToneVsall(stats.ttest_rel,15,"T test on Two Related") Not for samples with different sizes + +### BARTLETT +## Bartlett’s test tests the null hypothesis that all input samples are from populations with equal variances +#ToneVsall(stats.bartlett,15,"Bartlett's test") + +### LEVENE +## Perform Levene test for equal variances. +## The Levene test tests the null hypothesis that all input samples are from populations with equal variances. +## Levene’s test is an alternative to Bartlett’s test bartlett in the case where there are significant deviations from normality. 
+#ToneVsall(stats.levene,15,"Levene's test") diff --git a/prepareQuarter.sh b/prepareQuarter.sh new file mode 100755 index 0000000..b316e3a --- /dev/null +++ b/prepareQuarter.sh @@ -0,0 +1,52 @@ +#!/bin/bash +echo "This script will prepare the stackoverflow data with time slice equals a quarter" + + + +# Data must already be download, and folder rawdata must exist (as well 2013.Posts.xml and 2014.Posts.xml) +cd rawdata + +echo "splitting files 2013 in quarters" +M20131st=`awk '/CreationDate=\"2013-01/ {print NR; exit}' 2013.Posts.xml` +M20132nd=`awk '/CreationDate=\"2013-04/ {print NR; exit}' 2013.Posts.xml` +LAST=`expr $M20132nd - 1` +awk 'NR=='$M20131st', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-1stQ.Posts.xml +echo "2013-1st quarter done" + + +M20133rd=`awk '/CreationDate=\"2013-07/ {print NR; exit}' 2013.Posts.xml` +LAST=`expr $M20133rd - 1` +awk 'NR=='$M20132nd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-2ndQ.Posts.xml +echo "2013-2nd Quarter done" + +M20134th=`awk '/CreationDate=\"2013-10/ {print NR; exit}' 2013.Posts.xml` +LAST=`expr $M20134th - 1` +awk 'NR=='$M20133rd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2013.Posts.xml > 2013-3rdQ.Posts.xml +echo "2013-3rd Quarter done" + +awk 'NR>='$M20134th 2013.Posts.xml > 2013-4thQ.Posts.xml +echo "2013-4th Quarter done" + +#### +echo "splitting files 2014 in quarters" +M20141st=`awk '/CreationDate=\"2014-01/ {print NR; exit}' 2014.Posts.xml` +M20142nd=`awk '/CreationDate=\"2014-04/ {print NR; exit}' 2014.Posts.xml` +LAST=`expr $M20142nd - 1` +awk 'NR=='$M20141st', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-1stQ.Posts.xml +echo "2014-1st quarter done" + + +M20143rd=`awk '/CreationDate=\"2014-07/ {print NR; exit}' 2014.Posts.xml` +LAST=`expr $M20143rd - 1` +awk 'NR=='$M20142nd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-2ndQ.Posts.xml +echo "2014-2nd Quarter done" + +M20144th=`awk '/CreationDate=\"2014-10/ {print NR; exit}' 2014.Posts.xml` +LAST=`expr $M20144th - 1` +awk 'NR=='$M20143rd', NR=='$LAST'-1; NR=='$LAST' {print; exit}' 2014.Posts.xml > 2014-3rdQ.Posts.xml +echo "2014-3rd Quarter done" + +awk 'NR>='$M20144th 2014.Posts.xml > 2014-4thQ.Posts.xml +echo "2014-4th Quarter done" + +cd ..
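
Note on prepareQuarter.sh: the awk commands above locate the first row of each quarter in <year>.Posts.xml and rely on rows being ordered by CreationDate. A rough, hypothetical Python equivalent (a sketch, not part of the pipeline; the file names and one-row-per-line layout follow the same assumptions SOParser.py makes) would route each row by its CreationDate month instead:

    import re

    QUARTERS = ['1stQ', '2ndQ', '3rdQ', '4thQ']

    def split_year_into_quarters(year, rawdir='rawdata'):
        # One output file per quarter, e.g. rawdata/2013-1stQ.Posts.xml
        outfiles = {q: open("%s/%s-%s.Posts.xml" % (rawdir, year, q), 'w') for q in QUARTERS}
        datepat = re.compile(r'CreationDate="(\d{4})-(\d{2})')
        with open("%s/%s.Posts.xml" % (rawdir, year)) as posts:
            for line in posts:
                m = datepat.search(line)
                if m is None:
                    continue  # skip header/footer lines without a CreationDate
                month = int(m.group(2))
                outfiles[QUARTERS[(month - 1) // 3]].write(line)
        for f in outfiles.values():
            f.close()

    # Usage under those assumptions:
    # split_year_into_quarters(2013); split_year_into_quarters(2014)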