conference/cmtutils.py at master · oxcsml/conference · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import time
import os
import re
import sys
import numpy as np
import pickle
import pandas as pd
import sqlite3

import openpyxl # Requires python-openpyxl
from lxml import etree # for reading from CMT

from HTMLParser import HTMLParser
# General set up.

from pods.util import download_url
from pods.notebook import display_url

# interface to google docs
import pods
from config import *

# Conference-wide settings, read once at import time from the `config`
# object (presumably supplied by `from config import *` above -- confirm).
conf_short_name = config.get('conference', 'short_name')
conf_year = config.get('conference', 'year')
program_chair_email = config.get('conference', 'chair_email')
# Several chair gmail addresses are stored as one ';'-separated string.
program_chair_gmails = config.get('conference', 'chair_gmails').split(';')
# These values may contain environment variables; they are expanded here.
cmt_data_directory = os.path.expandvars(config.get('cmt', 'export_directory'))
buddy_pair_key = os.path.expandvars(config.get('google docs', 'buddy_pair_key'))
global_results_key = os.path.expandvars(config.get('google docs', 'global_results_key'))

# When recruiting reviewers we add in people who were area chairs at ICML
# since 2008, at NIPS since 2001 and at AISTATS since 2011.
# Maps each conference to the years for which area chair information is stored.
recorded_conferences = {'icml': [2008, 2009, 2010, 2011, 2012, 2013, 2014],
                        'nips': [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013],
                        'aistats' : [2011, 2012, 2013, 2014]}

# Helper function for formatting strings.
def my_format(num, length=3):
    """Return ``str(num)`` cut down to at most ``length + 1`` characters.

    The extra character leaves room for the decimal point, so the default
    ``length=3`` turns ``0.123456`` into ``'0.12'``. No rounding is done;
    the text is simply truncated.
    """
    text = str(num)
    limit = length + 1
    return text[:limit]

# HTML Stripper from this stackoverflow post: http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """HTML parser that throws away the markup and keeps only the text.

    Feed it HTML with ``feed`` and read the accumulated text back with
    ``get_data``.
    """
    def __init__(self):
        # Run the base class initialiser instead of calling reset() alone.
        # It calls reset() itself and, on newer Python versions, also sets
        # up parser state (e.g. ``convert_charrefs``) that feed() requires.
        # The explicit base-class call is used (not super()) because the
        # Python 2 HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments collected so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Collect a run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all tags removed.

    A fresh ``MLStripper`` is used for every call so that no text is
    carried over between calls.
    """
    stripper = MLStripper()
    stripper.feed(html)
    plain_text = stripper.get_data()
    return plain_text

def split_names(names, splits_file='name_splits.txt'):
    """
    If we have a spreadsheet that stores the name only, this function
    splits the name into first name, middle initials and last name, which
    is the format that CMT expects.

    Names are split on single spaces unless the full name matches an entry
    in ``splits_file``. Each line of that file is a full name with '|'
    marking where it should be split (e.g. ``Jean Claude|Van Damme``);
    replacing '|' by a space must give the name exactly as it appears in
    ``names``. If several lines match, the first one wins.

    :param names: pandas Series of full names.
    :param splits_file: path of the file listing the special-case splits.
    :returns: tuple ``(firstname, middlenames, lastname)`` of Series that
        share the index of ``names``. The first piece is the first name,
        the last piece is the last name (so a one-word name is returned as
        both), and everything in between is joined with spaces to form the
        middle names ('' if there is nothing in between).
    :raises IOError: if ``splits_file`` cannot be opened.
    """
    with open(splits_file) as f:
        name_splits = f.read().splitlines()

    # Map each special-case full name to its pieces once, rather than
    # rescanning the whole file for every name. setdefault keeps the first
    # entry when a name is listed more than once.
    special_cases = {}
    for split in name_splits:
        special_cases.setdefault(split.replace('|', ' '), split.split('|'))

    first, middle, last = [], [], []
    for full_name in names.values:
        cand_names = special_cases.get(full_name)
        if cand_names is None:
            cand_names = full_name.split(' ')

        first.append(cand_names[0].strip())
        last.append(cand_names[-1].strip())
        # The middle names depend on how many pieces THIS name has, not on
        # how many names are being processed.
        if len(cand_names) > 2:
            middle.append(' '.join(cand_names[1:-1]))
        else:
            middle.append('')

    # dtype=object keeps the results as strings even when names is empty.
    firstname = pd.Series(first, index=names.index, dtype=object)
    middlenames = pd.Series(middle, index=names.index, dtype=object)
    lastname = pd.Series(last, index=names.index, dtype=object)
    return firstname, middlenames, lastname

# How to map spreadsheet column titles to data base columns.
# By default every spreadsheet column maps to the data base column of the
# same name.
default_mapping = {
    'FirstName': 'FirstName',
    'MiddleNames': 'MiddleNames',
    'LastName': 'LastName',
    'Institute': 'Institute',
    'Email': 'Email',
    'ScholarID': 'ScholarID',
    'Nominator': 'Nominator',
}


class review_report:
    """
    Class that looks at calibrated reviews and generates text reports or
    email reports based on review summaries.

    ``self.reviews`` is a data frame indexed by paper ID (as a string) with
    one row per review. The methods below read the columns ``Title``,
    ``Quality``, ``CalibratedQuality``, ``Conf``, ``Impact``,
    ``AcceptProbability``, ``FirstName``, ``LastName``, ``Email``,
    ``Comments``, ``comment_length`` and ``Number Of Discussions``.

    Issues are encoded as strings: either a bare key of
    ``self.attention_scoring`` (e.g. ``'large_span'``) or, for issues tied
    to one reviewer, ``'<key>+<reviewer email>'``.
    """
    def __init__(self, filename=None,
                 calibrated_reviews=None,
                 attention_scoring=None,
                 light_grey=(0.1, 0.9),
                 firm_grey=(0.3, 0.7),
                 expected_reviews=3,
                 short_review_percentile=5.,
                 very_short_review_percentile=1.):
        """
        Exactly one of ``filename`` and ``calibrated_reviews`` must be given.

        :param filename: CSV file of calibrated reviews, relative to
            ``cmt_data_directory``.
        :param calibrated_reviews: data frame of calibrated reviews.
        :param attention_scoring: dictionary mapping issue name to the score
            it contributes; a default table is used if None.
        :param light_grey: [low, high) accept probability band treated as
            possibly borderline.
        :param firm_grey: [low, high) accept probability band treated as
            firmly borderline.
        :param expected_reviews: number of reviews each paper should have.
        :param short_review_percentile: percentile of comment length below
            which a review is flagged as short.
        :param very_short_review_percentile: percentile of comment length
            below which a review is flagged as very short.
        :raises ValueError: if neither or both data sources are given.
        """
        if (filename is None) == (calibrated_reviews is None):
            raise ValueError("You need to provide either filename or calibrated_reviews as keyword arguments")

        if calibrated_reviews is None:
            # PaperID is read as text so that it can be used as a string index.
            calibrated_reviews = pd.read_csv(os.path.join(cmt_data_directory, filename),
                                             dtype={'PaperID':object})
            calibrated_reviews.set_index(keys='PaperID', inplace=True)
            calibrated_reviews.fillna('', inplace=True)

        self.reviews = calibrated_reviews
        self.short_review_percentile = short_review_percentile
        self.very_short_review_percentile = very_short_review_percentile
        self.comment_length_very_low_threshold = self.reviews.comment_length.quantile(self.very_short_review_percentile/100.)
        self.comment_length_low_threshold = self.reviews.comment_length.quantile(self.short_review_percentile/100.)
        self.high_impact_threshold = 1.5
        # Copy the bands so that instances never share (or alias) the
        # caller's list or the default value.
        self.light_grey_area = list(light_grey)
        self.firm_grey_area = list(firm_grey)
        self.expected_reviews = expected_reviews
        if attention_scoring is None:
            self.attention_scoring = {'one_review':40,
                                      'too_few_reviews':14,
                                      'firm_grey_area':7,
                                      'light_grey_area':3,
                                      'likely_accept':2,
                                      'talk':7,
                                      'very_large_span':5,
                                      'large_span':2,
                                      'very_low_confidence':5,
                                      'low_confidence':2,
                                      'very_short_review':7,
                                      'short_review':4}
        else:
            self.attention_scoring = attention_scoring

    def email_html_comments(self, sendto_dict,
                            intro=None,
                            closing=None,
                            subject=None,
                            attention_threshold=0,
                            cc_list=None,
                            rankby='attention_score'):
        """Email html comments to area chairs.

        ``generate_html_comments`` must have been called first, because the
        mails are built from ``self.attention_report``.

        :param sendto_dict: dictionary mapping recipient email address to
            the list of paper IDs to report on.
        :param intro: html placed before the paper reports (default text if None).
        :param closing: html placed after the paper reports (default text if None).
        :param subject: mail subject (default text if None).
        :param attention_threshold: only papers whose attention score is
            strictly above this value are included.
        :param cc_list: addresses to copy in; defaults to the program chair.
        :param rankby: column of the attention report to rank papers by,
            highest first.

        Nothing is sent unless the user answers 'Y' at the prompt, and no
        mail is sent to a recipient with no papers above the threshold.
        """
        # NOTE(review): chair_informal_names, gmail, gmail_account and
        # session are not defined in the part of this file under review;
        # presumably they arrive via `from config import *` -- confirm,
        # `session` in particular.
        if cc_list is None:
            # Resolved at call time so the default is never shared or mutated.
            cc_list = [program_chair_email]
        if subject is None:
            subject = conf_short_name + ": Report on Papers which May Need Attention"
        if intro is None:
            intro = """Hi,<br><br>

            This is an automated report that tries to identify problems with
            papers that need attention. You may already be on top of these
            issues, but this report may still be helpful.<br><br>

            The report ranks papers by an 'attention score' to try and order
            which papers require most attention.<br><br>

            Calibrated quality scores are scores that take the estimated
            'reviewer bias' into account. The probability of accept is based
            on a probabilistic model that accounts for the reviewer bias we
            estimated and the associated uncertainty.<br><br>"""

        if closing is None:
            closing = """<br><br>

            Regards,<br><br>


            """ + chair_informal_names + "<br>\n" + conf_short_name + ' ' + conf_year + " Program Chairs"

        for email, papers in sendto_dict.items():
            print("Sending mails summarizing papers " + ', '.join(papers) + ' to ' + email)

        # raw_input only exists on Python 2; input is its Python 3 equivalent.
        try:
            ask = raw_input
        except NameError:
            ask = input
        ans = ask('Are you sure you want to send mails (Y/N)?')
        if ans != 'Y':
            return

        mailer = gmail.email(gmail_username=gmail_account)
        for email, papers in sendto_dict.items():
            selected = self.attention_report.loc[papers]
            selected = selected[selected.attention_score > attention_threshold]
            selected = selected.sort_values(by=rankby, ascending=False)
            body = ''.join(selected.comments)
            if body:
                email_text = intro + body + closing
                mailer.send(session=session,recipient=email, cc=cc_list, body=email_text, subject=subject, reply_to=program_chair_email)

    def _repr_html_(self):
        """Return an HTML representation of the report.

        At most ``max_return`` papers are shown, highest attention score
        first; a note is appended if any papers were left out.
        """
        max_return = 50
        ranked = self.attention_report.sort_values(by='attention_score', ascending=False)
        html = ''.join(ranked.comments.iloc[:max_return])
        if len(ranked) > max_return:
            html += '<br><br><b>Report continues, only ' + str(max_return) + ' papers shown ...</b>'
        return html

    def attention_score(self, paper, issues=None):
        """Compute the attention score for a given paper.

        The score is the sum of the ``self.attention_scoring`` entries of
        the paper's issues. ``issues`` is computed if not given.
        """
        if issues is None:
            issues = self.issues(paper)
        # Only the part before the first '+' names the issue; the rest (if
        # any) is the reviewer's email address.
        return sum(self.attention_scoring[issue.split('+', 1)[0]] for issue in issues)

    def generate_comments(self):
        """Generate a paragraph of comments for each paper.

        Sets ``self.comments`` to the html comments with the tags removed.
        """
        self.generate_html_comments()
        self.comments = {paper: strip_tags(html)
                         for paper, html in self.html_comments.items()}

    def generate_html_comments(self):
        """Generate html comments for each paper.

        Sets ``self.html_comments`` (paper ID to html) and
        ``self.attention_report`` (columns ``comments`` and
        ``attention_score``, sorted by decreasing attention score).
        """
        self.html_comments = {}
        attention_scores = {}
        for paper in set(self.reviews.index):
            # A list indexer always gives a data frame, also for one review.
            p = self.reviews.loc[[paper]]
            issues = self.issues(paper)
            attention_score = self.attention_score(paper, issues)
            html_comment = self.generate_html_comment(paper, issues)
            title = p.Title.iloc[0]
            html_comment = '\n<h3>Paper '  + paper + ' ' + title + '</h3>\n\n' + html_comment + '<br>\nAttention Score: ' + str(attention_score)

            self.html_comments[paper] = html_comment
            attention_scores[paper] = attention_score
        report = pd.DataFrame({'comments': pd.Series(self.html_comments), 'attention_score':pd.Series(attention_scores)})
        self.attention_report = report.sort_values(by='attention_score', ascending=False)

    def spreadsheet_comments(self):
        """Generate comments suitable for placing in a spreadsheet.

        Sets ``self.attention_report`` to a data frame with one row per
        paper, sorted by decreasing attention score and then decreasing
        ``prob_accept``. ``prob_accept`` is stored as text, so that
        secondary ordering is textual. The ``notes``, ``accept``, ``talk``
        and ``spotlight`` columns are left empty.
        """
        comments = {}
        attention_scores = {}
        quality_scores = {}
        confidence_scores = {}
        calibrated_quality_scores = {}
        impact_scores = {}
        reviewer_list = {}
        prob_accept = {}
        paper_title = {}
        notes = {}
        accept = {}
        talk = {}
        spotlight = {}
        discussions = {}
        for paper in set(self.reviews.index):
            # A list indexer always gives a data frame, so papers with a
            # single review are handled like all the others.
            p = self.reviews.loc[[paper]]
            attention_scores[paper] = self.attention_score(paper)
            quality_scores[paper] = ','.join(map(str, p.Quality))
            confidence_scores[paper] = ','.join(map(str, p.Conf))
            impact_scores[paper] = ','.join(map(str, p.Impact))
            calibrated_quality_scores[paper] = ','.join(map(my_format, p.CalibratedQuality))
            comments[paper] = self.summary_comment(paper)
            reviewer_list[paper] = ','.join(first + ' ' + last
                                            for first, last in zip(p.FirstName, p.LastName))
            # These values are per paper, so the first review's copy is used.
            paper_title[paper] = p.Title.iloc[0]
            prob_accept[paper] = my_format(p.AcceptProbability.iloc[0], 5)
            notes[paper] = ''
            talk[paper] = ''
            spotlight[paper] = ''
            accept[paper] = ''
            discussions[paper] = p['Number Of Discussions'].iloc[0]
        report = pd.DataFrame({'comments': pd.Series(comments),
                               'attention_score':pd.Series(attention_scores),
                               'quality':pd.Series(quality_scores),
                               'calibrated_quality': pd.Series(calibrated_quality_scores),
                               'confidence':pd.Series(confidence_scores),
                               'impact':pd.Series(impact_scores),
                               'reviewers':pd.Series(reviewer_list),
                               'paper_title':paper_title,
                               'prob_accept':prob_accept,
                               'notes':notes,
                               'talk':talk,
                               'spotlight':spotlight,
                               'accept':accept,
                               'discussions':discussions})

        column_presentation_order = ['paper_title',
                                     'prob_accept',
                                     'attention_score',
                                     'discussions',
                                     'reviewers',
                                     'quality',
                                     'calibrated_quality',
                                     'confidence',
                                     'impact',
                                     'comments',
                                     'notes',
                                     'accept',
                                     'talk',
                                     'spotlight']

        column_sort_order = ['attention_score', 'prob_accept']

        report = report[column_presentation_order]
        self.attention_report = report.sort_values(by=column_sort_order, ascending=False)

    def issues(self, paper):
        """Identify the potential issues with a given paper.

        :returns: list of issue strings (see the class docstring). A paper
            with fewer than two reviews (and fewer than expected) gets the
            single issue ``'one_review'`` and no further checks.
        """
        paper = str(paper)
        p = self.reviews.loc[[paper]]
        issues = []

        # Check for requisite number of reviews
        num_revs = len(p)
        if num_revs < self.expected_reviews:
            if num_revs < 2:
                issues.append('one_review')
                return issues
            issues.append('too_few_reviews')
        prob = p.AcceptProbability.mean()

        # Check for whether the paper is borderline
        if self.light_grey_area[0] <= prob < self.light_grey_area[1]:
            if self.firm_grey_area[0] <= prob < self.firm_grey_area[1]:
                issues.append('firm_grey_area')
            else:
                issues.append('light_grey_area')

        # Check if paper is likely to be accepted
        if prob >= self.light_grey_area[1]:
            issues.append('likely_accept')

        # Check if paper is high impact and likely to be accepted
        impact = p.Impact.mean()
        if impact >= self.high_impact_threshold and prob >= self.firm_grey_area[1]:
            issues.append('talk')

        # Check the span of the reviews
        review_span = p.Quality.max() - p.Quality.min()
        if review_span > 3:
            issues.append('very_large_span')
        elif review_span > 2:
            issues.append('large_span')

        # Check for reviewer confidence and review length.
        if num_revs > 1:
            for _, review in p.iterrows():
                if review.Conf < 2:
                    issues.append('very_low_confidence' + '+' + review.Email)
                elif review.Conf < 3:
                    issues.append('low_confidence' + '+' + review.Email)
                if review.comment_length < self.comment_length_very_low_threshold:
                    issues.append('very_short_review' + '+' + review.Email)
                elif review.comment_length < self.comment_length_low_threshold:
                    issues.append('short_review' + '+' + review.Email)
        return issues

    def base_comments(self, paper):
        """Give general comments about the paper, ignoring specific issues.

        :returns: html listing the paper's quality, calibrated quality,
            confidence and impact scores and its mean accept probability.
        """
        paper = str(paper)
        # A list indexer always gives a data frame, so one review and many
        # reviews are formatted by the same code.
        p = self.reviews.loc[[paper]]

        base_comments = 'Quality scores: ' + ', '.join(map(str, p.Quality)) + '<br>\n'
        base_comments += 'Calibrated quality scores: ' + ', '.join(map(my_format, p.CalibratedQuality)) + '<br>\n'
        base_comments += 'Confidence scores: ' + ', '.join(map(str, p.Conf)) + '<br>\n'
        base_comments += 'Impact scores: ' + ', '.join(map(str, p.Impact)) + '<br>\n'

        base_comments += "<br>\nSome things to consider:<br>\n"
        prob = p.AcceptProbability.mean()
        base_comments += "Accept probability for this paper is <b>" + my_format(100*prob) + '%</b>.<br>\n'
        return base_comments

    def generate_html_comment(self, paper, issues=None):
        """ Generate html formatted comments for a specific paper.

        :returns: the base comments followed by one html sentence per issue.
        """

        # The comments dictionary declares the comments to be used.
        comments = {}
        comments['one_review'] = "This paper only has <b>ONE REVIEW</b>!<br>\nYou need to sort that out as soon as possible!<br>\n"
        comments['too_few_reviews'] = "This paper only has {num_revs} reviews.<br>\n"
        comments['firm_grey_area'] = "The paper is firmly in the <b>grey area</b> and will need discussion at teleconference.<br>\n"
        comments['light_grey_area'] = "The paper may be in the <b>grey area</b> and may need discussion at teleconference.<br>\n"
        comments['likely_accept'] = ""
        comments['talk'] = "This paper is likely to be accepted and is currently rated high impact, <b>would it make an appropriate talk or spotlight</b>?<br>\n"
        span = "Difference between max and minimum review score is {review_span}."
        comments['very_large_span'] = span  + ' This is a <b>very large span</b>, the reviewers need to try and discuss the reason for their differences of opinion. If it is resolved scores should be modified to reflect this.<br>\n'
        comments['large_span'] = span + ' This is a large span. Reviews should try and discuss and resolve (adjusting scores if necessary).<br>\n'
        comments['very_low_confidence'] = "Reviewer {reviewer} only has confidence of {reviewer_confidence}.<br>\n"
        comments['low_confidence'] = "Reviewer {reviewer} only has confidence of {reviewer_confidence}.<br>\n"
        reviewer_length = "Reviewer {reviewer} only has a review of length {comment_length} characters."
        review = "It reads as follows:<br>\n<quote>{comment}</quote><br>\n"

        comments['very_short_review'] = reviewer_length + " This is in the <b>shortest " + str(self.very_short_review_percentile) + "% percentile</b> of all our reviews.<br>\n" + review
        comments['short_review'] = reviewer_length + " This is in the <b>shortest " + str(self.short_review_percentile) + "% percentile</b> of all our reviews.<br>\n" + review

        base_comments = self.base_comments(paper)
        return base_comments + self.comment(paper, comments, issues)

    def summary_comment(self, paper, issues=None):
        """ Generate short summary comment for a specific paper. These comments are suitable for spreadsheet entry."""

        comments = {}
        comments['one_review'] = "ONE REVIEW! "
        comments['too_few_reviews'] = "{num_revs} reviews. "
        comments['firm_grey_area'] = "Firm grey area. "
        comments['light_grey_area'] = "Outer-grey area. "
        comments['likely_accept'] = "Likely accept. "
        comments['talk'] = "Talk or Spotlight? "
        comments['very_large_span'] = "Large review span of {review_span}. "
        comments['large_span'] = "Review span of {review_span}. "
        comments['very_low_confidence'] = "{reviewer} confidence is {reviewer_confidence}. "
        comments['low_confidence'] = "{reviewer} confidence is {reviewer_confidence}. "
        comments['very_short_review'] = "{reviewer} comments only {comment_length} chars long. "
        comments['short_review'] = "{reviewer} comments only {comment_length} chars long. "
        return self.comment(paper, comments, issues)


    def comment(self, paper, comment_dict, issues=None):
        """Generate comments for given paper given a dictionary of comments for specific issues.

        :param comment_dict: maps issue name to a ``str.format`` template.
            Templates may use ``{num_revs}`` and ``{review_span}``;
            templates of per-reviewer issues may also use ``{reviewer}``,
            ``{comment}``, ``{comment_length}`` and ``{reviewer_confidence}``.
        :param issues: issue strings for the paper; computed if None.
        :returns: the formatted comments of all issues, concatenated.
        """
        if issues is None:
            issues = self.issues(paper)
        paper = str(paper)
        p = self.reviews.loc[[paper]]
        num_revs = len(p)
        review_span = p.Quality.max() - p.Quality.min()

        # Per-reviewer details, keyed by the reviewer's email address.
        review_tag = {}
        review_comments = {}
        review_confidence = {}
        for _, review in p.iterrows():
            review_tag[review.Email] = review.FirstName + ' ' + review.LastName + ' (' + review.Email + ')'
            review_comments[review.Email] = review.Comments
            review_confidence[review.Email] = review.Conf

        pieces = []
        for issue in issues:
            # Split on the first '+' only: email addresses may contain '+'.
            s = issue.split('+', 1)
            if len(s) > 1:
                reviewer = s[1]
                c = comment_dict[s[0]].format(reviewer=review_tag[reviewer],
                                              comment=review_comments[reviewer],
                                              comment_length=len(review_comments[reviewer]),
                                              reviewer_confidence=review_confidence[reviewer],
                                              num_revs=num_revs,
                                              review_span=review_span)
            else:
                c = comment_dict[s[0]].format(num_revs=num_revs, review_span=review_span)
            pieces.append(c)
        return ''.join(pieces)

class reviewers:
    """
    Reviewer class that combines information from the local data base
    and exports from CMT on the reviewer subject areas to characterize the
    reviewers for paper matching.

    After construction ``self.users`` holds the users known to both CMT
    and the local data base, and ``self.subjects`` holds one table for
    each of 'Primary' and 'Secondary'.
    """
    def __init__(self, directory=None, filename='users.xls', subject_file='Reviewer Subject Areas.xls'):
        """
        :param directory: directory of the CMT exports; defaults to
            ``cmt_data_directory``.
        :param filename: CMT user export file.
        :param subject_file: CMT reviewer subject area export file.
        """
        if directory is None:
            directory = cmt_data_directory
        self.directory = directory
        self.subjects = {}
        self.load(filename=filename)
        print("Loaded Users.")
        self.load_subjects(filename=subject_file)
        print("Loaded Reviewer Subjects.")

    def load(self, filename='users.xls', localdb='reviewers.db'):
        """Load the CMT user export and join it with the local data base.

        :param filename: CMT user export file, passed on to ``xl_read``.
        :param localdb: sqlite file (inside ``cmt_data_directory``) whose
            ``Reviewers`` table holds the locally stored information.
        """
        a = xl_read(filename=filename, header=2, index_col='Email', dataframe=True, lower_index=True)
        cmt_users = a.items
        # Now load in the local store of information
        # NOTE(review): this reads from cmt_data_directory rather than
        # self.directory -- confirm that is intended.
        con = sqlite3.connect(os.path.join(cmt_data_directory, localdb))
        try:
            local_users = pd.read_sql('SELECT * from Reviewers', con, index_col='Email')
        finally:
            # The data is fully read into the data frame, so the connection
            # is released even if the query fails.
            con.close()

        # Build a user data base which has the shared info. Columns present
        # in both sources get the suffix '_a' on the local data base side.
        self.users = cmt_users.join(local_users, how='inner', rsuffix='_a')

    def load_subjects(self, filename='Reviewer Subject Areas.xls'):
        """Load the reviewer's chosen subject areas from the CMT export file.

        Sets ``self.subjects['Primary']`` and ``self.subjects['Secondary']``
        to tables with one (lower-cased) column per reviewer email, holding
        1 where the entry has that status and 0 otherwise.
        """
        data = xl_read(filename=os.path.join(self.directory, filename), index_col='Selected Subject Area', header=2, dataframe=True, worksheet_number=1)
        data.items.reset_index(inplace=True)
        data.items['index'] = data.items.index
        self.subjects = {}
        stati = ['Primary', 'Secondary']
        for status in stati:
            self.subjects[status] = data.items.pivot(index='index', columns='Email', values='Primary or Secondary')
            # Turn the status labels into indicators: 1 for this status, 0
            # for the other status and for missing entries.
            self.subjects[status].replace(to_replace=status, value=1, inplace=True)
            self.subjects[status].replace(to_replace=list(set(stati) - set([status])), value=[0], inplace=True)
            self.subjects[status].fillna(0, inplace=True)
            # Calling .lower() on each name works for both byte and unicode
            # strings, and the list works where map() returns an iterator.
            self.subjects[status].columns = [column.lower() for column in self.subjects[status].columns]

class papers:
    """
    Paper class that loads information from CMT about the papers'
    subject areas for use in paper to reviewer matching.

    After construction ``self.papers`` holds the paper metadata and
    ``self.subjects`` holds one table for each of 'Primary' and
    'Secondary'.
    """
    def __init__(self, directory=None, filename='Papers.xls'):
        """
        :param directory: directory of the CMT exports; defaults to
            ``cmt_data_directory``.
        :param filename: CMT paper metadata export file.
        """
        self.directory = cmt_data_directory if directory is None else directory
        self.subjects = {}
        self.load(filename)
        print("Loaded Papers.")
        self.load_subjects()
        print("Loaded Paper Subjects.")

    def load(self, filename='Papers.xls'):
        """Load in the information about the papers, abstracts, titles, authors etc from CMT exports. `Submissions -> View Active Papers -> Export -> Metadata as Excel`"""
        export = xl_read(filename=filename, header=2, index_col='ID', dataframe=True)
        self.papers = export.items

    def load_subjects(self, filename = 'Paper Subject Areas.xls'):
        """Load paper subject areas from a CMT export file.

        Sets ``self.subjects['Primary']`` and ``self.subjects['Secondary']``
        to tables with one row per subject area and one column per paper
        ID, holding 1 where the entry has that status and 0 otherwise.
        """
        export_path = os.path.join(self.directory, filename)
        data = xl_read(filename=export_path, index_col='Paper ID', dataframe=True, worksheet_number=1)
        data.items.reset_index(inplace=True)
        data.items.rename(columns={'index':'Paper ID'}, inplace=True)

        stati = ['Primary', 'Secondary']
        self.subjects = {}
        for status in stati:
            other_stati = list(set(stati) - set([status]))
            table = data.items.pivot(index='Selected Subject Area', columns='Paper ID', values='Primary or Secondary')
            # Turn the status labels into indicators: 1 for this status, 0
            # for the other status and for missing entries.
            table.replace(to_replace=status, value=1, inplace=True)
            table.replace(to_replace=other_stati, value=[0], inplace=True)
            table.fillna(0, inplace=True)
            self.subjects[status] = table

class similarities:
    """
    Similarities class, given a papers class object in submissions and
    a reviewers class object as reviewers it computes the similarity
    measures by loading in bids and TPMS scores and matching reviewers
    to papers by subject similarities. It is then used in an
    allocation object to perform paper allocation.

    After construction the following tables (papers as rows, reviewers
    as columns) are available: ``affinity`` (scaled TPMS scores),
    ``bids`` (re-coded bid values), ``subject_similarity`` and the
    combined ``scores``.
    """
    def __init__(self, submissions, reviewers, directory=None):
        if directory is None:
            directory = cmt_data_directory
        self.directory = directory

        self.reviewers = reviewers
        self.submissions = submissions
        # Union of every subject area used by any reviewer or any paper.
        self.subjects = list(set(self.reviewers.subjects['Primary'].index)
                             | set(self.reviewers.subjects['Secondary'].index)
                             | set(self.submissions.subjects['Primary'].index)
                             | set(self.submissions.subjects['Secondary'].index))

        # Pad every subject table (in place) with an all-zero row for each
        # subject it does not mention yet, so all four tables cover the
        # same set of subjects.
        for subjects in [self.reviewers.subjects, self.submissions.subjects]:
            for group in ['Primary', 'Secondary']:
                missing_subjects = list(set(self.subjects)
                                        -set(subjects[group].index))
                for subject in missing_subjects:
                    vals = np.zeros(subjects[group].shape[1])
                    subjects[group].loc[subject] = vals


        self.load_tpms()
        print("Loaded TPMS scores")
        self.load_bids()
        print("Loaded bids")

        # TAKE CARE OF MISSING TPMS ROWS - AK
        # Papers that have bids but no TPMS row get an all-zero affinity row.
        diff_index = self.bids.index.difference(self.affinity.index)
        for idx in diff_index:
            self.affinity.loc[idx] = 0

        self.compute_subject_similarity()
        self.compute_scores()


    def load_bids(self, filename='Bids.txt'):
        """Load in Bids information. This is obtained through the `Assignments
        & Conflicts -> Automatic Assignment Wizard`. You need to go through
        the wizard process almost until the end. Then select `Export Data for
        Custom Assignment`. Choose the Tab Delimited format and you will
        download a file `Bids.txt`.

        The bid values 0, 1, 2, 3 are re-coded to -1, -0.5, 0.5, 1 and
        missing (paper, reviewer) pairs are set to 0."""
        self.bids = pd.read_csv(os.path.join(self.directory, filename), delimiter='\t', index_col=False, converters={'Email':str.lower, 'PaperID':str})
        self.bids = self.bids.pivot(index='PaperID', columns='Email', values='BidValue') # Moves the column records into a matrix (with lots of missing values)
        # Applied in this order so that a freshly written value is never
        # picked up again by a later replacement.
        for raw_bid, recoded in [(0, -1), (1, -0.5), (2, 0.5), (3, 1)]:
            self.bids.replace(to_replace=raw_bid, value=recoded, inplace=True)
        self.bids.fillna(0, inplace=True)
        # Reviewers and papers without any bid get a neutral (zero) entry.
        self.reviewers_missing_bids = list(set(self.reviewers.users[self.reviewers.users['IsReviewer']=='Yes'].index) - set(self.bids.columns))
        for reviewer in self.reviewers_missing_bids:
            self.bids[reviewer.strip()] = 0.
        self.papers_missing_bids = list(set(self.submissions.papers.index)-set(self.bids.index))
        for paper in self.papers_missing_bids:
            self.bids.loc[paper] = 0.


    def load_tpms(self, filename = 'External Matching Scores(Toronto Paper Matching System).txt'):
        """Load in TPMS information. If you are working with Laurent Charlin
        and TPMS you may have access to the Toronto paper matching
        scores. They are obtained by first running the match `More ->
        External Reviewer Matching -> Submit Papers for Reviewer
        Matching`. And then you can export the data through the
        `Assignments & Conflicts -> Automatic Assignment Wizard`. You
        need to go through the wizard process almost until the
        end. Then select `Export Data for Custom Assignment`. Choose
        the Tab Delimited format and you will download a file
        `External Matching Scores(Toronto Paper Matching System).txt`.

        The scores are stored in ``self.affinity`` and scaled to lie
        between 0 and 1.
        """

        self.affinity = pd.read_csv(os.path.join(self.directory, filename), delimiter='\t', index_col=False, na_values=['N/A'], converters={'PaperID':str}).fillna(0)
        self.affinity.set_index(['PaperID'], inplace=True)
        # Calling .lower() on each name works for both byte and unicode
        # strings and always yields a real list.
        self.affinity.columns = [name.lower() for name in self.affinity.columns]
        for reviewer in list(set(self.reviewers.users[self.reviewers.users['IsReviewer']=='Yes'].index) - set(self.affinity.columns)):
            self.affinity[reviewer.strip()] = 0.
        # Scale affinities to be between 0 and 1.
        if self.affinity.size:
            self.affinity -= self.affinity.values.min()
            largest = self.affinity.values.max()
            # When every score is identical the shifted matrix is all zero;
            # dividing by that maximum would turn every entry into NaN.
            if largest > 0:
                self.affinity /= largest


    def compute_subject_similarity(self, alpha=0.5):
        """Compute the similarity between submissions and reviewers by subject
        keyword. Similarities are computed on the basis of keyword
        similarity using primary and secondary keyword matches.

        The result is stored in ``self.subject_similarity`` (papers as
        rows, reviewers as columns); the two ingredients are kept in
        ``self._sim``.

        :param alpha: gives the weighting between primary and secondary keyword match.
        :type alpha: float

        """
        paper_primary = self.submissions.subjects['Primary']
        paper_secondary = self.submissions.subjects['Secondary']
        reviewer_primary = self.reviewers.subjects['Primary']
        reviewer_secondary = self.reviewers.subjects['Secondary']

        # np.dot multiplies row i of one table with row i of the other, so
        # all four tables must list the subjects in the same order. The
        # padding done in __init__ appends missing subjects in arbitrary
        # order, hence the rows are explicitly aligned by subject label.
        subject_order = paper_primary.index
        paper_any = (paper_primary.values
                     + paper_secondary.reindex(index=subject_order, fill_value=0).values)
        reviewer_primary_aligned = reviewer_primary.reindex(index=subject_order, fill_value=0)
        reviewer_any = (reviewer_primary_aligned.values
                        + reviewer_secondary.reindex(index=subject_order, fill_value=0).values)

        self._sim = {}
        self._sim['Primary'] = pd.DataFrame(np.dot(paper_primary.values.T, reviewer_primary_aligned.values),
                                            index=paper_primary.columns,
                                            columns=reviewer_primary.columns)
        secondary = pd.DataFrame(np.dot(paper_any.T, reviewer_any),
                                 index=paper_primary.columns,
                                 columns=reviewer_primary.columns)
        # Down-weight reviewers and papers that list many secondary subjects.
        reviewer_norm = 1/np.sqrt(reviewer_secondary.sum(axis=0)+1)
        paper_norm = 1/np.sqrt(paper_secondary.sum(axis=0)+1)
        secondary = secondary.mul(reviewer_norm, axis='columns')
        secondary = secondary.mul(paper_norm, axis='index')
        self._sim['Secondary'] = secondary
        self.subject_similarity = alpha*self._sim['Primary'] + (1-alpha)*self._sim['Secondary']

    def compute_scores(self, alpha = 0.5, b=1.5):
        """Combine TPMS, subject matching and bids into an overall score.

        :param alpha: weight of the TPMS affinity; the subject similarity gets ``1-alpha``.
        :type alpha: float
        :param b: base of the bid multiplier, the blended score is multiplied by ``b**bid``.
        :type b: float
        """
        self.scores = (alpha*self.affinity + (1-alpha)*self.subject_similarity)
        self.scores = self.scores*b**self.bids

class assignment_diff:
    """
    Stores the difference between two assignments. This is useful for
    finding reviewers who have gained allocations or lost allocations
    between two different assignments. To use it you will need to
    download and store assignment allocation files from CMT regularly.

    A "gain" is an entry present in ``assignment2`` but not in
    ``assignment1``; a "loss" is an entry present in ``assignment1`` but
    not in ``assignment2``.

    * ``gain_paper`` / ``loss_paper`` map a paper to the list of
      reviewers it gained / lost.
    * ``gain_reviewer`` / ``loss_reviewer`` map a reviewer type
      (``'reviewer'`` or ``'metareviewer'``) to a dict from reviewer to
      the list of papers gained / lost.
    """
    def __init__(self, assignment1, assignment2):
        self.gain_paper = {}
        self.loss_paper = {}
        self.gain_reviewer = {}
        self.loss_reviewer = {}

        for reviewer_type in ['reviewer', 'metareviewer']:
            gained, lost = self._compare(assignment1.assignment_paper[reviewer_type],
                                         assignment2.assignment_paper[reviewer_type])
            # NOTE(review): the per-paper dicts are not split by reviewer
            # type, so a metareviewer change for a paper replaces the
            # reviewer change recorded for the same paper.
            self.gain_paper.update(gained)
            self.loss_paper.update(lost)

            gained, lost = self._compare(assignment1.assignment_reviewer[reviewer_type],
                                         assignment2.assignment_reviewer[reviewer_type])
            self.gain_reviewer[reviewer_type] = gained
            self.loss_reviewer[reviewer_type] = lost

    def _compare(self, before, after):
        """Return ``(gained, lost)`` dicts describing how ``after`` differs from ``before``.

        Both arguments map a key to a list of entries. Every key of either
        mapping is examined (a key missing from one side counts as an empty
        list there); keys without any change are left out of the result.
        """
        gained = {}
        lost = {}
        for key in set(before) | set(after):
            old_entries = set(before.get(key, []))
            new_entries = set(after.get(key, []))
            added = sorted(new_entries - old_entries)
            removed = sorted(old_entries - new_entries)
            if added:
                gained[key] = added
            if removed:
                lost[key] = removed
        return gained, lost

    def prod(self, similarities, score_type=None):
        """Compute the similarity score change associated with an assignment difference.

        :param similarities: object providing the score tables ``scores``,
            ``affinity`` and ``subject_similarity`` (papers as rows,
            reviewers as columns).
        :param score_type: ``None`` for the combined scores, ``'tpms'`` for
            the TPMS affinities or ``'subject'`` for the subject similarities.
        :returns: sum of the scores of gained pairs minus the sum of the
            scores of lost pairs. Pairs without a score are reported with a
            warning and skipped.
        :raises ValueError: if ``score_type`` is not one of the values above.
        """
        if score_type is None:
            scs = similarities.scores
        elif score_type == 'tpms':
            scs = similarities.affinity
        elif score_type == 'subject':
            scs = similarities.subject_similarity
        else:
            raise ValueError("Unknown score_type %r; expected None, 'tpms' or 'subject'." % (score_type,))

        score = 0.0
        # Lost pairs count negatively, gained pairs positively.
        for sign, changes in [(-1, self.loss_paper), (1, self.gain_paper)]:
            for paper in changes:
                for reviewer in changes[paper]:
                    if paper in scs.index and reviewer in scs.columns:
                        score += sign*scs.loc[paper, reviewer]
                    else:
                        print("Warning paper %s has no score for reviewer %s" % (paper, reviewer))
        return score


class assignment:
    """
    Stores an assignment of reviewers to papers. The assignment can
    either be loaded in (e.g. as an export from CMT) or allocated
    using a similarities matrix.

    """
    def __init__(self, directory=None, max_reviewers=3, max_papers=4,  meta_reviewers_per_paper=1):

        if directory is None:
            directory = cmt_data_directory
        self.directory = directory
        self.quota = {}
        self.quota['reviewer'] = {}
        self.quota['metareviewer'] = {}
        self.max_reviewers = max_reviewers
        self.max_papers = max_papers
        self.meta_reviewers_per_paper = meta_reviewers_per_paper
        self.assignment_paper = {}
        self.assignment_reviewer = {}
        for type in ['reviewer', 'metareviewer']:
            self.assignment_paper[type] = {}
            self.assignment_reviewer[type] = {}

    def __minus__(self, other):
        """ Overloading of the '+' operator. for more control, see self.add """
        return self.diff(other)


    def reviewer_area_chairs(self, reviewer):
        """Return the area chairs responsible for managing a reviewer."""
        area_chairs = []
        for paper in self.assignment_reviewer['reviewer'][reviewer]:
            for chair in self.assignment_paper['metareviewer'][paper]:
                area_chairs.append(chair)
        return area_chairs

    def prod(self, similarities, score_type=None, reviewer_type='reviewer'):
        """Compute the similarity score of an assignment."""
        score = 0.0
        if score_type is None:
            scs = similarities.scores
        elif score_type == 'tpms':
            scs = similarities.affinity
        elif score_type == 'subject':
            scs = similarities.subject_similarity
        for paper in self.assignment_paper[reviewer_type]:
            for reviewer in self.assignment_paper[reviewer_type][paper]:
                if paper in scs.index and reviewer in scs.columns:
                    score += scs.loc[paper, reviewer]
                else:
                    print "Warning paper", paper, "has no score for reviewer", reviewer
        return score

    def diff(self, other):
        """Return an ``assignment_diff`` built from this assignment and ``other`` (in that order)."""
        comparison = assignment_diff(self, other)
        return comparison

    def load_assignment(self, filename=None, reviewer_type='reviewer'):
        """Load in the CMT assignments file."""
        self.clear_assignment(reviewer_type=reviewer_type)
        if filename==None:
            filename = 'Assignments.txt'
        if filename[-4:]=='.txt':
            with open(os.path.join(self.directory, filename)) as fin:
                rows = ( line.strip().split('\t') for line in fin)
                self.assignment_paper[reviewer_type] = {str(row[0]):[elem.lower() for elem in row[1:]] for row in rows}

            self._reviewer_from_paper(reviewer_type)
        elif filename[-4:] == '.xml':
            with open(os.path.join(self.directory, filename)) as xml_file:
                doc = etree.parse(xml_file)
            self.assignment_paper[reviewer_type] = {submission.get('submissionId'):[reviewer.get('email').lower() for reviewer in submission.xpath('./reviewer')] for submission in doc.xpath('/assignments/submission')}
            self._reviewer_from_paper(reviewer_type)

        elif filename[-4:] == '.xls':
            raise ValueError("un-implemented file type.")
        else:
            raise ValueError("unimplemented file type.")

    def _reviewer_from_paper(self, reviewer_type='reviewer'):
        """
        Set assignment_reviewer assuming assignment_paper is set correctly.
        """
        for paper in self.assignment_paper[reviewer_type]:
            for reviewer in self.assignment_paper[reviewer_type][paper]:
                if reviewer in self.assignment_reviewer[reviewer_type]:
                    self.assignment_reviewer[reviewer_type][reviewer].append(paper)
                else:
                    self.assignment_reviewer[reviewer_type][reviewer] = [paper]

    def update_group(self, group):
        """Update with a Data Series of true/false values which reviewers or area chairs are to be assigned."""
        self.group = group

    def prep_assignment(self):
        """Load the quota, the shotgun clusters and the conflicts (in that order) in preparation for an assignment, reporting each step."""
        self.load_quota()
        print("Loaded Quota.")

        self.load_shotgun()
        print("Loaded shotgun clusters.")

        self.load_conflicts()
        print("Loaded Conflicts")

    def make_assignment(self, similarities, group=None, score_quantile=0.7, reviewer_type='reviewer'):
        """Run the whole allocation: load the inputs, rank the scores and allocate.

        :param similarities: object whose scores are ranked; its
            ``reviewers.users`` table supplies the default group.
        :param group: true/false selection of the users to allocate to.
            When ``None``, users with ``IsMetaReviewer == 'No'`` and
            ``IsReviewer == 'Yes'`` are selected.
        :param score_quantile: stored as ``self.score_quantile`` before
            the scores are ranked.
        :param reviewer_type: passed on to ``self.allocate``.
        """
        if group is None:
            users = similarities.reviewers.users
            group = (users['IsMetaReviewer']=='No') & (users['IsReviewer']=='Yes')

        self.score_quantile = score_quantile
        self.prep_assignment()
        self.update_group(group)

        self.rank_similarity_scores(similarities)
        print("Ranked similarities")

        self.allocate(reviewer_type=reviewer_type)
        print("Performed allocation")

    def load_quota(self, filename='Reviewer Quotas.xls'):
        """Read the reviewer quota export from ``self.directory`` and keep its ``items`` as ``self.quota``."""
        quota_path = os.path.join(self.directory, filename)
        sheet = xl_read(filename=quota_path, header=2, index_col='Reviewer Email', dataframe=True, lower_index=True)
        self.quota = sheet.items

    def unassigned_reviewers(self, reviewers, reviewer_type='reviewer', group=None):
        """Return a true/false series of reviewers that aren't at full allocation."""
        an = pd.Series(False, index=reviewers.users.index)
        for idx in an.index:
            if self.group.loc[idx]:
                if idx in self.assignment_reviewer[reviewer_type]:
                    num_assigned = len(self.assignment_reviewer[reviewer_type][idx])
                else:
                    num_assigned = 0
                if num_assigned<self.max_papers:
                    if idx not in list(self.quota.index):
                        an.loc[idx]=True
                    elif num_assigned<min([self.quota['Quota'][idx], self.max_papers]):
                        an.loc[idx] = True
                    else:
                        an.loc[idx] = False
                else:
                    an.loc[idx] = False
        return an

    def unassigned_papers(self, submissions, reviewer_type='reviewer'):
        """Return a true/false series of papers that are unassigned."""
        an = pd.Series(np.zeros(len(submissions.papers.index)), index=submissions.papers.index)
        for idx in an.index:
            #print idx
            if idx in self.assignment_paper[reviewer_type]:
                num_assigned = len(self.assignment_paper[reviewer_type][idx])
            else:
                num_assigned = 0
            if num_assigned<self.max_reviewers:
                an.loc[idx] = True
            else:
                an.loc[idx] = False
        return an

    def clear_assignment(self, reviewer_type='reviewer'):
        if reviewer_type is None:
            reviewer_types = ['reviewer', 'metareviewer']
        else:
            reviewer_types = [reviewer_type]
        for type in reviewer_types:
            self.assignment_paper[type] = {}
            self.assignment_reviewer[type] = {}

    def allocate(self,  reviewer_type='reviewer'):
        """Allocate papers to reviewers. This function goes through the similarities list *once* allocating papers. """

        for idx in list(self.score_vec.index):
            papers = str(self.score_vec['PaperID'][idx]).split('_')
            reviewer = str(self.score_vec['Email'][idx])
            assign = True
            for paper in papers:
                if not paper in self.assignment_paper[reviewer_type]:
                    self.assignment_paper[reviewer_type][paper] = []

                num_assigned = len(self.assignment_paper[reviewer_type][paper])+1
                if num_assigned>self.max_reviewers:
                    assign = False
                    continue
            if not assign:
                continue

            if not reviewer in self.assignment_reviewer[reviewer_type]:
                self.assignment_reviewer[reviewer_type][reviewer] = []
            num_assigned = len(self.assignment_reviewer[reviewer_type][reviewer]) + len(papers)
            if num_assigned>self.max_papers or (reviewer in list(self.quota.index) and num_assigned>self.quota['Quota'][reviewer]):
                continue

            # check paper isn't already assigned.
            for paper in papers:
                if paper in self.assignment_reviewer[reviewer_type][reviewer]:
                    assign = False
                    continue
            if not assign:
                continue

            for paper in papers:
                self.assignment_paper[reviewer_type][paper].append(reviewer)
            self.assignment_reviewer[reviewer_type][reviewer] += papers

    def load_shotgun(self, filename='ConstraintsGroup1.txt'):
        """
        Some papers have a very strong keyword clustering, and we'd like
        these to be reviewed alongside each other to check similarity

        """
        filename = 'ConstraintsGroup1.txt'
        with open(os.path.join(self.directory, filename)) as fin:
            rows = ( line.strip().split(' ') for line in fin)
            self.shotgun_clusters = [row for row in rows]
        fin.close()

    def load_conflicts(self, filename = 'Conflicts.txt'):
        """Load in the CMT conflicts file."""
        with open(os.path.join(self.directory, filename)) as fin:
            rows = ( line.strip().split('\t') for line in fin)
            self.conflicts_groups = {str(row[0]):[elem.lower() for elem in row[1:]] for row in rows}
        self.conflicts_by_reviewer = {}

        for paper in self.conflicts_groups:
            for reviewer in self.conflicts_groups[paper]:
                if reviewer in self.conflicts_by_reviewer:
                    self.conflicts_by_reviewer[reviewer].append(paper)
                else:
                    self.conflicts_by_reviewer[reviewer] = [paper]


    def rank_similarity_scores(self, similarities):
        """
        Place the similarity scores into a 'melted' structure and rank so that highest similarity is top.
        """
        # Allocate 'expert reviewers' those with 2 or more papers.
        rank_scores = similarities.scores.copy()
        # Normalise
        rank_scores = rank_scores/rank_scores.std()
        rank_scores = (rank_scores.T/rank_scores.T.std()).T

        for paper in self.conflicts_groups:
            rank_scores.loc[paper][self.conflicts_groups[paper]] = -np.inf

        # select users to allocate
        usergroup = similarities.reviewers.users[self.group].index
        rank_scores=rank_scores[usergroup]

        # merge shotgun papers for ranking.
        for cluster in self.shotgun_clusters:
            cluster_name = '_'.join(cluster)
            rank_scores.loc[cluster_name] = rank_scores.loc[cluster[0]]
            for paper in cluster[1:]:
                rank_scores.loc[cluster_name] += rank_scores.loc[paper]
            rank_scores.loc[cluster_name]/=len(cluster)
            rank_scores.drop(cluster, inplace=True)

        print "Allocating to", len(usergroup), "users."
        self.score_vec = rank_scores.reset_index()
        self.score_vec = pd.melt(self.score_vec, id_vars=['index']) # Opposite of a pivot!
        val = self.score_vec.value.quantile(self.score_quantile)
        print "Retaining scores greater than", self.score_quantile*100, "percentile which is", val
        self.score_vec = self.score_vec[self.score_vec.value >val]
        self.score_vec = self.score_vec[pd.notnull(self.score_vec.value)]
        self.score_vec.columns = ['PaperID', 'Email', 'Score']
        self.score_vec = self.score_vec.sort_index(by='Score', ascending=False)