Skip to content

Commit 2c2fd3c

Browse files
committed
Converted the GitHub API calls from the REST API to the GraphQL API. This new version is ~3.5 times faster, and is able to find more email addresses. The original version found ~85% of the email addresses for the Knative 2022 TOC voters compared to this version, which found 96% of the email addresses.
1 parent 3894220 commit 2c2fd3c

File tree

1 file changed

+99
-57
lines changed

1 file changed

+99
-57
lines changed

scripts/elekto_emails.py

Lines changed: 99 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -35,63 +35,88 @@
3535
3636
As output, a csv file of this format containing comma separated email addresses
3737
is created:
38-
elekto_emails_GHorgName_YYYYMMDD.csv
38+
elekto_emails_YYYY-MM-DD.csv
3939
40-
A message with the number of email addresses found out of the total voters
41-
is printed to the screen
40+
Each email address found is printed to the screen as a way to indicate progress,
41+
and a message with the number of email addresses found out of the total voters
42+
along with the name of the results csv file is printed to the screen at the end.
4243
4344
Parameters
4445
----------
45-
org_name : str
46-
The primary GitHub organization for the vote.
47-
Used to gather email address from commits
4846
file_name : str
4947
This should be an Elekto yaml file starting with "eligible_voters:"
5048
"""
5149

5250
def read_args():
    """Read the voters yaml location and prompt for a GitHub API token.

    The yaml file name is taken from the first command line argument. If no
    argument was given, the user is prompted for it instead. The GitHub API
    Personal Access Token is always prompted for.

    Parameters
    ----------
    None

    Returns
    -------
    file_name : str
        This should be an Elekto yaml file (raw) starting with "eligible_voters:" Example:
        https://raw.githubusercontent.com/knative/community/main/elections/2022-TOC/voters.yaml
    api_token : str
        The GitHub Personal Access Token exactly as entered at the prompt.
    """
    import sys

    # Read the filename from the command line or prompt if no arguments were
    # given. Only a missing argument should trigger the prompt; a bare except
    # would also swallow KeyboardInterrupt and hide unrelated errors.
    try:
        file_name = str(sys.argv[1])
    except IndexError:
        print("Please enter the filename for voters.yaml.")
        file_name = input("Enter a file name (like https://raw.githubusercontent.com/knative/community/main/elections/2021-TOC/voters.yaml): ")

    api_token = input("Enter your GitHub Personal Access Token: ")

    return file_name, api_token
8478

85-
def get_email(org, username, api_token):
79+
80+
def email_query():
    """Build the GitHub GraphQL API query used to look up an email address.

    The query asks for the email on the user's profile and, for up to 10
    pull request contributions made between $start_date and $end_date, up
    to 10 commits each with the commit author's email address.

    Returns
    -------
    str
        The GraphQL query text. It expects the variables user_login,
        start_date and end_date.
    """
    query = """query pr_info_query($user_login: String!, $start_date: DateTime!, $end_date: DateTime!){
  user(login: $user_login) {
    email
    contributionsCollection(from: $start_date, to: $end_date){
      pullRequestContributions(first: 10){
        nodes{
          pullRequest{
            commits(first: 10){
              nodes{
                url
                commit{
                  authoredByCommitter
                  author{
                    email
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}"""
    return query
111+
112+
def _first_commit_email(user_data):
    """Return the first usable commit email in a GraphQL ``user`` node, else None.

    A commit is only considered when ``authoredByCommitter`` is true (PRs can
    contain commits from other people) and its author email is non-empty and
    does not contain 'noreply'.
    """
    contributions = user_data.get('contributionsCollection') or {}
    pr_nodes = (contributions.get('pullRequestContributions') or {}).get('nodes') or []

    for pr in pr_nodes:
        pull_request = (pr or {}).get('pullRequest') or {}
        commit_nodes = (pull_request.get('commits') or {}).get('nodes') or []

        for node in commit_nodes:
            commit = (node or {}).get('commit') or {}
            if not commit.get('authoredByCommitter'):
                continue

            address = (commit.get('author') or {}).get('email')
            # Return immediately: merely breaking out of the commit loop would
            # let a later PR overwrite an address that was already found.
            if address and 'noreply' not in address:
                return address

    return None


def get_email(username, api_token):
    """Attempts to get an email address from the GitHub profile first.
    Otherwise, it attempts to find an email address in the commits of the
    user's pull requests from the past 12 months. Emails containing the
    string 'noreply' are skipped so they are not written to the csv file.

    Parameters
    ----------
    username : str
        GitHub username
    api_token : str
        GitHub Personal Access Token used to authorize the GraphQL request

    Returns
    -------
    email : str or None
        None when the user cannot be found, the request fails, or no
        usable email address is available.
    """
    import json
    from datetime import datetime

    import requests
    from dateutil.relativedelta import relativedelta

    # Set GitHub GraphQL API variables
    url = 'https://api.github.com/graphql'
    headers = {'Authorization': 'token %s' % api_token}

    # Set query variables including dates for past 12 months (req for query)
    today = datetime.now()
    end_date = today.isoformat()  # isoformat required for json serialization
    start_date = (today + relativedelta(months=-12)).isoformat()
    variables = {"user_login": username, "start_date": start_date, "end_date": end_date}

    # Run the query. A failure for one user is reported and treated as
    # "no email" so that it does not abort the run for every other voter.
    try:
        r = requests.post(url=url,
                          json={'query': email_query(), 'variables': variables},
                          headers=headers,
                          timeout=30)
        json_data = json.loads(r.text)
    except (requests.RequestException, ValueError) as err:
        print(username, "lookup failed:", err)
        return None

    # Try to get the email address from the profile first.
    # This will fail and return immediately if the user has been deleted
    # (the user node is null) or the response carries no data at all.
    try:
        user_data = json_data['data']['user']
        email = user_data['email']
    except (KeyError, TypeError):
        print(username, "not found")
        return None

    if email:
        return email

    # If the profile didn't have an email address, fall back to the commits
    # of the user's pull requests.
    return _first_commit_email(user_data)
137177

@@ -141,7 +181,7 @@ def get_email(org, username, api_token):
141181
import urllib.request
142182
from datetime import datetime
143183

144-
org_name, file_name, api_token = read_args()
184+
file_name, api_token = read_args()
145185

146186
# Loads the yaml file and creates a list of voters
147187
try:
@@ -154,30 +194,32 @@ def get_email(org, username, api_token):
154194
print("Cannot load or process the yaml file. Did you use the raw link?")
155195
sys.exit()
156196

print("Gathering email addresses from GitHub. This will take ~3 minutes for 100 voters.")

# Create a list for the emails and initialize a counter for the
# number of emails found.
email_list = []
found_count = 0

# Attempt to get an email address for each voter. If an email address is found
# append it to the list and increment the counter. Also print to the screen to
# show that the script is progressing.
for username in voter_list:
    email = get_email(username, api_token)
    if email:
        email_list.append(email)
        found_count += 1
        print(email)

# Name the results file after today's date
today = datetime.today().strftime('%Y-%m-%d')
outfile_name = 'elekto_emails_' + today + '.csv'

# Write all of the emails as a single comma separated row. The with block
# closes the file even if writing fails, and newline='' is what the csv
# module requires so that it controls the line endings itself.
with open(outfile_name, 'w', newline='') as f:
    csv_file = csv.writer(f)
    csv_file.writerow(email_list)

# Print status only once the results file has actually been written.
print("Found emails for", found_count, "out of", len(voter_list), "voters")
print("Your results can be found in", outfile_name)

0 commit comments

Comments
 (0)