-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikicrawler.py
More file actions
138 lines (121 loc) · 4.74 KB
/
wikicrawler.py
File metadata and controls
138 lines (121 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import re
import string
from urllib.error import HTTPError
import nltk.corpus
import nltk
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup as bs
import urllib.request
from urllib.parse import quote
# English word list used to filter scraped tokens; requires the NLTK
# "words" corpus (downloaded in main() before this is useful).
dictionary = set(nltk.corpus.words.words())
def get_content(sites: list[str], words: dict, dicts: list[dict], word_count: dict):
    """Download each Wikipedia page in *sites* and tally word frequencies.

    For every successfully fetched site, a per-site frequency dict is
    appended to *dicts*; *words* and *word_count* (accumulators shared
    across calls) are incremented in place for every counted token.

    Parameters
    ----------
    sites : list[str]
        Wikipedia article titles (underscore form) to fetch.
    words, word_count : dict
        Global word -> count accumulators, mutated in place.
    dicts : list[dict]
        Receives one word -> count dict per successfully downloaded site.

    Returns
    -------
    list[str]
        The subset of *sites* that downloaded without an HTTPError.
    """
    wnl = WordNetLemmatizer()
    en_stops = set(wnl.lemmatize(word) for word in stopwords.words('english'))
    downloaded_sites = []
    for site in sites:
        link = get_link(site)
        try:
            # Context manager so the HTTP response is always closed
            # (the original leaked the connection on every request).
            with urllib.request.urlopen(link) as response:
                webpage = str(response.read())
        except HTTPError:
            continue  # skip unreachable pages, keep crawling
        dicts.append(dict())
        soup = bs(webpage, features="html.parser")
        site_text = clean(soup.getText())
        # [1:] drops the leading "b" artifact left by str(bytes).
        word_list = site_text.split()[1:]
        # Keep dictionary words and capitalized (proper-noun-like) tokens;
        # drop raw stop words before lemmatizing.
        stemmed = [wnl.lemmatize(word).casefold() for word in word_list if (
            (word in dictionary or word[0].isupper()) and word not in en_stops)]
        for s in stemmed:
            # The lemmatized/casefolded form may itself be a stop word.
            if s not in en_stops:
                dicts[-1][s] = dicts[-1].get(s, 0) + 1
                words[s] = words.get(s, 0) + 1
                word_count[s] = word_count.get(s, 0) + 1
        downloaded_sites.append(site)
    return downloaded_sites
# Titles to skip: Wikipedia namespace/meta pages and "... (identifier)"
# redirect pages. re.match anchors at the start, so bare literals give the
# prefix semantics the original '*' patterns were aiming at — 'List*'
# actually meant 'Lis' + zero-or-more 't', so real articles such as
# "Lisbon" or "Film" were wrongly excluded. The parens in "(identifier)"
# are escaped so they match literally instead of forming a group (the
# original excluded every title merely ending in "identifier").
_SKIP_RE = re.compile(r'Category|Wikipedia|Special|Wayback|List|File|Help'
                      r'|.*\(identifier\)$')
# Index pages worth descending into one level deeper.
_DESCEND_RE = re.compile(r'Category|List')


def search(url, category, maxdepth, depth=0):
    """Collect Wikipedia article titles reachable from *url*.

    Recursively follows Category:/List index pages up to *maxdepth*
    levels and returns a set of article titles (the href part after
    '/wiki/') from pages whose text mentions *category*.
    """
    page = requests.get(url)
    soup = bs(page.text, features="html.parser")
    result = set()
    txt = soup.getText()
    if category.casefold() in txt or category in txt:
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and href.startswith('/wiki/'):
                rest = href[6:]  # strip the '/wiki/' prefix
                if maxdepth >= depth and not _SKIP_RE.match(rest):
                    result.add(rest)
                elif (maxdepth > depth and _DESCEND_RE.match(rest)
                        and not rest.startswith('Category:Commons')):
                    new_url = "https://en.wikipedia.org/wiki/" + rest
                    result.update(search(new_url, category, maxdepth, depth + 1))
    return result
def main(length: int, buffer_size: int = 10):
    """Crawl Wikipedia categories and build word-frequency tables.

    Downloads up to *length* article pages from a fixed set of science
    categories in batches of *buffer_size*, checkpoints the crawled site
    titles to saved/sites.json after each batch, prunes rare words, and
    writes the global counts to saved/words.json.

    Parameters
    ----------
    length : int
        Maximum number of sites to download.
    buffer_size : int
        Number of sites fetched per batch/checkpoint.

    Returns
    -------
    tuple[dict, list[dict]]
        (global word -> count dict, per-site word -> count dicts).
    """
    nltk.download("punkt")
    nltk.download('words')
    nltk.download('stopwords')
    nltk.download("wordnet")
    nltk.download("omw-1.4")
    print("Downloading categories started")
    categories = ["Physics", "Mathematics", "Computer_science", "Astronomy"]
    sites = set()
    for category in categories:
        print(category)
        start = "https://en.wikipedia.org/wiki/Category:" + category
        sites.update(search(start, category, 2))
    site_list = list(sites)[:length]
    print("total: ", len(site_list), " links")
    print("Downloading sites content started")
    words = dict()
    word_count = dict()
    dicts = []
    new_sites = []
    # range(0, n, step) also processes the final partial batch; the
    # original range(n // step) silently dropped the remainder sites.
    for start_idx in range(0, len(site_list), buffer_size):
        batch = site_list[start_idx:start_idx + buffer_size]
        downloaded_sites = get_content(batch, words, dicts, word_count)
        new_sites.extend(downloaded_sites)
        # Checkpoint the crawled titles after every batch.
        with open("saved/sites.json", "w") as write_file:
            json.dump([title(site) for site in new_sites], write_file)
    print("Downloading sites content ended")
    reduce(dicts, word_count, words, length)
    print("total: ", len(new_sites), " sites")
    print("total: ", len(words), " words")
    with open("saved/words.json", "w") as write_file:
        json.dump(words, write_file)
    return words, dicts
def reduce(dicts: list[dict], word_count: dict, words: dict, n: int):
    """Prune rare words from all frequency tables, in place.

    A word is removed from every per-site dict in *dicts* and from
    *words* when word_count[word] < 20 or words[word] < 3.

    Parameters
    ----------
    dicts : list[dict]
        Per-site word -> count tables, mutated in place.
    word_count : dict
        word -> aggregate count (high threshold, 20).
    words : dict
        word -> aggregate count (low threshold, 3), mutated in place.
    n : int
        Unused; kept for backward compatibility with existing callers.

    Note: shadows functools.reduce by name; renaming would break callers.
    """
    # Collect first, then delete, so no dict is mutated while iterating it.
    to_remove = {word for word, count in word_count.items() if count < 20}
    to_remove.update(word for word, count in words.items() if count < 3)
    for d in dicts:
        for word in to_remove:
            d.pop(word, None)
    for word in to_remove:
        # pop with default: a word flagged via word_count may be missing
        # from words — the original raised KeyError in that case.
        words.pop(word, None)
def get_link(site_title: str) -> str:
    """Return the full Wikipedia URL for an article title.

    Spaces become underscores and the title is percent-encoded so the
    resulting URL is safe for urllib. (Annotations fixed: the original
    annotated with the `string` module instead of `str`.)
    """
    return "https://en.wikipedia.org/wiki/" + quote(site_title.replace(" ", "_"))
def title(site: str) -> str:
    """Convert an underscore article name back to a display title.

    (Annotations fixed: the original annotated with the `string` module
    instead of `str`.)
    """
    return site.replace("_", " ")
def clean(text: str) -> str:
    """Normalise scraped page text to space-separated alphabetic words.

    Pages arrive via str(bytes), so escape sequences such as "\\t" appear
    as literal backslash+letter pairs; those are replaced with spaces
    first, then every run of non-letters collapses to a single space.
    """
    # Raw string: r'\\t' matches a literal backslash followed by 't'
    # (the original spelled this with quadruple backslashes).
    partly = re.sub(r'\\t|\\n|\\r|\\a|\\f|\\v|\\b', " ", text)
    return re.sub('[^a-zA-Z]+', ' ', partly)
if __name__ == "__main__":
    # Entry point: crawl up to 100 sites. main() returns (words, dicts);
    # the names here suggest a matrix/index pair — TODO confirm intent.
    word_matrix, word_index = main(100)