-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetaParserUtils.py
More file actions
executable file
·258 lines (221 loc) · 7.6 KB
/
metaParserUtils.py
File metadata and controls
executable file
·258 lines (221 loc) · 7.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#!/usr/bin/python3
# coding: UTF-8
import shutil
import requests
import os
import time
import random
import os.path
from multiprocessing.pool import ThreadPool as Pool
from lxml import html
from lxml.etree import tostring
from itertools import chain
class Downloader:
    """
    Helper that fetches web pages and downloads files with ``requests``.

    Pages can be returned as raw content or parsed with lxml. Files are written
    under ``output_path + folder_name``, either synchronously or through a
    thread pool created in the constructor.
    """

    # HTTP headers sent with every request issued by this class.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/7.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/21.0.1084.52 Safari/546.5',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-EN,fr;q=0.8,en-US;q=0.6,en;q=0.4',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'identity',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    # Number of seconds wait() sleeps when no explicit time is given.
    DEFAULT_WAIT = 1
    # Folder name meaning "no download sub-folder has been configured".
    DEFAULT_NAME = ""
    # Cookies sent with every request.
    # NOTE(review): this dict lives on the class and add_cookie() mutates it in
    # place, so cookies are shared by all instances -- confirm this is intended.
    COOKIES = dict()
    # Value passed as ``auth`` to requests; replaced per instance by add_auth().
    AUTH = ()

    def __init__(self, parent=None, output_path="", nb_downloads=4):
        """
        :param parent: optional object whose ``execute`` method is called by parse()
        :param output_path: prefix prepended (by plain string concatenation) to
                            every folder name
        :param nb_downloads: number of worker threads in the download pool
        """
        self.parent = parent
        self.output_path = output_path
        self.folder_name = self.DEFAULT_NAME
        self.wait_time = self.DEFAULT_WAIT
        self.pool = Pool(processes=nb_downloads)
        # Number of extra attempts made after a first failure.
        self.retry = 1

    def __del__(self):
        # Intentionally a no-op: the pool is only closed by close_and_join().
        pass

    def get_html(self, url, clean=False):
        """
        Return the content of the url given.

        The request is attempted ``self.retry + 1`` times, sleeping 2 seconds
        between failed attempts.

        :param url: address of the page to fetch
        :param clean: when true, return bytes: the text re-encoded with the
                      response encoding ("utf8" when the response has none),
                      dropping characters that cannot be encoded
        :return: the page text (or bytes when ``clean``), or "" when every
                 attempt failed or the status code is not 200
        """
        # Start from None so a total failure is reported below instead of
        # crashing on an unbound variable.
        response = None
        attempts = self.retry + 1
        for attempt in range(attempts):
            try:
                response = requests.get(url, headers=self.headers, auth=self.AUTH, cookies=self.COOKIES)
                break
            except Exception as e:
                print("ERROR get failed "+str(e))
                # No point in sleeping when there is no attempt left.
                if attempt < attempts - 1:
                    time.sleep(2)
        if response is None or response.status_code != 200:
            print("Invalid URL:", url)
            return ""
        if not clean:
            return response.text
        if not response.encoding:
            response.encoding = "utf8"
        return response.text.encode(response.encoding, "ignore")

    def get_xpath(self, url, xpath=None):
        """
        Fetch the url with get_html(url, True) and parse it with lxml.

        :param url: address of the page to fetch
        :param xpath: optional XPath expression to evaluate on the parsed page
        :return: the parsed tree when ``xpath`` is None, the result of the
                 XPath query otherwise, or None when the page could not be fetched
        """
        web_page = self.get_html(url, True)
        if not web_page:
            return None
        tree = html.fromstring(web_page)
        if xpath is None:
            return tree
        return tree.xpath(xpath)

    def create_folder(self, path):
        """
        A small utility function to encapsulate os.makedirs.

        Best effort: any OSError (for instance "already exists") is ignored.

        :param path: directory to create, including missing parents
        :return:
        """
        try:
            os.makedirs(path)
        except OSError:
            pass

    def _download(self, url, folder_name, file_name):
        """
        Stream the url into ``output_path + folder_name + os.sep + file_name``.

        :param url: address of the file to download
        :param folder_name: folder appended to ``self.output_path``
        :param file_name: name of the file to write
        :return:
        """
        response = requests.get(url, headers=self.headers, stream=True, auth=self.AUTH, cookies=self.COOKIES)
        try:
            target_dir = self.output_path + folder_name
            if target_dir:
                self.create_folder(target_dir)
                target_path = target_dir + os.sep + file_name
            else:
                # With no folder at all, write into the current directory
                # rather than at the filesystem root (os.sep + file_name).
                target_path = file_name
            with open(target_path, 'wb') as f:
                # Let urllib3 undo any transfer encoding (gzip/deflate).
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, f)
        finally:
            # A streamed response keeps its connection open until closed.
            response.close()

    def download(self, url, folder_name, file_name):
        """
        This function will download as a file the url.
        Maybe the function you really want is get_file.

        The download is attempted ``self.retry + 1`` times; errors are printed
        and never raised.

        :param url: address of the file to download
        :param folder_name: the folder in which we download the file
        :param file_name: the name of the file
        :return:
        """
        attempts = self.retry + 1
        for attempt in range(attempts):
            try:
                self._download(url, folder_name, file_name)
                break
            except Exception as e:
                print("ERROR while downloadind: "+str(e))
                # Only announce a retry when one is actually going to happen.
                if attempt < attempts - 1:
                    print("Retrying")

    def get_file(self, url, folder_name="", file_name="", async_=False, verbose=False, **legacy_kwargs):
        """
        High level function to encapsulate download.
        If no folder name or file name given the function will use the instance
        folder name and the last path segment of the url.

        :param url: address of the file to download
        :param folder_name: sub-folder; appended to the instance folder name
                            when one has been set with set_folder_name()
        :param file_name: name of the file to write
        :param async_: To use the thread pool to go faster. This parameter was
                       formerly called ``async``, which is a reserved word since
                       Python 3.7; the old spelling is still accepted through
                       ``**{"async": value}``.
        :param verbose: Debug mode
        :return:
        :raises TypeError: when an unknown keyword argument is given
        """
        if "async" in legacy_kwargs:
            async_ = legacy_kwargs.pop("async")
        if legacy_kwargs:
            raise TypeError("get_file() got an unexpected keyword argument %r"
                            % sorted(legacy_kwargs)[0])

        if folder_name == "":
            folder_name = self.folder_name
        elif self.folder_name != self.DEFAULT_NAME:
            folder_name = self.folder_name + os.sep + folder_name
        if file_name == "":
            file_name = url.split("/")[-1]
        if verbose:
            print("DOWNLOADING", folder_name, file_name)
        if async_:
            self.pool.apply_async(self.download, (url, folder_name, file_name,))
        else:
            self.download(url, folder_name, file_name)

    def launch_async(self, func, args):
        """
        Debug function that will launch given function in threadpool.

        :param func: callable to run in the pool
        :param args: args of func
        :return: the result handle returned by the pool's ``apply_async``
        """
        return self.pool.apply_async(func, args)

    def file_exists(self, file_path):
        """
        Utility function because it's shorter to use this function.

        :param file_path: path to test
        :return: True when ``file_path`` is an existing regular file
        """
        return os.path.isfile(file_path)

    def wait(self, wait_time=0, random=False):
        """
        To avoid saturating a website you should wait.

        :param wait_time: seconds to sleep; 0 means "use the instance wait time"
                          (DEFAULT_WAIT unless changed with set_wait_time())
        :param random: when true, add a random extra delay in [0, 1) second
        :return:
        """
        # The ``random`` parameter shadows the ``random`` module inside this
        # method, so the module is imported here under another name.
        import random as _random_module

        if wait_time == 0:
            wait_time = self.wait_time
        if random:
            wait_time = wait_time + _random_module.random()
        time.sleep(wait_time)

    def add_cookie(self, k, v):
        """
        Register a cookie sent with the following requests.

        :param k: cookie name
        :param v: cookie value
        :return:
        """
        self.COOKIES[k] = v

    def add_auth(self, tuple):
        """
        Set the value passed as ``auth`` to the following requests.

        :param tuple: credentials, stored on this instance
        :return:
        """
        self.AUTH = tuple

    def get_output_path(self):
        """
        Return the concatenation of the output path and the current folder.

        :return:
        """
        return self.output_path + self.folder_name

    def set_wait_time(self, time):
        """
        Set the default number of seconds slept by wait().

        :param time: delay in seconds
        :return:
        """
        self.wait_time = time

    def set_retry_value(self, retry):
        """
        Set how many extra attempts are made after a failure.

        :param retry: number of retries; negative values are stored as 0
        :return:
        """
        self.retry = retry if retry >= 0 else 0

    def set_folder_name(self, name):
        """
        Set the folder in which we will store the downloads.
        Note: This folder and output path are different.
        This folder will be appended to output path and for your own sake you
        should not override output path.

        :param name: folder name
        :return:
        """
        self.folder_name = name

    def reset(self):
        """
        Reset the folder name, wait time.

        :return:
        """
        # NOTE(review): the folder is reset to "default", not to DEFAULT_NAME
        # (""), so get_file() prefixes later downloads with "default" --
        # confirm this is intended.
        self.folder_name = "default"
        self.wait_time = self.DEFAULT_WAIT

    def parse(self, url):
        """
        If we want to add a new url to the parser we will call the parent.

        :param url: url handed to ``parent.execute`` inside a one-element list
        :return:
        """
        if self.parent:
            self.parent.execute([url])
        else:
            print("No parent was given so the nothing will be done")

    def debug(self):
        """Print the output path and the current wait time."""
        print("[DEBUG] Path: " + self.output_path + " time: " + str(self.wait_time))

    def close_and_join(self):
        """
        Close the thread pool and block until every queued task is finished.

        :return:
        """
        self.pool.close()
        print("Finishing DL")
        self.pool.join()
        print("Done")

    def element_to_string(self, node):
        """
        Return the text of ``node``, the markup of its children, and its tail
        as a single string.

        :param node: lxml element
        :return: str
        """
        # tostring() must be asked for text: it returns bytes by default, which
        # cannot be joined with the str text/tail values. Its output already
        # contains the child's own text and tail, so they are not added again.
        parts = [node.text]
        parts.extend(tostring(child, encoding="unicode") for child in node)
        parts.append(node.tail)
        # filter removes possible Nones in texts and tails
        return ''.join(filter(None, parts))