MetaParser/metaParser.py at master · Lightjohn/MetaParser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/python3
# coding: UTF-8
from os.path import expanduser
import requests
import sys
import os
import importlib
import re
import types
import metaParserUtils
import traceback
import socket
import argparse

# Put the "hoster" directory that sits next to this file at the front of the
# import path, so hoster modules can be imported later by their bare name.
sys.path.insert(0, os.sep.join((os.path.dirname(os.path.realpath(__file__)), "hoster")))
# Directory handed to the downloader: "<home>/metaParser/" (trailing separator kept).
output_path = os.sep.join((expanduser("~"), "metaParser", ""))


class Parser:
    """
    Base class for hoster parsers: subclasses override ``parse``.

    Attributes:
        dl: the ``metaParserUtils.Downloader`` instance given at construction.
    """

    def __init__(self, dl_util):
        """
        Store the downloader used by the parser.

        :param dl_util: a ``metaParserUtils.Downloader`` instance
        :raises TypeError: if ``dl_util`` is not a ``metaParserUtils.Downloader``
        """
        # Reject anything that is not the expected downloader type up front.
        if not isinstance(dl_util, metaParserUtils.Downloader):
            raise TypeError
        self.dl = dl_util

    def parse(self, url):
        """
        Handle ``url``; the base implementation does nothing and returns None.

        :param url: url to process
        :return: None
        """
        return None


class MetaParser:
    """
    Main class: reads the input urls, then finds and launches the module
    linked to each url.

    The module is looked up by the website name extracted from the url and
    must expose a ``ChildParser`` class.
    """
    # Passed to the downloader as ``nb_downloads``; overridden by ``-n``.
    THREADS = 4
    # When True, tracebacks are printed in addition to the short messages.
    DEBUG = False
    # Optional scheme, optional "www.", then the website name up to the last
    # ".xx"/".xxx" suffix that is directly followed by "/".
    _URL_PATTERN = re.compile(r'(https?:\/\/)?(www\.)?(?P<website>.*?)\.[a-z]{2,3}\/')

    def fix_object_name(self, obj_name):
        """
        Remove the characters '.' and '-' from a name.

        The result is used as a module name, where these characters are not
        usable.

        :param obj_name: raw name (e.g. the website part of an url)
        :return: the name without any '.' or '-'
        """
        for forbidden in ('.', '-'):
            obj_name = obj_name.replace(forbidden, '')
        return obj_name

    def fix_url(self, url):
        """
        Append a trailing "/" to an url that has no "/" besides the "://"
        of its scheme.

        :param url: url as given by the user
        :return: the url, with "/" appended when it contained no path separator
        """
        if "/" not in url.replace("://", ""):
            url += "/"
        return url

    def _module_name_from_url(self, url):
        """
        Extract the sanitized module name from an (already fixed) url.

        :param url: url, expected to have gone through ``fix_url``
        :return: the website name without '.' and '-', or "" when the url does
                 not match the expected shape or yields an empty name
        """
        match = self._URL_PATTERN.match(url)
        # re.match returns None when the url does not fit the pattern at all.
        if match is None:
            return ""
        return self.fix_object_name(match.group("website"))

    def execute(self, argv):
        """
        For every url:
            We fix the url, then we extract the core name of the url,
            we try to load a module according to the core name,
            then we launch the module if everything is good.

        Urls from which no module name can be extracted are reported as
        invalid and skipped. A module that cannot be imported is reported and
        skipped; an exception raised by the module's ``parse`` is reported and
        the loop goes on with the next url.

        :param argv: iterable of urls
        :return: None
        """
        dl_utils = metaParserUtils.Downloader(self, output_path, nb_downloads=self.THREADS)
        # Modules already imported during this run, keyed by module name.
        loaded_module = {}
        for url in argv:
            url = self.fix_url(url)
            module_name = self._module_name_from_url(url)
            if not module_name:
                print("Invalid URL found: " + url)
                continue
            # When we have the name of the site, we load the file of the same name
            dl_utils.reset()
            try:
                if module_name in loaded_module:
                    # Seen before in this run: reload it instead of importing again.
                    new_module = loaded_module[module_name]
                    importlib.reload(new_module)
                else:
                    new_module = importlib.import_module(module_name)
                    loaded_module[module_name] = new_module
                # getting class from the module
                module = getattr(new_module, "ChildParser")
                # initiating the module and parsing the url
                m = module(dl_utils)
                print("Module:", module_name)
                try:
                    m.parse(url)
                except Exception as e:
                    # A failing parser must not stop the remaining urls.
                    print("Exception occur in", module_name, e)
                    if self.DEBUG:
                        traceback.print_exc()
            except ImportError:
                print(
                    "Cannot load module: " + module_name +
                    " : NOT IMPLEMENTED YET (or missing import or module dependency "
                    "not installed on the system)")
                if self.DEBUG:
                    traceback.print_exc()
        dl_utils.close_and_join()

    def execute_main(self):
        """
        Command-line entry point: parse the arguments, store the ``-d`` and
        ``-n`` options in ``DEBUG`` and ``THREADS``, then run ``execute`` on
        the given urls.

        :return: None
        """
        parser = argparse.ArgumentParser(
            description='Handle url and will git them to the good module')
        parser.add_argument('-n', type=int, default=4, help='Number of downloads in parallel')
        parser.add_argument('-d', action='store_true', default=False, help='Enable debug output')
        parser.add_argument('urls', metavar='N', nargs='+', help='an infinite numbers of URLS')
        input_args = parser.parse_args()
        self.DEBUG = input_args.d
        self.THREADS = input_args.n
        self.execute(input_args.urls)


if __name__ == "__main__":
    # Run as a script: build the meta parser and let it handle the command line.
    MetaParser().execute_main()