-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpyoffline_parser.py
73 lines (50 loc) · 2.35 KB
/
pyoffline_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from logging import getLogger
from bs4 import BeautifulSoup
from pyoffline_models import Resource, Document
from beautifulsoup_extensions import has_href_with_url, has_src_with_url
from url_extensions import make_url_absolute, make_url_relative
class Parser:
    """Rewrite links inside fetched HTML documents and collect what they reference.

    The parser remembers every URL handed to :meth:`parse` in ``visited`` and
    uses that set, together with an optional depth limit, to decide which of
    the newly detected resources are worth returning to the caller.
    """

    def __init__(self, site_root: str, max_depth: int = None):
        """Create a parser bound to one site.

        Args:
            site_root: Passed to the URL helpers and tag filters as the
                reference URL of the site being processed.
            max_depth: Depth limit applied to ``Document`` resources in
                :meth:`must_visit`. ``None`` (the default) disables the limit.
                A string is accepted and converted with ``int()``.

        Raises:
            ValueError: If ``max_depth`` is a string that is not an integer.
        """
        self.logger = getLogger()
        self.site_root = site_root
        # must_visit() compares ``resource.depth < self.max_depth`` and depths
        # are produced with ``current_depth + 1``; a textual limit would raise
        # TypeError there, so normalise it once up front.
        if isinstance(max_depth, str):
            max_depth = int(max_depth)
        self.max_depth = max_depth
        # URLs of every resource that has gone through parse().
        self.visited = set()

    def process_link(self, tag, attribute, current_depth=0):
        """Rewrite one URL-carrying attribute of ``tag`` and describe its target.

        The attribute value is replaced in place by the result of
        ``make_url_relative`` (presumably a path relative to ``site_root`` --
        confirm against ``url_extensions``).

        Args:
            tag: Parsed tag exposing ``name`` and item access to attributes.
            attribute: Name of the attribute holding the URL (``"href"`` or
                ``"src"`` for the callers in this class).
            current_depth: Depth of the document that contains ``tag``.

        Returns:
            A ``Document`` one level deeper than ``current_depth`` when the tag
            is an ``<a>`` element, otherwise a plain ``Resource``.
        """
        original_url = tag[attribute]
        absolute_url = make_url_absolute(self.site_root, original_url)
        relative_url = make_url_relative(self.site_root, original_url)
        tag[attribute] = relative_url

        if tag.name == "a":
            return Document(absolute_url, name=relative_url, depth=current_depth + 1)
        return Resource(absolute_url, name=relative_url)

    def detect_resources(self, document, current_depth):
        """Find linked resources in ``document`` and rewrite their URLs.

        Every matching tag is rewritten through :meth:`process_link`, whether
        or not the resulting resource is returned.

        Args:
            document: Parsed document exposing ``find_all``.
            current_depth: Depth of ``document``; forwarded to
                :meth:`process_link`.

        Returns:
            The detected resources for which :meth:`must_visit` is true,
            ``href`` matches first, then ``src`` matches.
        """
        # NOTE(review): ``visited`` is only updated in parse(), so a URL that
        # appears on several tags of the same document is returned once per
        # tag. Confirm the caller tolerates duplicates.
        href_tags = document.find_all(has_href_with_url(self.site_root))
        self.logger.info(f'Found {len(href_tags)} href resources tags.')
        resources = [self.process_link(tag, "href", current_depth) for tag in href_tags]

        src_tags = document.find_all(has_src_with_url(self.site_root))
        self.logger.info(f'Found {len(src_tags)} src resources tags.')
        resources += [self.process_link(tag, "src", current_depth) for tag in src_tags]

        return [found for found in resources if self.must_visit(found)]

    def parse(self, resource: "Resource"):
        """Mark ``resource`` as visited and, for documents, process its links.

        For a ``Document`` the body is parsed with BeautifulSoup's
        ``html.parser``, its links are rewritten, and ``resource.body`` is
        replaced by the serialised result. Other resources are left untouched.

        Args:
            resource: The resource to process; its ``url`` is recorded.

        Returns:
            A list starting with ``resource`` itself, followed by any newly
            detected resources that still need visiting.
        """
        self.visited.add(resource.url)

        detected_resources = []
        # isinstance (rather than an exact type match) so that subclasses of
        # Document are parsed as documents too.
        if isinstance(resource, Document):
            parsed_resource = BeautifulSoup(resource.body, "html.parser")
            detected_resources = self.detect_resources(parsed_resource, resource.depth)
            resource.body = str(parsed_resource)

        return [resource] + detected_resources

    def is_resource_writable(self, resource: "Resource"):
        """Return True when ``resource`` has a body and is not ``text/html``.

        Args:
            resource: Object exposing ``body`` and ``mimeType``.
        """
        has_body = resource.body is not None
        needs_processing = resource.mimeType == "text/html"
        return has_body and not needs_processing

    def is_visited(self, resource: "Resource"):
        """Return True when ``resource.url`` has already gone through parse()."""
        return resource.url in self.visited

    def must_visit(self, resource: "Resource"):
        """Decide whether ``resource`` should be returned for further processing.

        A resource is skipped when it was already visited. When a depth limit
        is configured, a ``Document`` is also skipped unless its depth is
        strictly below ``max_depth``; non-document resources ignore the limit.

        Args:
            resource: Object exposing ``url`` (and ``depth`` for documents).

        Returns:
            True if the resource still needs to be visited.
        """
        if self.is_visited(resource):
            return False
        if self.max_depth is None:
            return True
        if isinstance(resource, Document):
            return resource.depth < self.max_depth
        return True