Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions botasaurus/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .list_utils import flatten
from .request_decorator import request
from .output import write_json
from .sitemap_parser_utils import clean_robots_txt_url, fix_bad_sitemap_response, clean_sitemap_url, extract_sitemaps, split_into_links_and_sitemaps, fix_gzip_response, is_empty_path, parse_sitemaps_from_robots_txt, wrap_in_sitemap
from .sitemap_parser_utils import clean_robots_txt_url, fix_bad_sitemap_response, clean_default_sitemap_urls, extract_sitemaps, split_into_links_and_sitemaps, fix_gzip_response, is_empty_path, parse_sitemaps_from_robots_txt, wrap_in_sitemap

default_request_options = {
# "use_stealth": True,
Expand Down Expand Up @@ -150,10 +150,10 @@ def sitemap(req, url):
extract_link_upto_nth_segment(0, url), content
)
if not result:
sm_url = clean_sitemap_url(url)
content = fetch_content(sm_url,proxy = request_options['proxy'])
if content:
return [sm_url]
for sm_url in clean_default_sitemap_urls(url):
content = fetch_content(sm_url,proxy = request_options['proxy'])
if content:
return [sm_url]
return []

return result
Expand Down
12 changes: 12 additions & 0 deletions botasaurus/sitemap_parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ def clean_robots_txt_url(url):
def clean_sitemap_url(url):
    """Return the ``sitemap.xml`` URL derived from *url*.

    The given URL is first reduced with
    ``extract_link_upto_nth_segment(0, url)`` (presumably the site root,
    ending in a slash -- confirm against that helper), and the literal
    file name ``sitemap.xml`` is appended to the result.
    """
    site_root = extract_link_upto_nth_segment(0, url)
    return site_root + "sitemap.xml"


def clean_default_sitemap_urls(url):
    """Return the default sitemap URLs to try for the site of *url*.

    The given URL is reduced with ``extract_link_upto_nth_segment(0, url)``
    (presumably the site root -- confirm against that helper), and each
    well-known sitemap file name is appended to it.

    Returns:
        A list of five candidate URLs, in the order the file names are
        listed below (``sitemap.xml`` first).
    """
    # Conventional sitemap file names; the order here is the order of
    # the returned list.
    default_filenames = (
        "sitemap.xml",
        "sitemap_index.xml",
        "sitemap-index.xml",
        "sitemap_index.html",
        "sitemap-index.html",
    )
    site_root = extract_link_upto_nth_segment(0, url)

    sitemap_urls = []
    for filename in default_filenames:
        sitemap_urls.append(site_root + filename)
    return sitemap_urls

def clean_url(base_url, url: str) -> bool:
"""
Returns true if URL is of the "http" ("https") scheme.
Expand Down
Loading