diff --git a/botasaurus/sitemap.py b/botasaurus/sitemap.py
index aa3fbec..4bf573c 100644
--- a/botasaurus/sitemap.py
+++ b/botasaurus/sitemap.py
@@ -7,7 +7,7 @@
 from .list_utils import flatten
 from .request_decorator import request
 from .output import write_json
-from .sitemap_parser_utils import clean_robots_txt_url, fix_bad_sitemap_response, clean_sitemap_url, extract_sitemaps, split_into_links_and_sitemaps, fix_gzip_response, is_empty_path, parse_sitemaps_from_robots_txt, wrap_in_sitemap
+from .sitemap_parser_utils import clean_robots_txt_url, fix_bad_sitemap_response, clean_default_sitemap_urls, extract_sitemaps, split_into_links_and_sitemaps, fix_gzip_response, is_empty_path, parse_sitemaps_from_robots_txt, wrap_in_sitemap
 
 default_request_options = {
     # "use_stealth": True,
@@ -150,10 +150,10 @@ def sitemap(req, url):
         extract_link_upto_nth_segment(0, url), content
     )
     if not result:
-        sm_url = clean_sitemap_url(url)
-        content = fetch_content(sm_url,proxy = request_options['proxy'])
-        if content:
-            return [sm_url]
+        for sm_url in clean_default_sitemap_urls(url):
+            content = fetch_content(sm_url,proxy = request_options['proxy'])
+            if content:
+                return [sm_url]
         return []
     return result
 
diff --git a/botasaurus/sitemap_parser_utils.py b/botasaurus/sitemap_parser_utils.py
index 03f8e68..2984cea 100644
--- a/botasaurus/sitemap_parser_utils.py
+++ b/botasaurus/sitemap_parser_utils.py
@@ -137,6 +137,18 @@ def clean_robots_txt_url(url):
 
 def clean_sitemap_url(url):
     return extract_link_upto_nth_segment(0, url) + "sitemap.xml"
+
+def clean_default_sitemap_urls(url):
+    base_url = extract_link_upto_nth_segment(0, url)
+    candidates = (
+        "sitemap.xml",
+        "sitemap_index.xml",
+        "sitemap-index.xml",
+        "sitemap_index.html",
+        "sitemap-index.html",
+    )
+    return [base_url + candidate for candidate in candidates]
+
 def clean_url(base_url, url: str) -> bool:
     """
     Returns true if URL is of the "http" ("https") scheme.