feat: add utility to load and parse sitemaps, and SitemapRequestLoader #1169

Open · wants to merge 45 commits into base: master

Commits (45) · Changes from all commits

6cf67ba
init sitemap
Mantisus Apr 21, 2025
c96572a
implementation
Mantisus Apr 22, 2025
e7063a5
update
Mantisus Apr 24, 2025
fcbca23
optimization uvicorn paths
Mantisus Apr 24, 2025
1c284ac
Merge branch 'master' into sitemap
Mantisus May 22, 2025
f0b089c
add tests
Mantisus May 30, 2025
43f204d
Merge branch 'master' into sitemap
Mantisus May 30, 2025
c2dbb73
integrate sitemap to robots.txt
Mantisus May 30, 2025
8eb1eaa
Merge branch 'master' into sitemap
Mantisus Jun 3, 2025
3279aa6
add implementation `SitemapRequestLoader`
Mantisus Jun 3, 2025
b1910f1
add tests
Mantisus Jun 3, 2025
65e4a38
update docs
Mantisus Jun 3, 2025
d432941
fix uvicorn path
Mantisus Jun 3, 2025
df554fd
Merge branch 'master' into sitemap
Mantisus Jun 9, 2025
4d61c12
unification echo_content
Mantisus Jun 9, 2025
04cd366
update endpoints
Mantisus Jun 9, 2025
a446bb1
clear extra property in `SitemapRequestLoader`
Mantisus Jun 9, 2025
a82985a
implementation of stream method
Mantisus Jun 10, 2025
987d5e9
Merge branch 'sitemap' into test-sitemap
Mantisus Jun 10, 2025
8a7aa4a
add chunk_size parameter for `iter_bytes`
Mantisus Jun 10, 2025
f7a93cb
Merge branch 'master' into stream-http-client
Mantisus Jun 10, 2025
594604f
add support timeout for stream
Mantisus Jun 10, 2025
0607be6
add test
Mantisus Jun 10, 2025
2be9383
update docstrings
Mantisus Jun 10, 2025
95579fe
remove `chunk_size`
Mantisus Jun 11, 2025
1942822
iter_bytesread_stream
Mantisus Jun 11, 2025
4b52abf
Merge branch 'master' into stream-http-client
Mantisus Jun 11, 2025
59166e1
update for use `HttpClient` with `stream`
Mantisus Jun 11, 2025
b2913f7
Merge branch 'stream-http-client' into test-sitemap
Mantisus Jun 11, 2025
eaf185b
update with `read_stream`
Mantisus Jun 11, 2025
03b0317
add activate property
Mantisus Jun 12, 2025
f5948a0
Merge branch 'master' into test-sitemap
Mantisus Jun 12, 2025
a9aecf2
Merge branch 'master' into stream-http-client
Mantisus Jun 12, 2025
fac5ca1
add active property
Mantisus Jun 12, 2025
8abd28e
Update src/crawlee/crawlers/_playwright/_types.py
Mantisus Jun 12, 2025
3c53169
raise Error for `read_stream` if `stream` is consumed
Mantisus Jun 12, 2025
54ed3f8
update Error
Mantisus Jun 12, 2025
f933954
add reuse tests for context manager
Mantisus Jun 12, 2025
b501739
Use context manager in test
Mantisus Jun 18, 2025
9638c2d
Merge branch 'master' into stream-http-client
Mantisus Jun 18, 2025
2c16d99
Merge branch 'stream-http-client' into test-sitemap
Mantisus Jun 18, 2025
6e9149e
Merge branch 'master' into sitemap
Mantisus Jun 19, 2025
e50f4cc
update
Mantisus Jun 19, 2025
391af8e
add stream decoder for UTF-8
Mantisus Jun 24, 2025
5926211
add `SitemapRequestLoader` in docs guide
Mantisus Jun 27, 2025
28 changes: 28 additions & 0 deletions docs/guides/code_examples/request_loaders/sitemap_example.py
@@ -0,0 +1,28 @@
import asyncio
import re

from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Create an HTTP client for fetching sitemaps
    async with HttpxHttpClient() as http_client:
        # Create a sitemap request loader with URL filtering
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Exclude all URLs that do not contain 'blog'
            exclude=[re.compile(r'^((?!blog).)*$')],
            max_buffer_size=500,  # Buffer up to 500 URLs in memory
        )

        while request := await sitemap_loader.fetch_next_request():
            # Do something with it...

            # And mark it as handled.
            await sitemap_loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
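
The example above only drains the loader and marks each request as handled. In practice the loader would usually feed a crawler. The sketch below is one way that might look; it is not part of this PR's diff and assumes the `to_tandem()` helper and the crawler's `request_manager` option described elsewhere in the request loaders guide.

```python
import asyncio
import re

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    async with HttpxHttpClient() as http_client:
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Same filter as the example above: keep only URLs containing 'blog'.
            exclude=[re.compile(r'^((?!blog).)*$')],
        )

        # Wrap the read-only loader in a writable request manager (tandem),
        # so the crawler can enqueue new requests and reclaim failed ones.
        request_manager = await sitemap_loader.to_tandem()

        crawler = ParselCrawler(request_manager=request_manager)

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url}')

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```

Wrapping the loader in a tandem gives the crawler a writable queue behind the sitemap URLs, so retries and newly discovered links can be handled the usual way.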
20 changes: 18 additions & 2 deletions docs/guides/request_loaders.mdx
@@ -10,6 +10,7 @@ import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py';
import TandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example.py';
import ExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example_explicit.py';

@@ -23,9 +24,10 @@ The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/
- <ApiLink to="class/RequestManager">`RequestManager`</ApiLink>: Extends `RequestLoader` with write capabilities.
- <ApiLink to="class/RequestManagerTandem">`RequestManagerTandem`</ApiLink>: Combines a read-only `RequestLoader` with a writable `RequestManager`.

And one specific request loader:
And specific request loaders:

- <ApiLink to="class/RequestList">`RequestList`</ApiLink>: A lightweight implementation of request loader for managing a static list of URLs.
- <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink>: A request loader that reads URLs from XML sitemaps with filtering capabilities.

Below is a class diagram that illustrates the relationships between these components and the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>:

@@ -83,6 +85,11 @@ class RequestList {
    _methods_()
}

class SitemapRequestLoader {
    _attributes_
    _methods_()
}

class RequestManagerTandem {
    _attributes_
    _methods_()
@@ -97,6 +104,7 @@ RequestManager <|-- RequestQueue

RequestLoader <|-- RequestManager
RequestLoader <|-- RequestList
RequestLoader <|-- SitemapRequestLoader
RequestManager <|-- RequestManagerTandem
```

@@ -112,6 +120,14 @@ Here is a basic example of working with the <ApiLink to="class/RequestList">`Req
{RlBasicExample}
</RunnableCodeBlock>

## Sitemap request loader

The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, which ensures efficient memory usage without loading the entire sitemap into memory.

<RunnableCodeBlock className="language-python" language="python">
{SitemapExample}
</RunnableCodeBlock>

## Request manager

The <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add or reclaim them. This is important for dynamic crawling projects, where new URLs may emerge during the crawl process. Or when certain requests may failed and need to be retried. For more details refer to the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> API reference.
@@ -139,4 +155,4 @@ This section describes the combination of the <ApiLink to="class/RequestList">`

## Conclusion

This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` class. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` classes. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
22 changes: 20 additions & 2 deletions src/crawlee/_utils/robots.py
@@ -5,6 +5,7 @@
from protego import Protego
from yarl import URL

from crawlee._utils.sitemap import Sitemap
from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
@@ -15,9 +16,13 @@


class RobotsTxtFile:
    def __init__(self, url: str, robots: Protego) -> None:
    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
    ) -> None:
        self._robots = robots
        self._original_url = URL(url).origin()
        self._http_client = http_client
        self._proxy_info = proxy_info

    @classmethod
    async def from_content(cls, url: str, content: str) -> Self:
@@ -56,7 +61,7 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N

        robots = Protego.parse(body.decode('utf-8'))

        return cls(url, robots)
        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.
@@ -83,3 +88,16 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
"""
crawl_delay = self._robots.crawl_delay(user_agent)
return int(crawl_delay) if crawl_delay is not None else None

    async def parse_sitemaps(self) -> Sitemap:
        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
        sitemaps = self.get_sitemaps()
        if not self._http_client:
            raise ValueError('HTTP client is required to parse sitemaps.')

        return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

    async def parse_urls_from_sitemaps(self) -> list[str]:
        """Parse the sitemaps in the robots.txt file and return a list of URLs."""
        sitemap = await self.parse_sitemaps()
        return sitemap.urls
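
For context, here is a minimal sketch of how the new `parse_sitemaps()` / `parse_urls_from_sitemaps()` methods could be exercised, based only on the signatures in the diff above; the robots.txt URL is a placeholder and this snippet is not part of the PR.

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    async with HttpxHttpClient() as http_client:
        # Passing the HTTP client here is what enables sitemap parsing later,
        # since `load()` now stores the client on the instance.
        robots = await RobotsTxtFile.load('https://crawlee.dev/robots.txt', http_client)

        # Fetch and parse every sitemap referenced by robots.txt, then list its URLs.
        urls = await robots.parse_urls_from_sitemaps()
        print(f'Found {len(urls)} URLs in the sitemaps')


if __name__ == '__main__':
    asyncio.run(main())
```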