Source code for langchain.document_loaders.recursive_url_loader

from typing import Iterator, List, Optional, Set
from urllib.parse import urljoin, urlparse

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class RecursiveUrlLoader(BaseLoader):
    """Loads all child links from a given url."""

    def __init__(
        self,
        url: str,
        exclude_dirs: Optional[List[str]] = None,
    ) -> None:
        """Initialize with URL to crawl and any subdirectories to exclude.

        Args:
            url: The URL to crawl.
            exclude_dirs: A list of subdirectories to exclude. Defaults to
                ``None``; the value is stored unchanged on the instance.
        """
        self.url = url
        self.exclude_dirs = exclude_dirs

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load web pages.

        Returns:
            The result of ``self.get_child_links_recursive(self.url)``,
            annotated as an iterator of ``Document`` objects.
        """
        # NOTE(review): ``get_child_links_recursive`` is not defined in the
        # visible part of this class -- confirm it exists (here or on the
        # base class) and that it honours ``self.exclude_dirs``.
        return self.get_child_links_recursive(self.url)

    def load(self) -> List[Document]:
        """Load web pages.

        Returns:
            A list built by fully consuming ``lazy_load()``.
        """
        return list(self.lazy_load())