Source code for langchain.document_loaders.parsers.pdf

"""Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional, Union

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document


class PyPDFParser(BaseBlobParser):
    """Parse a PDF blob with ``pypdf``, producing one document per page."""

    def __init__(self, password: Optional[Union[str, bytes]] = None):
        """Initialize the parser.

        Args:
            password: Value forwarded as ``password`` to ``pypdf.PdfReader``
                when the PDF is opened. Defaults to ``None``.
        """
        self.password = password

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one ``Document`` per page.

        Args:
            blob: The blob holding the PDF; it is read through
                ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` whose ``page_content`` is the text extracted from
            the page and whose metadata holds ``"source"`` (``blob.source``)
            and ``"page"`` (the zero-based index of the page in the reader).
        """
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            # Yield page by page instead of building the complete list first,
            # so text is only extracted for the pages the caller consumes.
            for page_number, page in enumerate(pdf_reader.pages):
                yield Document(
                    page_content=page.extract_text(),
                    metadata={"source": blob.source, "page": page_number},
                )
class PDFMinerParser(BaseBlobParser):
    """PDF parser backed by PDFMiner's high-level text extraction."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the text of the blob and yield it as a single document.

        Args:
            blob: The blob holding the PDF; it is read through
                ``blob.as_bytes_io()``.

        Yields:
            Exactly one ``Document`` containing the text returned by
            ``pdfminer.high_level.extract_text`` with ``blob.source`` stored
            under the ``"source"`` metadata key.
        """
        from pdfminer.high_level import extract_text

        with blob.as_bytes_io() as pdf_stream:
            extracted = extract_text(pdf_stream)
            yield Document(
                page_content=extracted,
                metadata={"source": blob.source},
            )
class PyMuPDFParser(BaseBlobParser):
    """Parse PDFs with PyMuPDF."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
                ``None`` (the default) means no extra arguments.
        """
        self.text_kwargs = text_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one ``Document`` per page.

        Args:
            blob: The blob holding the PDF; it is read through
                ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` per page. Its metadata contains ``"source"`` and
            ``"file_path"`` (both ``blob.source``), ``"page"``
            (``page.number``), ``"total_pages"`` and every entry of the
            document metadata whose value is exactly a ``str`` or an ``int``.
        """
        import fitz

        with blob.as_bytes_io() as file_path:
            # NOTE(review): the file object is passed positionally to
            # fitz.open(); confirm this works for in-memory blobs, since
            # PyMuPDF documents ``stream=`` for byte input.
            # The context manager closes the document when iteration ends
            # or the generator is discarded early.
            with fitz.open(file_path) as doc:
                total_pages = len(doc)
                # Exact type match (subclasses such as bool are excluded),
                # the same filter the parser has always applied.
                doc_metadata = {
                    key: value
                    for key, value in doc.metadata.items()
                    if type(value) in (str, int)
                }
                for page in doc:
                    # A fresh dict per page; document-level entries come last
                    # so they keep overriding same-named keys.
                    yield Document(
                        page_content=page.get_text(**self.text_kwargs),
                        metadata={
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.number,
                            "total_pages": total_pages,
                            **doc_metadata,
                        },
                    )
class PyPDFium2Parser(BaseBlobParser):
    """PDF parser built on the ``pypdfium2`` bindings."""

    def __init__(self) -> None:
        """Check at construction time that ``pypdfium2`` can be imported.

        Raises:
            ImportError: If the ``pypdfium2`` package is not installed.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with "
                "`pip install pypdfium2`"
            )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one ``Document`` per page.

        Args:
            blob: The blob holding the PDF; it is read through
                ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` holding the page text, with ``"source"``
            (``blob.source``) and ``"page"`` (zero-based enumeration index)
            in its metadata.
        """
        import pypdfium2

        # pypdfium2 is very sensitive to the order in which handles are
        # closed; closing them incorrectly leads to segmentation faults.
        with blob.as_bytes_io() as pdf_stream:
            document = pypdfium2.PdfDocument(pdf_stream, autoclose=True)
            try:
                for index, pdf_page in enumerate(document):
                    text_page = pdf_page.get_textpage()
                    page_text = text_page.get_text_range()
                    # Release the text page before its parent page, and both
                    # before control is handed back to the caller.
                    text_page.close()
                    pdf_page.close()
                    page_metadata = {"source": blob.source, "page": index}
                    yield Document(page_content=page_text, metadata=page_metadata)
            finally:
                document.close()
class PDFPlumberParser(BaseBlobParser):
    """Parse PDFs with PDFPlumber."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``. ``None`` (the default)
                means no extra arguments.
        """
        self.text_kwargs = text_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one ``Document`` per page.

        Args:
            blob: The blob holding the PDF; it is read through
                ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` per page. Its metadata contains ``"source"`` and
            ``"file_path"`` (both ``blob.source``), ``"page"``
            (``page.page_number`` as reported by pdfplumber),
            ``"total_pages"`` and every entry of the document metadata whose
            value is exactly a ``str`` or an ``int``.
        """
        import pdfplumber

        with blob.as_bytes_io() as file_path:
            # The context manager closes the pdfplumber document when
            # iteration ends or the generator is discarded early.
            with pdfplumber.open(file_path) as doc:
                pages = doc.pages
                total_pages = len(pages)
                # Exact type match (subclasses such as bool are excluded),
                # the same filter the parser has always applied.
                doc_metadata = {
                    key: value
                    for key, value in doc.metadata.items()
                    if type(value) in (str, int)
                }
                for page in pages:
                    # A fresh dict per page; document-level entries come last
                    # so they keep overriding same-named keys.
                    yield Document(
                        page_content=page.extract_text(**self.text_kwargs),
                        metadata={
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.page_number,
                            "total_pages": total_pages,
                            **doc_metadata,
                        },
                    )