Source code for langchain.document_loaders.parsers.pdf
"""Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional, Union
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
class PyPDFParser(BaseBlobParser):
    """Parse a PDF blob with pypdf, producing one ``Document`` per page."""

    def __init__(self, password: Optional[Union[str, bytes]] = None) -> None:
        """Initialize the parser.

        Args:
            password: Optional password forwarded to ``pypdf.PdfReader``.
        """
        self.password = password

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        Args:
            blob: Blob whose bytes are read through ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` holding the text extracted from one page, with
            ``source`` (``blob.source``) and the zero-based ``page`` index
            in its metadata.
        """
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            # Yield page by page rather than building the full list up front,
            # so text is only extracted for pages the caller actually consumes.
            for page_number, page in enumerate(pdf_reader.pages):
                yield Document(
                    page_content=page.extract_text(),
                    metadata={"source": blob.source, "page": page_number},
                )
class PDFMinerParser(BaseBlobParser):
    """Blob parser backed by PDFMiner's high-level text extraction."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Yield a single document containing the text extracted from the blob.

        Args:
            blob: Blob whose bytes are read through ``blob.as_bytes_io()``.

        Yields:
            Exactly one ``Document``; its metadata carries ``blob.source``
            under the ``source`` key.
        """
        from pdfminer.high_level import extract_text as pdfminer_extract_text

        with blob.as_bytes_io() as stream:
            # The whole file is extracted in one call, so there is no
            # per-page split here.
            yield Document(
                page_content=pdfminer_extract_text(stream),
                metadata={"source": blob.source},
            )
class PyMuPDFParser(BaseBlobParser):
    """Parse PDFs with PyMuPDF, producing one ``Document`` per page."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
        """
        self.text_kwargs = text_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        Args:
            blob: Blob whose bytes are read through ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` per page. Its metadata holds ``source`` and
            ``file_path`` (both ``blob.source``), ``page`` (``page.number``),
            ``total_pages``, plus every document-level metadata entry whose
            value is exactly a ``str`` or an ``int``.
        """
        import fitz

        # The document is a context manager, so it is closed (before the
        # underlying byte stream) once iteration finishes or is abandoned.
        with blob.as_bytes_io() as file_path, fitz.open(file_path) as doc:
            # Both values are identical for every page: compute them once.
            total_pages = len(doc)
            # Exact type match on purpose: values of any other type
            # (including bool and str/int subclasses) are dropped.
            doc_metadata = {
                key: value
                for key, value in doc.metadata.items()
                if type(value) in (str, int)
            }
            for page in doc:
                yield Document(
                    page_content=page.get_text(**self.text_kwargs),
                    # A fresh dict per page so documents never share state;
                    # document-level entries win on key collisions.
                    metadata={
                        "source": blob.source,
                        "file_path": blob.source,
                        "page": page.number,
                        "total_pages": total_pages,
                        **doc_metadata,
                    },
                )
class PyPDFium2Parser(BaseBlobParser):
    """Parse PDFs with PyPDFium2."""

    def __init__(self) -> None:
        """Initialize the parser.

        Raises:
            ImportError: If the ``pypdfium2`` package cannot be imported.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            ) from err

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        Args:
            blob: Blob whose bytes are read through ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` holding one page's text, with ``source``
            (``blob.source``) and the zero-based ``page`` index as metadata.
        """
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things,
        # if done incorrectly creates seg faults.
        with blob.as_bytes_io() as file_path:
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    text_page = page.get_textpage()
                    content = text_page.get_text_range()
                    # Close the text page, then the page, before handing
                    # control back to the caller; keep this order.
                    text_page.close()
                    page.close()
                    metadata = {"source": blob.source, "page": page_number}
                    yield Document(page_content=content, metadata=metadata)
            finally:
                # Runs on exhaustion, on error, and when the generator is
                # closed early.
                pdf_reader.close()
class PDFPlumberParser(BaseBlobParser):
    """Parse PDFs with PDFPlumber, producing one ``Document`` per page."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
        """
        self.text_kwargs = text_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one document per page.

        Args:
            blob: Blob whose bytes are read through ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` per page. Its metadata holds ``source`` and
            ``file_path`` (both ``blob.source``), ``page``
            (``page.page_number``), ``total_pages``, plus every
            document-level metadata entry whose value is exactly a ``str``
            or an ``int``.
        """
        import pdfplumber

        # The PDF object is a context manager, so it is closed once
        # iteration finishes or is abandoned.
        with blob.as_bytes_io() as file_path, pdfplumber.open(file_path) as doc:
            # These are identical for every page: compute them once.
            pages = doc.pages
            total_pages = len(pages)
            # Exact type match on purpose: values of any other type
            # (including bool and str/int subclasses) are dropped.
            doc_metadata = {
                key: value
                for key, value in doc.metadata.items()
                if type(value) in (str, int)
            }
            for page in pages:
                yield Document(
                    page_content=page.extract_text(**self.text_kwargs),
                    # A fresh dict per page so documents never share state;
                    # document-level entries win on key collisions.
                    metadata={
                        "source": blob.source,
                        "file_path": blob.source,
                        "page": page.page_number,
                        "total_pages": total_pages,
                        **doc_metadata,
                    },
                )