Module kgforge.utils.pdfreader
Expand source code
import io
import logging
from typing import List
import requests
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
logger = logging.getLogger(__name__)
class TextLoader:
    """Reads text from a variety of sources."""

    # Per-operation (connect/read) timeout, in seconds, for PDF downloads, so
    # that a stalled server cannot block the caller indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 30

    @staticmethod
    def _extract_columns(stream) -> List[str]:
        """Extracts the text of a PDF from a binary stream and splits it.

        Shared by ``_read_pdf`` and ``read_pdf_from_url`` so both code paths
        parse and split PDFs in exactly the same way.

        Args:
            stream: Binary file-like object positioned at the start of the
                PDF data (an open file or an ``io.BytesIO``).

        Returns:
            List[str]: The extracted text split on blank lines (two
                consecutive newlines). The code refers to each chunk as a
                "column". If the text contains no blank line, a
                single-element list holding the whole text is returned.
        """
        resource_manager = PDFResourceManager()
        with io.StringIO() as file_handle:
            converter = TextConverter(
                resource_manager, file_handle, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    stream, caching=True, check_extractable=True
                ):
                    page_interpreter.process_page(page)
                # Read the buffer before it is closed by the ``with`` block.
                text = file_handle.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()

        if "\n\n" not in text:
            logger.info("Single column PDF detected.")
            return [text]
        logger.info("Multi column PDF detected.")
        return text.split("\n\n")

    @staticmethod
    def _read_pdf(path: str) -> List[str]:
        """Reads text from a PDF file.

        Usage example:
        >>> loader = TextLoader()
        >>> loader._read_pdf("path/to/file.pdf")

        Args:
            path (str): Path to the PDF file.

        Returns:
            List[str]: Extracted text split on blank lines; each string is
                treated as one column of the PDF.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If an error occurs while reading the PDF.
        """
        try:
            with open(path, "rb") as file:
                return TextLoader._extract_columns(file)
        except FileNotFoundError:
            logger.error("File not found.")
            # Bare ``raise`` keeps the original exception (filename, errno)
            # instead of replacing it with an empty FileNotFoundError.
            raise
        except Exception as e:
            logger.error("Error occurred while reading PDF. %s", e)
            raise

    @staticmethod
    def read_pdf_from_url(url: str = None) -> List[str]:
        """Reads PDF file from an online URL.

        Usage example:
        >>> loader = TextLoader()
        >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

        Args:
            url (str): URL of the PDF file.

        Returns:
            List[str]: Extracted text split on blank lines. ``None`` is
                returned (and the error is logged) when the download or the
                PDF parsing fails; callers must check for it.

        Raises:
            ValueError: If no URL is provided.
        """
        if url is None:
            raise ValueError("URL cannot be empty")
        try:
            response = requests.get(
                url, timeout=TextLoader._REQUEST_TIMEOUT_SECONDS
            )
            # Fail on HTTP errors (404, 500, ...) rather than handing an
            # error page to the PDF parser.
            response.raise_for_status()
            return TextLoader._extract_columns(io.BytesIO(response.content))
        except Exception as e:
            # Best-effort contract kept for existing callers: log and
            # return None instead of propagating.
            logger.error("Error occurred while reading PDF. %s", e)
            return None
Classes
class TextLoader
-
Reads text from a variety of sources.
Expand source code
class TextLoader:
    """Reads text from a variety of sources."""

    # Per-operation (connect/read) timeout, in seconds, for PDF downloads, so
    # that a stalled server cannot block the caller indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 30

    @staticmethod
    def _extract_columns(stream) -> List[str]:
        """Extracts the text of a PDF from a binary stream and splits it.

        Shared by ``_read_pdf`` and ``read_pdf_from_url`` so both code paths
        parse and split PDFs in exactly the same way.

        Args:
            stream: Binary file-like object positioned at the start of the
                PDF data (an open file or an ``io.BytesIO``).

        Returns:
            List[str]: The extracted text split on blank lines (two
                consecutive newlines). The code refers to each chunk as a
                "column". If the text contains no blank line, a
                single-element list holding the whole text is returned.
        """
        resource_manager = PDFResourceManager()
        with io.StringIO() as file_handle:
            converter = TextConverter(
                resource_manager, file_handle, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    stream, caching=True, check_extractable=True
                ):
                    page_interpreter.process_page(page)
                # Read the buffer before it is closed by the ``with`` block.
                text = file_handle.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()

        if "\n\n" not in text:
            logger.info("Single column PDF detected.")
            return [text]
        logger.info("Multi column PDF detected.")
        return text.split("\n\n")

    @staticmethod
    def _read_pdf(path: str) -> List[str]:
        """Reads text from a PDF file.

        Usage example:
        >>> loader = TextLoader()
        >>> loader._read_pdf("path/to/file.pdf")

        Args:
            path (str): Path to the PDF file.

        Returns:
            List[str]: Extracted text split on blank lines; each string is
                treated as one column of the PDF.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If an error occurs while reading the PDF.
        """
        try:
            with open(path, "rb") as file:
                return TextLoader._extract_columns(file)
        except FileNotFoundError:
            logger.error("File not found.")
            # Bare ``raise`` keeps the original exception (filename, errno)
            # instead of replacing it with an empty FileNotFoundError.
            raise
        except Exception as e:
            logger.error("Error occurred while reading PDF. %s", e)
            raise

    @staticmethod
    def read_pdf_from_url(url: str = None) -> List[str]:
        """Reads PDF file from an online URL.

        Usage example:
        >>> loader = TextLoader()
        >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

        Args:
            url (str): URL of the PDF file.

        Returns:
            List[str]: Extracted text split on blank lines. ``None`` is
                returned (and the error is logged) when the download or the
                PDF parsing fails; callers must check for it.

        Raises:
            ValueError: If no URL is provided.
        """
        if url is None:
            raise ValueError("URL cannot be empty")
        try:
            response = requests.get(
                url, timeout=TextLoader._REQUEST_TIMEOUT_SECONDS
            )
            # Fail on HTTP errors (404, 500, ...) rather than handing an
            # error page to the PDF parser.
            response.raise_for_status()
            return TextLoader._extract_columns(io.BytesIO(response.content))
        except Exception as e:
            # Best-effort contract kept for existing callers: log and
            # return None instead of propagating.
            logger.error("Error occurred while reading PDF. %s", e)
            return None
Static methods
def read_pdf_from_url(url: str = None) ‑> List[str]
-
Reads PDF file from an online URL.
Usage example:
>>> loader = TextLoader()
>>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")
Args
url
:str
- URL of the PDF file.
Returns
List[str]
- Text from the PDF file, split on blank lines. None is returned if an error occurs while downloading or reading the PDF.
Raises
ValueError
- If no URL is provided.
Expand source code
@staticmethod
def read_pdf_from_url(url: str = None) -> List[str]:
    """Reads PDF file from an online URL.

    Usage example:
    >>> loader = TextLoader()
    >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

    Args:
        url (str): URL of the PDF file.

    Returns:
        List[str]: Extracted text split on blank lines (two consecutive
            newlines); a single-element list when the text contains no
            blank line. ``None`` is returned (and the error is logged) when
            the download or the PDF parsing fails; callers must check for it.

    Raises:
        ValueError: If no URL is provided.
    """
    if url is None:
        raise ValueError("URL cannot be empty")
    try:
        # Bound the wait on a stalled server; the timeout applies to each
        # connect/read operation, not to the whole download.
        response = requests.get(url, timeout=30)
        # Fail on HTTP errors (404, 500, ...) rather than handing an error
        # page to the PDF parser.
        response.raise_for_status()
        resource_manager = PDFResourceManager()
        with io.StringIO() as file_handle:
            converter = TextConverter(
                resource_manager, file_handle, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    io.BytesIO(response.content),
                    caching=True,
                    check_extractable=True,
                ):
                    page_interpreter.process_page(page)
                # Read the buffer before it is closed by the ``with`` block.
                text = file_handle.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()
    except Exception as e:
        # Best-effort contract kept for existing callers: log and return
        # None instead of propagating.
        logger.error("Error occurred while reading PDF. %s", e)
        return None

    if "\n\n" not in text:
        logger.info("Single column PDF detected.")
        return [text]
    logger.info("Multi column PDF detected.")
    return text.split("\n\n")