Module kgforge.utils
Expand source code
from kgforge.utils.openalex_util import OpenAlexUtil
from kgforge.utils.pdfreader import TextLoader
__all__ = ["TextLoader", "OpenAlexUtil"]
Sub-modules
kgforge.utils.openalex_util
kgforge.utils.pdfreader
Classes
class OpenAlexUtil (config: OpenAlexUtilConfig = <kgforge.utils.openalex_util.OpenAlexUtilConfig object>)
-
Provides functionality to fetch artifacts from OpenAlex.
Expand source code
class OpenAlexUtil:
    """Provides functionality to fetch artifacts from OpenAlex.

    Args:
        config: Settings object whose ``search_endpoint`` attribute is a
            format string that receives the query and the result limit as
            positional arguments. When omitted (or falsy), a new
            ``OpenAlexUtilConfig`` is created for this instance.
    """

    def __init__(self, config: "OpenAlexUtilConfig | None" = None) -> None:
        # The default config is built here rather than in the signature: a
        # default written as ``OpenAlexUtilConfig()`` is evaluated once, when
        # the class is defined, and then shared by every instance.
        self.config = config or OpenAlexUtilConfig()

    def search_works(
        self, search_query: str, results_limit: int = 25, timeout: float = 30.0
    ) -> List[Any]:
        """Searches for artifacts using a query.

        This is a best-effort call: failures are logged and reported to the
        caller as an empty list, never as an exception.

        Usage example:
        >>> oa_util = OpenAlexUtil()  # doctest: +SKIP
        >>> oa_util.search_works("sample-query", 25)  # doctest: +SKIP

        Args:
            search_query (str): Query to search for artifacts. It is inserted
                into the endpoint template verbatim, so it should already be
                safe to embed in a URL.
            results_limit (int): Number of results to return.
            timeout (float): Seconds to wait for the server before giving up.
                Pass ``None`` to wait indefinitely.

        Returns:
            List[Any]: The ``"results"`` entry of the JSON response, or an
            empty list when the entry is missing, the status is not 200, or
            any error occurs.
        """
        url = self.config.search_endpoint.format(search_query, results_limit)
        try:
            # Without a timeout an unresponsive server blocks the caller
            # forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            search_results = response.json().get("results")
        except HTTPError as http_err:
            logger.error(f"HTTP error occurred: {http_err}")
            return []
        except Exception as err:
            # Deliberately broad: connection failures, timeouts and malformed
            # JSON all degrade to "no results".
            logger.error(f"Other error occurred: {err}")
            return []

        if response.status_code == 200 and search_results is not None:
            return search_results
        return []
Methods
def search_works(self, search_query: str, results_limit: int = 25) ‑> List[Any]
-
Searches for artifacts using a query.
Usage example:
oa_util = OpenAlexUtil()
oa_util.search_works("sample-query", 25)
Args
search_query
:str
- Query to search for artifacts.
results_limit
:int
- Number of results to return.
Returns
List[Any]
- The "results" entry of the JSON response for the query, returned as-is; an empty list if the entry is missing or the request fails.
Raises
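HTTPError
- Not propagated to the caller: an HTTP error that occurs while searching for artifacts is logged and an empty list is returned.
Exception
- Not propagated to the caller: any other error that occurs while searching for artifacts is logged and an empty list is returned.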
Expand source code
def search_works(
    self, search_query: str, results_limit: int = 25, timeout: float = 30.0
) -> List[Any]:
    """Searches for artifacts using a query.

    This is a best-effort call: failures are logged and reported to the
    caller as an empty list, never as an exception.

    Usage example:
    >>> oa_util = OpenAlexUtil()  # doctest: +SKIP
    >>> oa_util.search_works("sample-query", 25)  # doctest: +SKIP

    Args:
        search_query (str): Query to search for artifacts. It is inserted
            into the endpoint template verbatim, so it should already be
            safe to embed in a URL.
        results_limit (int): Number of results to return.
        timeout (float): Seconds to wait for the server before giving up.
            Pass ``None`` to wait indefinitely.

    Returns:
        List[Any]: The ``"results"`` entry of the JSON response, or an
        empty list when the entry is missing, the status is not 200, or
        any error occurs.
    """
    url = self.config.search_endpoint.format(search_query, results_limit)
    try:
        # Without a timeout an unresponsive server blocks the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        search_results = response.json().get("results")
    except HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err}")
        return []
    except Exception as err:
        # Deliberately broad: connection failures, timeouts and malformed
        # JSON all degrade to "no results".
        logger.error(f"Other error occurred: {err}")
        return []

    if response.status_code == 200 and search_results is not None:
        return search_results
    return []
class TextLoader
-
Reads text from a variety of sources.
Expand source code
class TextLoader:
    """Reads text from a variety of sources."""

    @staticmethod
    def _extract_chunks(stream) -> List[str]:
        """Extracts the text of a PDF byte stream and splits it on blank lines.

        Args:
            stream: Binary file-like object positioned at the start of a PDF.

        Returns:
            List[str]: The extracted text split on ``"\\n\\n"``; a single
            element when the text contains no blank line.
        """
        resource_manager = PDFResourceManager()
        with io.StringIO() as text_buffer:
            converter = TextConverter(
                resource_manager, text_buffer, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    stream, caching=True, check_extractable=True
                ):
                    page_interpreter.process_page(page)
                text = text_buffer.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()

        # str.split already yields [text] when the separator is absent, so
        # the single/multi distinction only affects the log message.
        columns = text.split("\n\n")
        if len(columns) == 1:
            logger.info("Single column PDF detected.")
        else:
            logger.info("Multi column PDF detected.")
        return columns

    @staticmethod
    def _read_pdf(path: str) -> List[str]:
        """Reads text from a PDF file.

        Usage example:
        >>> loader = TextLoader()  # doctest: +SKIP
        >>> loader._read_pdf("path/to/file.pdf")  # doctest: +SKIP

        Args:
            path (str): Path to the PDF file.

        Returns:
            List[str]: Extracted text split on blank lines (the code refers
            to these pieces as "columns").

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If an error occurs while reading the PDF.
        """
        try:
            with open(path, "rb") as file:
                return TextLoader._extract_chunks(file)
        except FileNotFoundError:
            logger.error("File not found.")
            # Bare raise keeps the original errno, filename and traceback.
            raise
        except Exception as e:
            logger.error("Error occurred while reading PDF. " + str(e))
            raise

    @staticmethod
    def read_pdf_from_url(url: str = None, timeout: float = 30.0) -> List[str]:
        """Reads PDF file from an online URL.

        Usage example:
        >>> loader = TextLoader()  # doctest: +SKIP
        >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")  # doctest: +SKIP

        Args:
            url (str): URL of the PDF file.
            timeout (float): Seconds to wait for the server before giving up.
                Pass ``None`` to wait indefinitely.

        Returns:
            List[str]: Extracted text split on blank lines, or ``None`` when
            the download or the PDF parsing fails (the error is logged).

        Raises:
            ValueError: If no URL is provided (``None`` or an empty string).
        """
        if not url:
            raise ValueError("URL cannot be empty")
        try:
            response = requests.get(url, timeout=timeout)
            # Fail on 4xx/5xx here instead of feeding an error page to the
            # PDF parser.
            response.raise_for_status()
            return TextLoader._extract_chunks(io.BytesIO(response.content))
        except Exception as e:
            # Best-effort by design: callers receive None instead of an
            # exception.
            logger.error("Error occurred while reading PDF. " + str(e))
            return None
Static methods
def read_pdf_from_url(url: str = None) ‑> List[str]
-
Reads PDF file from an online URL.
Usage example:
>>> loader = TextLoader() >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")
Args
url
:str
- URL of the PDF file.
Returns
List[str]
- Text from the PDF file, split on blank lines; None is returned instead if the download or the parsing fails.
Raises
ValueError
- If no URL is provided.
Expand source code
@staticmethod
def read_pdf_from_url(url: str = None, timeout: float = 30.0) -> List[str]:
    """Reads PDF file from an online URL.

    Usage example:
    >>> loader = TextLoader()  # doctest: +SKIP
    >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")  # doctest: +SKIP

    Args:
        url (str): URL of the PDF file.
        timeout (float): Seconds to wait for the server before giving up.
            Pass ``None`` to wait indefinitely.

    Returns:
        List[str]: Extracted text split on blank lines, or ``None`` when the
        download or the PDF parsing fails (the error is logged).

    Raises:
        ValueError: If no URL is provided (``None`` or an empty string).
    """
    if not url:
        raise ValueError("URL cannot be empty")
    try:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx here instead of feeding an error page to the PDF
        # parser.
        response.raise_for_status()

        resource_manager = PDFResourceManager()
        with io.StringIO() as text_buffer:
            converter = TextConverter(
                resource_manager, text_buffer, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    io.BytesIO(response.content),
                    caching=True,
                    check_extractable=True,
                ):
                    page_interpreter.process_page(page)
                text = text_buffer.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()

        # str.split already yields [text] when the separator is absent, so
        # the single/multi distinction only affects the log message.
        columns = text.split("\n\n")
        if len(columns) == 1:
            logger.info("Single column PDF detected.")
        else:
            logger.info("Multi column PDF detected.")
        return columns
    except Exception as e:
        # Best-effort by design: callers receive None instead of an exception.
        logger.error("Error occurred while reading PDF. " + str(e))
        return None