Module `kgforge.utils`

Expand source code

from kgforge.utils.openalex_util import OpenAlexUtil
from kgforge.utils.pdfreader import TextLoader

# Names re-exported as the public API of this package; both are imported
# above from their defining sub-modules.
__all__ = ["TextLoader", "OpenAlexUtil"]

Sub-modules

kgforge.utils.openalex_util
kgforge.utils.pdfreader

Classes

class OpenAlexUtil (config: OpenAlexUtilConfig = <kgforge.utils.openalex_util.OpenAlexUtilConfig object>)

Provides functionality to fetch artifacts from OpenAlex.

Expand source code

class OpenAlexUtil:
    """Provides functionality to fetch artifacts from OpenAlex."""

    def __init__(self, config: OpenAlexUtilConfig = None) -> None:
        """Stores the configuration used to build search requests.

        Args:
            config (OpenAlexUtilConfig): Configuration exposing ``search_endpoint``.
                When omitted (or falsy), a new ``OpenAlexUtilConfig()`` is created.
        """
        # The default is built here rather than in the signature: a default
        # evaluated in the signature is created once at definition time and
        # would be shared by every instance constructed without a config.
        self.config = config or OpenAlexUtilConfig()

    def search_works(self, search_query: str, results_limit: int = 25) -> List[Any]:
        """Searches for artifacts using a query.

        Usage example:
        >>> oa_util = OpenAlexUtil()
        >>> oa_util.search_works("sample-query", 25)

        Args:
            search_query (str): Query to search for artifacts.
            results_limit (int): Number of results to request from the endpoint.

        Returns:
            List[Any]: The raw entries of the ``"results"`` field of the JSON
            response. An empty list is returned when the response has no
            ``"results"`` field, when the status code is not 200, or when any
            error occurs during the request.

        Note:
            Errors are not propagated to the caller. ``HTTPError`` and any
            other exception raised while requesting or decoding the response
            are logged and turned into an empty list.
        """
        # NOTE(review): both values are interpolated into the endpoint template
        # verbatim (no URL-encoding is applied here) - confirm the template and
        # callers tolerate reserved characters such as "&" in the query.
        url = self.config.search_endpoint.format(search_query, results_limit)

        # Without a timeout a stalled connection would block the caller forever.
        # A timeout surfaces as an exception and follows the generic error path.
        request_timeout_seconds = 30

        try:
            response = requests.get(url, timeout=request_timeout_seconds)
            response.raise_for_status()
            search_results = response.json().get("results")
        except HTTPError as http_err:
            logger.info(f"HTTP error occurred: {http_err}")
            return []
        except Exception as err:
            logger.info(f"Other error occurred: {err}")
            return []

        # The raw result entries are returned as-is; conversion into
        # ResearchArtifact objects is currently disabled.
        if response.status_code == 200 and search_results is not None:
            return search_results
        return []

Methods

def search_works(self, search_query: str, results_limit: int = 25) ‑> List[Any]

Searches for artifacts using a query.

Usage example:

>>> oa_util = OpenAlexUtil()
>>> oa_util.search_works("sample-query", 25)

Args

search_query : str: Query to search for artifacts.
results_limit : int: Number of results to return.

Returns

List[Any]: Raw entries of the "results" field of the response that match the query; an empty list if there are no results or an error occurs.

Raises

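HTTPError: Caught and logged if an HTTP error occurs while searching for artifacts; an empty list is returned instead of raising.
Exception: Caught and logged if any other error occurs while searching for artifacts; an empty list is returned instead of raising.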

Expand source code

def search_works(self, search_query: str, results_limit: int = 25) -> List[Any]:
    """Searches for artifacts using a query.

    Usage example:
    >>> oa_util = OpenAlexUtil()
    >>> oa_util.search_works("sample-query", 25)

    Args:
        search_query (str): Query to search for artifacts.
        results_limit (int): Number of results to request from the endpoint.

    Returns:
        List[Any]: The raw entries of the ``"results"`` field of the JSON
        response. An empty list is returned when the response has no
        ``"results"`` field, when the status code is not 200, or when any
        error occurs during the request.

    Note:
        Errors are not propagated to the caller. ``HTTPError`` and any other
        exception raised while requesting or decoding the response are logged
        and turned into an empty list.
    """
    # NOTE(review): both values are interpolated into the endpoint template
    # verbatim (no URL-encoding is applied here) - confirm the template and
    # callers tolerate reserved characters such as "&" in the query.
    url = self.config.search_endpoint.format(search_query, results_limit)

    # Without a timeout a stalled connection would block the caller forever.
    # A timeout surfaces as an exception and follows the generic error path.
    request_timeout_seconds = 30

    try:
        response = requests.get(url, timeout=request_timeout_seconds)
        response.raise_for_status()
        search_results = response.json().get("results")
    except HTTPError as http_err:
        logger.info(f"HTTP error occurred: {http_err}")
        return []
    except Exception as err:
        logger.info(f"Other error occurred: {err}")
        return []

    # The raw result entries are returned as-is; conversion into
    # ResearchArtifact objects is currently disabled.
    if response.status_code == 200 and search_results is not None:
        return search_results
    return []

class TextLoader

Reads text from a variety of sources.

Expand source code

class TextLoader:
    """Reads text from a variety of sources."""

    @staticmethod
    def _extract_columns(stream: io.BufferedIOBase) -> List[str]:
        """Extracts the text of a PDF binary stream and splits it into chunks.

        Args:
            stream: Binary file-like object positioned at the start of a PDF.

        Returns:
            List[str]: The extracted text split on blank lines (``"\\n\\n"``).
            When the text contains no blank line, the list holds the whole
            text as its only element.
        """
        resource_manager = PDFResourceManager()
        with io.StringIO() as text_buffer:
            converter = TextConverter(
                resource_manager, text_buffer, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    stream, caching=True, check_extractable=True
                ):
                    page_interpreter.process_page(page)
                # Read the buffer before it is closed by the ``with`` block.
                text = text_buffer.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()

        if "\n\n" not in text:
            logger.info("Single column PDF detected.")
            return [text]

        logger.info("Multi column PDF detected.")
        return text.split("\n\n")

    @staticmethod
    def _read_pdf(path: str) -> List[str]:
        """Reads text from a PDF file.

        Usage example:
        >>> loader = TextLoader()
        >>> loader._read_pdf("path/to/file.pdf")

        Args:
            path (str): Path to the PDF file.

        Returns:
            List[str]: List of strings obtained by splitting the extracted
            text on blank lines; this loader treats each one as a column.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If an error occurs while reading the PDF.
        """
        try:
            with open(path, "rb") as file:
                return TextLoader._extract_columns(file)
        except FileNotFoundError:
            logger.error("File not found.")
            # Bare raise keeps the original filename, errno and traceback.
            raise
        except Exception as e:
            logger.error("Error occurred while reading PDF. " + str(e))
            raise

    @staticmethod
    def read_pdf_from_url(url: str = None) -> List[str]:
        """Reads PDF file from an online URL.

        Usage example:
        >>> loader = TextLoader()
        >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

        Args:
            url (str): URL of the PDF file.

        Returns:
            List[str]: Text from the PDF file, split on blank lines.
            ``None`` is returned when downloading or parsing fails; the error
            is logged instead of being raised.

        Raises:
            ValueError: If no URL is provided (``None`` or an empty string).
        """
        # Reject the empty string as well as None, as the message states.
        if not url:
            raise ValueError("URL cannot be empty")

        try:
            # A timeout prevents a stalled download from blocking forever.
            response = requests.get(url, timeout=30)
            # Fail early on 4xx/5xx instead of parsing an error page as a PDF.
            response.raise_for_status()
            return TextLoader._extract_columns(io.BytesIO(response.content))
        except Exception as e:
            logger.error("Error occurred while reading PDF. " + str(e))
            return None

Static methods

def read_pdf_from_url(url: str = None) ‑> List[str]

Reads PDF file from an online URL.

Usage example:

>>> loader = TextLoader()
>>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

Args

url : str: URL of the PDF file.

Returns

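List[str]: Text from the PDF file, split on blank lines. None is returned (and the error is logged) if the PDF cannot be downloaded or parsed.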

Raises

ValueError: If no URL is provided.

Expand source code

@staticmethod
def read_pdf_from_url(url: str = None) -> List[str]:
    """Reads PDF file from an online URL.

    Usage example:
    >>> loader = TextLoader()
    >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

    Args:
        url (str): URL of the PDF file.

    Returns:
        List[str]: Text from the PDF file, split on blank lines (``"\\n\\n"``).
        When the text contains no blank line, the list holds the whole text
        as its only element. ``None`` is returned when downloading or parsing
        fails; the error is logged instead of being raised.

    Raises:
        ValueError: If no URL is provided (``None`` or an empty string).
    """
    # Reject the empty string as well as None, as the message states.
    if not url:
        raise ValueError("URL cannot be empty")

    try:
        # A timeout prevents a stalled download from blocking forever.
        response = requests.get(url, timeout=30)
        # Fail early on 4xx/5xx instead of parsing an error page as a PDF.
        response.raise_for_status()

        resource_manager = PDFResourceManager()
        with io.StringIO() as text_buffer:
            converter = TextConverter(
                resource_manager, text_buffer, laparams=LAParams()
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                for page in PDFPage.get_pages(
                    io.BytesIO(response.content),
                    caching=True,
                    check_extractable=True,
                ):
                    page_interpreter.process_page(page)
                # Read the buffer before it is closed by the ``with`` block.
                text = text_buffer.getvalue()
            finally:
                # Release the converter even when a page fails to parse.
                converter.close()

        if "\n\n" not in text:
            logger.info("Single column PDF detected.")
            columns = [text]
        else:
            logger.info("Multi column PDF detected.")
            columns = text.split("\n\n")

        return columns

    except Exception as e:
        logger.error("Error occurred while reading PDF. " + str(e))
        return None