Module kgforge.utils

Expand source code
# Re-export the package's utility classes so that they can be imported
# directly from ``kgforge.utils``.
from kgforge.utils.openalex_util import OpenAlexUtil
from kgforge.utils.pdfreader import TextLoader

# Names exported by ``from kgforge.utils import *``.
__all__ = ["TextLoader", "OpenAlexUtil"]

Sub-modules

kgforge.utils.openalex_util
kgforge.utils.pdfreader

Classes

class OpenAlexUtil (config: OpenAlexUtilConfig = <kgforge.utils.openalex_util.OpenAlexUtilConfig object>)

Provides functionality to fetch artifacts from OpenAlex.

Expand source code
class OpenAlexUtil:
    """Provides functionality to fetch artifacts from OpenAlex."""

    # Upper bound (seconds) on how long a single search request may block;
    # without a timeout ``requests.get`` can wait forever on a stalled server.
    _REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, config: OpenAlexUtilConfig = None) -> None:
        """Creates the utility.

        Args:
            config (OpenAlexUtilConfig): Configuration providing the
                ``search_endpoint`` template. When omitted (or falsy), a fresh
                ``OpenAlexUtilConfig`` is created for this instance.
        """
        # The default is built here rather than in the signature: a default
        # expression in the signature is evaluated once, at definition time,
        # and the resulting object would be shared by every instance.
        self.config = config or OpenAlexUtilConfig()

    def search_works(self, search_query: str, results_limit: int = 25) -> List[Any]:
        """Searches for artifacts using a query.

        Usage example:
        >>> oa_util = OpenAlexUtil()
        >>> oa_util.search_works("sample-query", 25)

        Args:
            search_query (str): Query to search for artifacts. Substituted as
                the first field of ``config.search_endpoint``.
            results_limit (int): Number of results to request. Substituted as
                the second field of ``config.search_endpoint``.

        Returns:
            List[Any]: The value of the ``results`` field of the JSON response,
                unparsed. An empty list is returned when that field is
                missing, when the status code is not 200, or when any error
                occurs.

        Note:
            This method is best-effort: HTTP errors and all other exceptions
            are logged and converted into an empty list instead of being
            propagated to the caller.
        """
        url = self.config.search_endpoint.format(search_query, results_limit)

        try:
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            search_results = response.json().get("results")
        except HTTPError as http_err:
            logger.error(f"HTTP error occurred: {http_err}")
            return []
        except Exception as err:
            logger.error(f"Other error occurred: {err}")
            return []

        # raise_for_status() only rejects 4xx/5xx responses, so the explicit
        # 200 check still filters out other non-error status codes.
        if response.status_code == 200 and search_results is not None:
            return search_results
        return []

Methods

def search_works(self, search_query: str, results_limit: int = 25) ‑> List[Any]

Searches for artifacts using a query.

Usage example:

>>> oa_util = OpenAlexUtil()
>>> oa_util.search_works("sample-query", 25)

Args

search_query : str
Query to search for artifacts.
results_limit : int
Number of results to return.

Returns

List[Any]
List of raw result records from the OpenAlex response that match the query; an empty list if the request fails or the response contains no results.

Raises

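HTTPError
Caught internally if an HTTP error occurs while searching for artifacts; the error is logged and an empty list is returned.
Exception
Caught internally if any other error occurs while searching for artifacts; the error is logged and an empty list is returned.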
Expand source code
def search_works(self, search_query: str, results_limit: int = 25) -> List[Any]:
    """Searches for artifacts using a query.

    Usage example:
    >>> oa_util = OpenAlexUtil()
    >>> oa_util.search_works("sample-query", 25)

    Args:
        search_query (str): Query to search for artifacts. Substituted as
            the first field of ``config.search_endpoint``.
        results_limit (int): Number of results to request. Substituted as
            the second field of ``config.search_endpoint``.

    Returns:
        List[Any]: The value of the ``results`` field of the JSON response,
            unparsed. An empty list is returned when that field is missing,
            when the status code is not 200, or when any error occurs.

    Note:
        This method is best-effort: HTTP errors and all other exceptions are
        logged and converted into an empty list instead of being propagated
        to the caller.
    """
    url = self.config.search_endpoint.format(search_query, results_limit)

    try:
        # Bound the wait (seconds); without a timeout ``requests.get`` can
        # block forever on a stalled server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        search_results = response.json().get("results")
    except HTTPError as http_err:
        logger.error(f"HTTP error occurred: {http_err}")
        return []
    except Exception as err:
        logger.error(f"Other error occurred: {err}")
        return []

    # raise_for_status() only rejects 4xx/5xx responses, so the explicit 200
    # check still filters out other non-error status codes.
    if response.status_code == 200 and search_results is not None:
        return search_results
    return []
class TextLoader

Reads text from a variety of sources.

Expand source code
class TextLoader:
    """Reads text from a variety of sources."""

    # Upper bound (seconds) on how long a PDF download may block; without a
    # timeout ``requests.get`` can wait forever on a stalled server.
    _DOWNLOAD_TIMEOUT_SECONDS = 30

    @staticmethod
    def _extract_columns(stream) -> List[str]:
        """Extracts the text of a PDF stream and splits it at blank lines.

        Shared by the file-based and URL-based readers so both apply the same
        extraction and splitting rules.

        Args:
            stream: Binary file-like object containing the PDF data.

        Returns:
            List[str]: A single-element list holding the whole text when it
                contains no blank line (``"\\n\\n"``), otherwise the pieces
                obtained by splitting the text on blank lines.
        """
        resource_manager = PDFResourceManager()
        file_handle = io.StringIO()
        converter = TextConverter(
            resource_manager, file_handle, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(
                stream, caching=True, check_extractable=True
            ):
                page_interpreter.process_page(page)
            text = file_handle.getvalue()
        finally:
            # Release the converter and buffer even when parsing fails.
            converter.close()
            file_handle.close()

        if "\n\n" not in text:
            logger.info("Single column PDF detected.")
            return [text]

        logger.info("Multi column PDF detected.")
        return text.split("\n\n")

    @staticmethod
    def _read_pdf(path: str) -> List[str]:
        """Reads text from a PDF file.

        Usage example:
        >>> loader = TextLoader()
        >>> loader._read_pdf("path/to/file.pdf")

        Args:
            path (str): Path to the PDF file.

        Returns:
            List[str]: The extracted text split at blank lines; a
                single-element list when the text contains no blank line.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If an error occurs while reading the PDF.
        """
        try:
            with open(path, "rb") as file:
                return TextLoader._extract_columns(file)
        except FileNotFoundError:
            logger.error("File not found.")
            # Bare ``raise`` keeps the original error (filename, traceback)
            # instead of replacing it with an empty FileNotFoundError.
            raise
        except Exception as e:
            logger.error("Error occurred while reading PDF. " + str(e))
            raise

    @staticmethod
    def read_pdf_from_url(url: str = None) -> List[str]:
        """Reads PDF file from an online URL.

        Usage example:
        >>> loader = TextLoader()
        >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

        Args:
            url (str): URL of the PDF file.

        Returns:
            List[str]: The extracted text split at blank lines; a
                single-element list when the text contains no blank line.
                ``None`` is returned when downloading or parsing fails (the
                error is logged rather than raised).

        Raises:
            ValueError: If no URL is provided (``None`` or an empty string).
        """
        if not url:
            raise ValueError("URL cannot be empty")

        try:
            response = requests.get(
                url, timeout=TextLoader._DOWNLOAD_TIMEOUT_SECONDS
            )
            # Fail early on 4xx/5xx instead of feeding an error page to the
            # PDF parser.
            response.raise_for_status()
            return TextLoader._extract_columns(io.BytesIO(response.content))
        except Exception as e:
            logger.error("Error occurred while reading PDF. " + str(e))
            return None

Static methods

def read_pdf_from_url(url: str = None) ‑> List[str]

Reads PDF file from an online URL.

Usage example:

>>> loader = TextLoader()
>>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

Args

url : str
URL of the PDF file.

Returns

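List[str]
Text from the PDF file, split into chunks at blank lines (a single-element list if the text contains no blank line). None is returned if downloading or parsing the PDF fails.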

Raises

ValueError
If no URL is provided.
Expand source code
@staticmethod
def read_pdf_from_url(url: str = None) -> List[str]:
    """Reads PDF file from an online URL.

    Usage example:
    >>> loader = TextLoader()
    >>> loader.read_pdf_from_url("https://arxiv.org/pdf/2106.01558.pdf")

    Args:
        url (str): URL of the PDF file.

    Returns:
        List[str]: The extracted text split at blank lines; a single-element
            list when the text contains no blank line. ``None`` is returned
            when downloading or parsing fails (the error is logged rather
            than raised).

    Raises:
        ValueError: If no URL is provided (``None`` or an empty string).
    """
    if not url:
        raise ValueError("URL cannot be empty")

    try:
        # Bound the wait (seconds); without a timeout ``requests.get`` can
        # block forever on a stalled server.
        response = requests.get(url, timeout=30)
        # Fail early on 4xx/5xx instead of feeding an error page to the
        # PDF parser.
        response.raise_for_status()

        resource_manager = PDFResourceManager()
        file_handle = io.StringIO()
        converter = TextConverter(
            resource_manager, file_handle, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(
                io.BytesIO(response.content), caching=True, check_extractable=True
            ):
                page_interpreter.process_page(page)
            text = file_handle.getvalue()
        finally:
            # Release the converter and buffer even when parsing fails.
            converter.close()
            file_handle.close()

        if "\n\n" not in text:
            logger.info("Single column PDF detected.")
            return [text]

        logger.info("Multi column PDF detected.")
        return text.split("\n\n")

    except Exception as e:
        logger.error("Error occurred while reading PDF. " + str(e))
        return None