Source code for curies.dataframe

"""Dataframe operations."""

from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable, Collection
from typing import TYPE_CHECKING, Any, Literal, TypeAlias, TypeGuard, Union, cast

from .utils import _prefix_from_curie

if TYPE_CHECKING:
    import pandas as pd

    from .api import Converter

__all__ = [
    "PrefixIndexMethod",
    "filter_df_by_curies",
    "filter_df_by_prefixes",
    "get_df_curies_index",
    "get_df_prefixes_index",
    "get_df_unique_prefixes",
    "get_filter_df_by_curies_index",
    "get_filter_df_by_prefixes_index",
]

#: Either a dataframe (from which a column is selected) or a series of strings
DataframeOrSeries: TypeAlias = Union["pd.DataFrame", "pd.Series[str]"]


def _get_prefix_checker(prefix: str | Collection[str]) -> Callable[[str], bool]:
    """Build a predicate that tests whether a CURIE begins with the given prefix(es).

    :param prefix: A single prefix, or a collection of prefixes, given without the
        trailing colon

    :returns: A function that takes a CURIE string and returns true if it starts with
        the prefix (or any one of the prefixes) followed by a colon
    """
    # The colon is appended so only a complete prefix matches,
    # e.g., ``EFO`` matches ``EFO:1`` but not ``EFOX:1``.
    if isinstance(prefix, str):
        targets: str | tuple[str, ...] = prefix + ":"
    else:
        # ``str.startswith`` accepts a tuple and matches if any element matches;
        # an empty tuple never matches.
        targets = tuple(p + ":" for p in prefix)

    def _func(curie: str) -> bool:
        return curie.startswith(targets)

    return _func


def _get_prefixes_from_curie_column(
    df: DataframeOrSeries,
    *,
    column: int | str | None = None,
    converter: Converter | None = None,
    validate: bool,
) -> pd.Series[str]:
    """Map each CURIE in the selected column (or the given series) to its prefix.

    :param df: A dataframe or series
    :param column: The column to use, if a dataframe was passed
    :param converter: A converter, passed through to :func:`_get_curie_parser`
    :param validate: Passed through to :func:`_get_curie_parser`

    :returns: The result of mapping the prefix-extracting function over the series
    """
    # The series is selected before the parser is built, so errors from column/dtype
    # checks surface before errors about a missing converter.
    curies = _get_series(df, column)
    parse_prefix = _get_curie_parser(converter=converter, validate=validate)
    return curies.map(parse_prefix)


def _get_curie_parser(
    *, converter: Converter | None = None, validate: bool = False
) -> Callable[[str], str]:
    """Get a function that takes a CURIE and returns its prefix.

    :param converter: The converter used to parse CURIEs. Only used when ``validate``
        is true.
    :param validate: If false, :func:`_prefix_from_curie` is returned as-is. If true,
        the returned function parses each CURIE with ``converter.parse_curie(...,
        strict=True)`` and returns the prefix of the resulting reference.

    :returns: A function from a CURIE string to a prefix string

    :raises ValueError: If ``validate`` is true but no converter is given
    """
    # TODO what if it can't parse?
    # TODO handle None?
    # TODO handle invalid CURIEs?

    if not validate:
        return _prefix_from_curie

    if converter is None:
        raise ValueError("converter is required for validation")

    def _validated_prefix(curie: str) -> str:
        return converter.parse_curie(curie, strict=True).prefix

    return _validated_prefix


#: The method for filtering on prefixes. ``"iterative"`` checks each CURIE against the
#: requested prefix(es) with ``str.startswith``, while ``"precalculated"`` first
#: extracts the prefix of each CURIE and then compares it to the requested prefix(es).
PrefixIndexMethod: TypeAlias = Literal["iterative", "precalculated"]



[docs]
def get_filter_df_by_prefixes_index(
    df: DataframeOrSeries,
    *,
    column: str | int | None = None,
    prefixes: str | Collection[str],
    method: PrefixIndexMethod | None = None,
    validate: bool = False,
    converter: Converter | None = None,
) -> pd.Series[bool]:
    """Get an index of CURIEs in the given column that start with the prefix(es).

    :param df: A dataframe or series. If a dataframe is given, the ``column`` must not
        be none.
    :param column: The column to check, if a dataframe was passed. If a series was
        passed, this can be left as none.
    :param prefixes: The prefix or set of prefixes to identify
    :param method: The indexing method. With ``"iterative"`` (used when none is given),
        each CURIE is checked with a string prefix test and neither ``validate`` nor
        ``converter`` are used. With ``"precalculated"``, the prefix of each CURIE is
        extracted first and then compared to the given prefix(es).
    :param validate: Should the prefixes be validated against the converter? Only
        applies to the ``"precalculated"`` method.
    :param converter: A converter for validating CURIEs. Only required if ``validate``
        is true.

    :returns: A pandas boolean series that corresponds to the rows of the dataframe or
        series provided

    :raises ValueError: If the ``"precalculated"`` method is used with validation set to
        true but no converter is passed, or if an unknown method is given

    Example usage:

    .. code-block:: python

        import pandas as pd
        from curies.dataframe import get_filter_df_by_prefixes_index

        rows = [
            ("DOID:0080795", "skos:exactMatch", "EFO:0003029", "semapv:ManualMappingCuration"),
            ("DOID:0080795", "skos:exactMatch", "mesh:D015471", "semapv:ManualMappingCuration"),
            ("DOID:0080799", "skos:exactMatch", "EFO:1000527", "semapv:ManualMappingCuration"),
            (
                "DOID:0080808",
                "skos:exactMatch",
                "mesh:D000069295",
                "semapv:ManualMappingCuration",
            ),
        ]
        df = pd.DataFrame(
            rows, columns=["subject_id", "predicate_id", "object_id", "mapping_justification"]
        )
        idx = get_filter_df_by_prefixes_index(df, column="object_id", prefixes=["EFO"])
        filtered_df = df[idx]
    """
    if method == "iterative" or method is None:
        return _get_series(df, column).map(_get_prefix_checker(prefixes))
    elif method == "precalculated":
        # A converter is only consulted when validating; without validation the
        # prefixes are extracted directly from the CURIE strings. The check happens
        # up front so this error takes precedence over column/dtype errors.
        if validate and converter is None:  # pragma: no cover
            raise ValueError(
                'a converter is required for validation with the "precalculated" method'
            )
        prefix_series = _get_prefixes_from_curie_column(
            df, column=column, converter=converter, validate=validate
        )
        if isinstance(prefixes, str):
            return prefix_series == prefixes
        else:
            return prefix_series.isin(prefixes)
    else:  # pragma: no cover
        raise ValueError(f"invalid method given: {method}")




[docs]
def filter_df_by_prefixes(
    df: pd.DataFrame,
    *,
    column: str | int,
    prefixes: str | Collection[str],
    method: PrefixIndexMethod | None = None,
    validate: bool = False,
    converter: Converter | None = None,
) -> pd.DataFrame:
    """Keep the rows of a dataframe whose CURIE in a given column has one of the given prefixes.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param prefixes: The prefix (given as a string) or collection of prefixes (given as
        a list, set, etc.) to keep
    :param method: The implementation for getting the prefix index
    :param validate: Should the prefixes be validated against the converter?
    :param converter: A converter for validating CURIEs

    :returns: The result of indexing ``df`` with the boolean index of matching rows

    Example usage:

    .. code-block:: python

        import pandas as pd
        from curies.dataframe import filter_df_by_prefixes

        rows = [
            ("DOID:0080795", "skos:exactMatch", "EFO:0003029", "semapv:ManualMappingCuration"),
            ("DOID:0080795", "skos:exactMatch", "mesh:D015471", "semapv:ManualMappingCuration"),
            ("DOID:0080799", "skos:exactMatch", "EFO:1000527", "semapv:ManualMappingCuration"),
            (
                "DOID:0080808",
                "skos:exactMatch",
                "mesh:D000069295",
                "semapv:ManualMappingCuration",
            ),
        ]
        df = pd.DataFrame(
            rows, columns=["subject_id", "predicate_id", "object_id", "mapping_justification"]
        )
        filtered_df = filter_df_by_prefixes(df, column="object_id", prefixes=["EFO"])

    This results in the following dataframe:

    ============ =============== =========== ============================
    subject_id   predicate_id    object_id   mapping_justification
    ============ =============== =========== ============================
    DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration
    DOID:0080799 skos:exactMatch EFO:1000527 semapv:ManualMappingCuration
    ============ =============== =========== ============================

    Internally, this function uses :func:`get_filter_df_by_prefixes_index`.
    """
    return df[
        get_filter_df_by_prefixes_index(
            df,
            column=column,
            prefixes=prefixes,
            method=method,
            validate=validate,
            converter=converter,
        )
    ]




[docs]
def get_filter_df_by_curies_index(
    df: DataframeOrSeries,
    *,
    column: str | int | None = None,
    curies: str | Collection[str],
) -> pd.Series[bool]:
    """Get a boolean index of the rows whose value in the given column is one of the CURIE(s).

    :param df: A dataframe or series. If a dataframe is given, the ``column`` must not
        be none.
    :param column: The column to check, if a dataframe was passed. If a series was
        passed, this can be left as none.
    :param curies: The CURIE (given as a string) or collection of CURIEs to identify

    :returns: A pandas boolean series that corresponds to the rows of the dataframe or
        series provided
    """
    series = _get_series(df, column)
    if not isinstance(curies, str):
        # Multiple CURIEs: membership test against the de-duplicated collection
        return series.isin(set(curies))
    # A single CURIE: element-wise equality
    return series == curies




[docs]
def get_df_curies_index(
    df: DataframeOrSeries, *, column: str | int | None = None
) -> dict[str, list[int]]:
    """Group the row positions of a column by the CURIE that appears in each row.

    :param df: A dataframe or series. If a dataframe is given, the ``column`` must not
        be none.
    :param column: The column to index, if a dataframe was passed. If a series was
        passed, this can be left as none.

    :returns: A dictionary from each CURIE to the list of zero-based positions (in
        iteration order, not pandas index labels) at which it appears
    """
    positions_by_curie: dict[str, list[int]] = {}
    for position, curie in enumerate(_get_series(df, column)):
        positions_by_curie.setdefault(curie, []).append(position)
    return positions_by_curie




[docs]
def filter_df_by_curies(
    df: pd.DataFrame,
    *,
    column: str | int,
    curies: str | Collection[str],
) -> pd.DataFrame:
    """Keep the rows of a dataframe whose value in a given column is one of the given CURIEs.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param curies: The CURIE (given as a string) or collection of CURIEs (given as a
        list, set, etc.) to keep

    :returns: The result of indexing ``df`` with the boolean index of matching rows

    Example usage:

    .. code-block:: python

        import pandas as pd
        from curies.dataframe import filter_df_by_curies

        rows = [
            ("DOID:0080795", "skos:exactMatch", "EFO:0003029", "semapv:ManualMappingCuration"),
            ("DOID:0080795", "skos:exactMatch", "mesh:D015471", "semapv:ManualMappingCuration"),
            ("DOID:0080799", "skos:exactMatch", "EFO:1000527", "semapv:ManualMappingCuration"),
            (
                "DOID:0080808",
                "skos:exactMatch",
                "mesh:D000069295",
                "semapv:ManualMappingCuration",
            ),
        ]
        df = pd.DataFrame(
            rows, columns=["subject_id", "predicate_id", "object_id", "mapping_justification"]
        )
        filtered_df = filter_df_by_curies(df, column="subject_id", curies=["DOID:0080795"])

    This results in the following dataframe:

    ============ =============== ============ ============================
    subject_id   predicate_id    object_id    mapping_justification
    ============ =============== ============ ============================
    DOID:0080795 skos:exactMatch EFO:0003029  semapv:ManualMappingCuration
    DOID:0080795 skos:exactMatch mesh:D015471 semapv:ManualMappingCuration
    ============ =============== ============ ============================

    Internally, this function uses :func:`get_filter_df_by_curies_index`.
    """
    return df[get_filter_df_by_curies_index(df, column=column, curies=curies)]




[docs]
def get_df_prefixes_index(
    df: DataframeOrSeries,
    *,
    column: str | int | None = None,
    converter: Converter | None = None,
    validate: bool = False,
) -> dict[str, list[int]]:
    """Group the row positions of a column by the prefix of the CURIE in each row.

    :param df: A dataframe or series. If a dataframe is given, the ``column`` must not
        be none.
    :param column: The column to index, if a dataframe was passed. If a series was
        passed, this can be left as none.
    :param converter: A converter for validating CURIEs
    :param validate: Should the prefixes be validated against the converter?

    :returns: A dictionary from each prefix to the list of zero-based positions (in
        iteration order, not pandas index labels) at which it appears
    """
    # The parser is built before the series is selected, so a missing converter is
    # reported before any column/dtype problem.
    parse_prefix = _get_curie_parser(converter=converter, validate=validate)
    prefixes = _get_series(df, column).map(parse_prefix)

    positions_by_prefix: dict[str, list[int]] = {}
    for position, prefix in enumerate(prefixes):
        positions_by_prefix.setdefault(prefix, []).append(position)
    return positions_by_prefix




[docs]
def get_df_unique_prefixes(
    df: DataframeOrSeries,
    *,
    column: str | int | None = None,
    validate: bool = False,
    converter: Converter | None = None,
) -> set[str]:
    """Collect the distinct prefixes of the CURIEs in a column.

    :param df: A dataframe or series. If a dataframe is given, the ``column`` must not
        be none.
    :param column: The column to check, if a dataframe was passed. If a series was
        passed, this can be left as none.
    :param validate: Should the prefixes be validated against the converter?
    :param converter: A converter for validating CURIEs

    :returns: A set of prefixes appearing in CURIEs in the given column

    Example usage:

    .. code-block:: python

        import pandas as pd
        from curies.dataframe import get_df_unique_prefixes

        rows = [
            ("DOID:0080795", "skos:exactMatch", "EFO:0003029", "semapv:ManualMappingCuration"),
            ("DOID:0080795", "skos:exactMatch", "mesh:D015471", "semapv:ManualMappingCuration"),
            ("DOID:0080799", "skos:exactMatch", "EFO:1000527", "semapv:ManualMappingCuration"),
            (
                "DOID:0080808",
                "skos:exactMatch",
                "mesh:D000069295",
                "semapv:ManualMappingCuration",
            ),
        ]
        df = pd.DataFrame(
            rows, columns=["subject_id", "predicate_id", "object_id", "mapping_justification"]
        )
        assert get_df_unique_prefixes(df, column="object_id") == {"EFO", "mesh"}
    """
    curies = _get_series(df, column)
    parse_prefix = _get_curie_parser(converter=converter, validate=validate)
    prefixes = curies.map(parse_prefix)
    return set(prefixes.unique())



def _disallowed_dtype(series: pd.Series[Any] | str) -> TypeGuard[pd.Series[str]]:
    import numpy as np
    import pandas

    if isinstance(series, str):
        return False

    # pandas 3.0 introduced a new datatype - this code implicitly
    # checks if we're using pandas 3.0 if pandas.StringDtype is available
    if (string_dtype := getattr(pandas, "StringDtype", None)) and isinstance(
        series.dtype, string_dtype
    ):
        return False

    return series.dtype != np.str_ and series.dtype != np.dtype("O")


def _get_series(df_or_series: DataframeOrSeries, column: str | int | None = None) -> pd.Series[str]:
    """Get the series of strings to operate on from a dataframe or a series.

    :param df_or_series: A series, which is returned as-is after its dtype is checked,
        or a dataframe, from which ``column`` is selected
    :param column: The column to select when a dataframe is given. Not used when a
        series is given.

    :returns: The series itself, or the selected column of the dataframe

    :raises TypeError: If the series (or selected column) has a disallowed dtype
    :raises ValueError: If a dataframe is given but ``column`` is none
    """
    import pandas as pd

    if not isinstance(df_or_series, pd.Series):
        if column is None:
            raise ValueError("must pass non-none column when using a dataframe directly")

        series = df_or_series[column]
        if not _disallowed_dtype(series):
            return cast("pd.Series[str]", series)
        raise TypeError(
            f"passed series that does not have strings: {series.dtype=} {type(series.dtype)=}\n\n{series}"
        )

    if not _disallowed_dtype(df_or_series):
        return df_or_series  # ty:ignore
    raise TypeError(
        f"passed series that does not have strings: {df_or_series.dtype=} {type(df_or_series.dtype)=}\n\n{df_or_series}"
    )