# Source code for curies.triples.filters

"""Filter functions for triples."""

from __future__ import annotations

import logging
from collections.abc import Collection, Iterable

from .model import Triple, TriplePredicate, TripleType
from ..api import Converter, Reference

__all__ = [
    "exclude_object_prefixes",
    "exclude_prefixes_both",
    "exclude_references_both",
    "exclude_same_prefixes",
    "exclude_subject_prefixes",
    "exclude_triples",
    "keep_object_prefixes",
    "keep_predicates",
    "keep_prefixes_both",
    "keep_prefixes_either",
    "keep_references_both",
    "keep_references_either",
    "keep_subject_prefixes",
    "keep_triples_by_hash",
]

logger = logging.getLogger(__name__)


def _filter(
    func: TriplePredicate[TripleType], triples: Iterable[TripleType], progress: bool = False
) -> Iterable[TripleType]:
    """Lazily apply a triple predicate, optionally showing a progress bar.

    :param func: A predicate that returns true for the triples that should be kept
    :param triples: An iterable of triples
    :param progress: Should a progress bar be shown? If ``tqdm`` can't be imported,
        a warning is logged and the triples are filtered without a progress bar.
    :returns: A lazy iterable over the triples for which the predicate is true
    """
    stream = triples
    if progress:  # pragma: no cover
        try:
            # imported lazily so tqdm is only needed when a progress bar is requested
            from tqdm import tqdm
        except ImportError:
            logger.warning("tqdm is not installed, can't use progress bar for %s", func)
        else:
            stream = tqdm(triples, unit="triples", unit_scale=True)
    return filter(func, stream)


def keep_prefixes_both(
    triples: Iterable[TripleType], prefixes: Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Keep triples where the subject's and the object's prefixes are both allowed.

    :param triples: An iterable of triples
    :param prefixes: Two or more prefixes; a triple is kept only if both its subject's
        prefix and its object's prefix appear among them
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subjects' and objects' prefixes are both
        in the given prefixes
    :raises ValueError: If fewer than two distinct prefixes are given

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(keep_prefixes_both([m1, m2, m3], {"DOID", "mesh"})) == [m1]
    """
    both_predicate = _keep_prefixes_both_filter(prefixes)
    return _filter(both_predicate, triples, progress=progress)
def _keep_prefixes_both_filter(prefixes: Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's subject and object prefixes are both allowed.

    :param prefixes: The allowed prefixes. At least two distinct prefixes are required.
    :returns: A predicate that returns true for a triple whose subject's prefix and
        object's prefix are both among the given prefixes
    :raises ValueError: If fewer than two distinct prefixes are given, including when
        a single bare string is passed
    """
    # A bare string is itself an iterable of characters, so calling set() on it would
    # silently produce a set of single letters. Treat it as one prefix instead, which
    # then fails the size check below rather than filtering on the wrong values.
    allowed = {prefixes} if isinstance(prefixes, str) else set(prefixes)
    if len(allowed) < 2:
        raise ValueError("two or more prefixes are required")

    def _func(triple: TripleType) -> bool:
        return triple.subject.prefix in allowed and triple.object.prefix in allowed

    return _func
def keep_prefixes_either(
    triples: Iterable[TripleType], prefixes: str | Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Keep triples whose subject's or object's prefix is in the given prefixes.

    :param triples: An iterable of triples
    :param prefixes: A single prefix or an iterable of prefixes to use for filtering
        the triples
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples where at least one of the subject's prefix and
        the object's prefix is in the given prefixes

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(keep_prefixes_either([m1, m2, m3], {"DOID", "mesh"})) == [m1, m2, m3]
    >>> assert list(keep_prefixes_either([m1, m2, m3], {"umls"})) == [m2, m3]
    >>> assert list(keep_prefixes_either([m1, m2, m3], "DOID")) == [m1, m3]
    """
    return _filter(_keep_prefixes_either_filter(prefixes), triples, progress=progress)
def _keep_prefixes_either_filter(prefixes: str | Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's subject or object prefix is allowed.

    :param prefixes: A single prefix, compared by equality, or an iterable of prefixes,
        which is collected into a set for membership checks
    :returns: A predicate that returns true for a triple where the subject's prefix
        or the object's prefix matches
    """
    if isinstance(prefixes, str):
        # a bare string is one prefix, not an iterable of one-character prefixes

        def _func(triple: TripleType) -> bool:
            return triple.subject.prefix == prefixes or triple.object.prefix == prefixes

    else:
        prefixes = set(prefixes)

        def _func(triple: TripleType) -> bool:
            return triple.subject.prefix in prefixes or triple.object.prefix in prefixes

    return _func
def keep_subject_prefixes(
    triples: Iterable[TripleType], prefixes: str | Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Keep triples whose subject's prefix is one of the given prefixes.

    :param triples: An iterable of triples
    :param prefixes: A single prefix or an iterable of prefixes that the subjects'
        prefixes are checked against
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subjects' prefixes are in the given prefixes

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(keep_subject_prefixes([m1, m2, m3], {"DOID"})) == [m1, m3]
    """
    subject_predicate = _keep_subject_prefixes_filter(prefixes)
    return _filter(subject_predicate, triples, progress=progress)
def _keep_subject_prefixes_filter(prefixes: str | Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's subject prefix is allowed.

    :param prefixes: A single prefix, compared by equality, or an iterable of prefixes,
        which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(prefixes, str):

        def _subject_is(triple: TripleType) -> bool:
            return triple.subject.prefix == prefixes

        return _subject_is

    allowed = frozenset(prefixes)

    def _subject_in(triple: TripleType) -> bool:
        return triple.subject.prefix in allowed

    return _subject_in
def keep_object_prefixes(
    triples: Iterable[TripleType], prefixes: str | Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Keep triples whose object's prefix is one of the given prefixes.

    :param triples: An iterable of triples
    :param prefixes: A single prefix or an iterable of prefixes that the objects'
        prefixes are checked against
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose objects' prefixes are in the given prefixes

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(keep_object_prefixes([m1, m2, m3], {"umls"})) == [m2, m3]
    """
    object_predicate = _keep_object_prefixes_filter(prefixes)
    return _filter(object_predicate, triples, progress=progress)
def _keep_object_prefixes_filter(prefixes: str | Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's object prefix is allowed.

    :param prefixes: A single prefix, compared by equality, or an iterable of prefixes,
        which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(prefixes, str):

        def _object_is(triple: TripleType) -> bool:
            return triple.object.prefix == prefixes

        return _object_is

    allowed = frozenset(prefixes)

    def _object_in(triple: TripleType) -> bool:
        return triple.object.prefix in allowed

    return _object_in
def exclude_prefixes_both(
    triples: Iterable[TripleType], prefixes: str | Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Exclude triples whose subject's or object's prefix is in the given prefixes.

    A triple is kept only when neither its subject's prefix nor its object's prefix
    is among the given prefixes.

    :param triples: An iterable of triples
    :param prefixes: A single prefix or an iterable of prefixes to exclude
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subjects' and objects' prefixes are both
        not in the given prefixes

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(exclude_prefixes_both([m1, m2, m3], {"umls"})) == [m1]
    >>> assert list(exclude_prefixes_both([m1, m2, m3], {"DOID"})) == [m2]
    >>> assert list(exclude_prefixes_both([m1, m2, m3], {"mesh"})) == [m3]
    """
    neither_predicate = _exclude_prefixes_filter(prefixes)
    return _filter(neither_predicate, triples, progress=progress)
def _exclude_prefixes_filter(prefixes: str | Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when neither end of a triple uses an excluded prefix.

    :param prefixes: A single prefix, compared by equality, or an iterable of prefixes,
        which is collected once for membership checks
    :returns: A predicate that returns true for a triple whose subject's prefix and
        object's prefix are both outside the given prefixes
    """
    if isinstance(prefixes, str):

        def _neither_is(triple: TripleType) -> bool:
            return triple.subject.prefix != prefixes and triple.object.prefix != prefixes

        return _neither_is

    blocked = frozenset(prefixes)

    def _neither_in(triple: TripleType) -> bool:
        return triple.subject.prefix not in blocked and triple.object.prefix not in blocked

    return _neither_in
def exclude_subject_prefixes(
    triples: Iterable[TripleType], prefixes: str | Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Exclude triples whose subject's prefix is one of the given prefixes.

    :param triples: An iterable of triples
    :param prefixes: A single prefix or an iterable of prefixes that the subjects'
        prefixes are checked against
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subjects' prefixes are not in the given
        prefixes

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(exclude_subject_prefixes([m1, m2, m3], {"DOID"})) == [m2]
    >>> assert list(exclude_subject_prefixes([m1, m2, m3], {"umls"})) == [m1, m2, m3]
    >>> assert list(exclude_subject_prefixes([m1, m2, m3], {"mesh"})) == [m1, m3]
    """
    subject_predicate = _exclude_subject_prefixes_filter(prefixes)
    return _filter(subject_predicate, triples, progress=progress)
def _exclude_subject_prefixes_filter(prefixes: str | Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's subject prefix is not excluded.

    :param prefixes: A single prefix, compared by equality, or an iterable of prefixes,
        which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(prefixes, str):

        def _subject_is_not(triple: TripleType) -> bool:
            return triple.subject.prefix != prefixes

        return _subject_is_not

    blocked = frozenset(prefixes)

    def _subject_not_in(triple: TripleType) -> bool:
        return triple.subject.prefix not in blocked

    return _subject_not_in
def exclude_object_prefixes(
    triples: Iterable[TripleType], prefixes: str | Iterable[str], *, progress: bool = False
) -> Iterable[TripleType]:
    """Exclude triples whose object's prefix is one of the given prefixes.

    :param triples: An iterable of triples
    :param prefixes: A single prefix or an iterable of prefixes that the objects'
        prefixes are checked against
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose objects' prefixes are not in the given
        prefixes

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "umls:C4551571"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, exact_match.curie, c3)
    >>> assert list(exclude_object_prefixes([m1, m2, m3], {"umls"})) == [m1]
    >>> assert list(exclude_object_prefixes([m1, m2, m3], {"mesh"})) == [m2, m3]
    >>> assert list(exclude_object_prefixes([m1, m2, m3], {"DOID"})) == [m1, m2, m3]
    """
    object_predicate = _exclude_object_prefixes_filter(prefixes)
    return _filter(object_predicate, triples, progress=progress)
def _exclude_object_prefixes_filter(prefixes: str | Iterable[str]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's object prefix is not excluded.

    :param prefixes: A single prefix, compared by equality, or an iterable of prefixes,
        which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(prefixes, str):

        def _object_is_not(triple: TripleType) -> bool:
            return triple.object.prefix != prefixes

        return _object_is_not

    blocked = frozenset(prefixes)

    def _object_not_in(triple: TripleType) -> bool:
        return triple.object.prefix not in blocked

    return _object_not_in
def exclude_same_prefixes(
    triples: Iterable[TripleType], *, progress: bool = False
) -> Iterable[TripleType]:
    """Exclude triples whose subject and object share the same prefix.

    :param triples: An iterable of triples
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subject's prefix differs from its
        object's prefix

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> assert list(exclude_same_prefixes([m1, m2, m3])) == [m1, m2]
    """
    differing_prefixes = _same_prefix_filter
    return _filter(differing_prefixes, triples, progress=progress)
def _same_prefix_filter(triple: TripleType) -> bool:
    """Return true if the triple's subject and object have different prefixes.

    Despite the name, this is the *keep* condition used to drop same-prefix triples.

    :param triple: A triple
    :returns: Whether the subject's prefix differs from the object's prefix
    """
    subject_prefix = triple.subject.prefix
    object_prefix = triple.object.prefix
    return subject_prefix != object_prefix
def keep_triples_by_hash(
    triples: Iterable[TripleType],
    converter: Converter,
    triple_hashes: str | Iterable[str],
    *,
    progress: bool = False,
) -> Iterable[TripleType]:
    """Keep triples whose hash (under the given converter) is one of the given hashes.

    :param triples: An iterable of triples
    :param converter: A converter, used to compute the hash of each triple
    :param triple_hashes: A single hash or an iterable of hashes to check triples against
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose hash under the given converter appears
        in the given hashes

    >>> from curies import Triple, Converter
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> converter = Converter.from_prefix_map(
    ...     {
    ...         "DOID": "http://purl.obolibrary.org/obo/DOID_",
    ...         "skos": "http://www.w3.org/2004/02/skos/core#",
    ...         "mesh": "http://id.nlm.nih.gov/mesh/",
    ...         "umls": "https://uts.nlm.nih.gov/uts/umls/concept/",
    ...     }
    ... )
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> m1_hash = "081f943d3791dae3a85f8eb9190fee3fbdc47ba269a374e4a0a28a2b0b982625"
    >>> assert list(keep_triples_by_hash([m1, m2, m3], converter, m1_hash)) == [m1]
    """
    hash_predicate = _triple_has_hash(converter, triple_hashes)
    return _filter(hash_predicate, triples, progress=progress)
def _triple_has_hash(
    converter: Converter, triple_hashes: str | Iterable[str]
) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's hash is one of the given hashes.

    :param converter: A converter whose ``hash_triple`` method is called on each triple
    :param triple_hashes: A single hash, compared by equality, or an iterable of hashes,
        which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(triple_hashes, str):

        def _hash_is(triple: TripleType) -> bool:
            return converter.hash_triple(triple) == triple_hashes

        return _hash_is

    wanted = frozenset(triple_hashes)

    def _hash_in(triple: TripleType) -> bool:
        return converter.hash_triple(triple) in wanted

    return _hash_in
def exclude_triples(
    triples: Iterable[TripleType],
    exclusion: TripleType | Collection[TripleType],
    *,
    progress: bool = False,
) -> Iterable[TripleType]:
    """Exclude the given triple or triples.

    :param triples: An iterable of triples
    :param exclusion: A triple or collection of triples to exclude
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples that are not among the excluded triples

    >>> from curies import Triple
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> assert list(exclude_triples([m1, m2, m3], m3)) == [m1, m2]
    """
    not_excluded = _exclude_triples(exclusion)
    return _filter(not_excluded, triples, progress=progress)
def _exclude_triples(
    exclusion_triples: TripleType | Iterable[TripleType],
) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple is not one of the excluded triples.

    :param exclusion_triples: A single triple or an iterable of triples to exclude;
        either way they are collected into a set for membership checks
    :returns: A predicate over triples
    """
    single = isinstance(exclusion_triples, Triple)
    # a lone triple is wrapped rather than iterated
    excluded = {exclusion_triples} if single else set(exclusion_triples)

    def _not_excluded(triple: TripleType) -> bool:
        return triple not in excluded

    return _not_excluded
def keep_references_either(
    triples: Iterable[TripleType],
    references: Reference | Collection[Reference],
    *,
    progress: bool = False,
) -> Iterable[TripleType]:
    """Keep triples whose subject or object appears in the given references.

    :param triples: An iterable of triples
    :param references: A reference or a collection of references
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples where at least one of the subject and the
        object appears in the given references

    >>> from curies import Reference, Triple
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> r1, r2, r3 = (Reference.from_curie(c) for c in (c1, c2, c3))
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> assert list(keep_references_either([m1, m2, m3], [r2, r1])) == [m1, m2, m3]
    >>> assert list(keep_references_either([m1, m2, m3], [r2])) == [m1, m2]
    >>> assert list(keep_references_either([m1, m2, m3], r3)) == [m2, m3]
    """
    return _filter(_include_references_either(references), triples, progress=progress)
def _include_references_either(
    references: Reference | Collection[Reference],
) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's subject or object is a given reference.

    :param references: A single reference, compared by equality, or a collection of
        references, which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(references, Reference):

        def _either_is(triple: TripleType) -> bool:
            return triple.subject == references or triple.object == references

        return _either_is

    wanted = frozenset(references)

    def _either_in(triple: TripleType) -> bool:
        return triple.subject in wanted or triple.object in wanted

    return _either_in
def keep_references_both(
    triples: Iterable[TripleType], references: Collection[Reference], *, progress: bool = False
) -> Iterable[TripleType]:
    """Keep triples whose subject and object both appear in the given references.

    :param triples: An iterable of triples
    :param references: A collection of two or more references
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subject and object both appear in the
        given references
    :raises ValueError: If fewer than two distinct references are given

    >>> from curies import Reference, Triple
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> r1, r2, r3 = (Reference.from_curie(c) for c in (c1, c2, c3))
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> assert list(keep_references_both([m1, m2, m3], [r2, r1])) == [m1]
    """
    both_predicate = _include_references_both(references)
    return _filter(both_predicate, triples, progress=progress)
def _include_references_both(references: Collection[Reference]) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's subject and object are both given references.

    :param references: A collection of references; at least two distinct ones are required
    :returns: A predicate over triples
    :raises ValueError: If fewer than two distinct references are given
    """
    wanted = frozenset(references)
    if len(wanted) <= 1:
        raise ValueError("two or more references are required")

    def _both_in(triple: TripleType) -> bool:
        return triple.subject in wanted and triple.object in wanted

    return _both_in
def exclude_references_both(
    triples: Iterable[TripleType],
    references: Reference | Collection[Reference],
    *,
    progress: bool = False,
) -> Iterable[TripleType]:
    """Exclude triples whose subject or object appears in the given references.

    A triple is kept only when neither its subject nor its object is among the
    given references.

    :param triples: An iterable of triples
    :param references: A reference or a collection of references
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose subject and object both don't appear in
        the given references

    >>> from curies import Reference, Triple
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> r1, r2, r3 = (Reference.from_curie(c) for c in (c1, c2, c3))
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> assert list(exclude_references_both([m1, m2, m3], [r1])) == [m2]
    >>> assert list(exclude_references_both([m1, m2, m3], [r2])) == [m3]
    >>> assert list(exclude_references_both([m1, m2, m3], [r3])) == [m1]
    """
    neither_predicate = _exclude_references(references)
    return _filter(neither_predicate, triples, progress=progress)
def _exclude_references(
    references: Reference | Collection[Reference],
) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when neither end of a triple is an excluded reference.

    :param references: A single reference or a collection of references; either way
        they are collected into a set for membership checks
    :returns: A predicate that returns true for a triple whose subject and object are
        both outside the given references
    """
    single = isinstance(references, Reference)
    # a lone reference is wrapped rather than iterated
    blocked = {references} if single else set(references)

    def _neither_in(triple: TripleType) -> bool:
        return triple.subject not in blocked and triple.object not in blocked

    return _neither_in
def keep_predicates(
    triples: Iterable[TripleType],
    predicates: Reference | Collection[Reference],
    *,
    progress: bool = False,
) -> Iterable[TripleType]:
    """Keep triples whose predicate appears in the given references.

    :param triples: An iterable of triples
    :param predicates: A reference or a collection of references that the triples'
        predicates are checked against
    :param progress: Should a progress bar be shown?
    :returns: A sub-iterable of triples whose predicate appears in the given references

    >>> from curies import Reference, Triple
    >>> from curies.vocabulary import exact_match, subclass_of
    >>> c1, c2, c3 = "DOID:0050577", "mesh:C562966", "DOID:225"
    >>> r1, r2, r3 = (Reference.from_curie(c) for c in (c1, c2, c3))
    >>> m1 = Triple.from_curies(c1, exact_match.curie, c2)
    >>> m2 = Triple.from_curies(c2, exact_match.curie, c3)
    >>> m3 = Triple.from_curies(c1, subclass_of.curie, c3)
    >>> assert list(keep_predicates([m1, m2, m3], exact_match)) == [m1, m2]
    """
    predicate_check = _include_predicates(predicates)
    return _filter(predicate_check, triples, progress=progress)
def _include_predicates(
    predicates: Reference | Collection[Reference],
) -> TriplePredicate[TripleType]:
    """Build a predicate that is true when a triple's predicate is one of the given references.

    :param predicates: A single reference, compared by equality, or a collection of
        references, which is collected once for membership checks
    :returns: A predicate over triples
    """
    if isinstance(predicates, Reference):

        def _predicate_is(triple: TripleType) -> bool:
            return triple.predicate == predicates

        return _predicate_is

    wanted = frozenset(predicates)

    def _predicate_in(triple: TripleType) -> bool:
        return triple.predicate in wanted

    return _predicate_in