"""Data structures and algorithms for :mod:`curies`."""
from __future__ import annotations
import csv
import itertools as itt
import json
import logging
from collections import UserDict, defaultdict
from collections.abc import Callable, Collection, Iterable, Iterator, Mapping, Sequence
from functools import partial
from pathlib import Path
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Annotated,
Any,
Literal,
NamedTuple,
TypeAlias,
TypeVar,
cast,
overload,
)
from pydantic import (
BaseModel,
ConfigDict,
Field,
GetCoreSchemaHandler,
RootModel,
field_validator,
model_validator,
)
from pydantic_core import core_schema
from typing_extensions import Self
from .utils import NoCURIEDelimiterError, _split
if TYPE_CHECKING: # pragma: no cover
import pandas
import rdflib
from .triples import Triple
__all__ = [
"Converter",
"DuplicatePrefixes",
"DuplicateURIPrefixes",
"DuplicateValueError",
"NamableReference",
"NamedReference",
"NoCURIEDelimiterError",
"Prefix",
"PrefixMap",
"Record",
"Records",
"Reference",
"ReferenceTuple",
"Trie",
"TrieNode",
"chain",
"load_extended_prefix_map",
"load_jsonld_context",
"load_prefix_map",
"load_shacl",
"upgrade_prefix_map",
"write_extended_prefix_map",
"write_jsonld_context",
"write_shacl",
"write_tsv",
]
logger = logging.getLogger(__name__)
X = TypeVar("X")
LocationOr: TypeAlias = str | Path | X
RETURN_NONE_ERROR_TEXT = (
"Converter.parse_uri stopped returning ``(None, None)`` in curies v0.11.0. "
"``return_none`` is now a no-op argument (i.e., you shouldn't pass it "
"anymore) and the function returns ``None`` when parsing fails in non-strict mode"
)
def _get_field_validator_values(values: Any, key: str) -> str:
"""Get the value for the key from a field validator object."""
return cast(str, values.data[key])
[docs]
class ReferenceTuple(NamedTuple):
"""A pair of a prefix (corresponding to a semantic space) and a local unique identifier in that semantic space.
This class derives from the "named tuple" which means that it acts like a tuple in
most senses - it can be hashed and unpacked like most other tuples. Underneath, it
has a C implementation and is very efficient.
A reference tuple can be constructed two ways:
>>> ReferenceTuple("chebi", "1234")
ReferenceTuple(prefix='chebi', identifier='1234')
>>> ReferenceTuple.from_curie("chebi:1234")
ReferenceTuple(prefix='chebi', identifier='1234')
A reference tuple can be formatted as a CURIE string with the ``curie`` attribute
>>> ReferenceTuple.from_curie("chebi:1234").curie
'chebi:1234'
Reference tuples can be sliced like regular 2-tuples
>>> t = ReferenceTuple.from_curie("chebi:1234")
>>> t[0]
'chebi'
>>> t[1]
'1234'
Similarly, reference tuples can be unpacked like regular 2-tuples
>>> prefix, identifier = ReferenceTuple.from_curie("chebi:1234")
>>> prefix
'chebi'
>>> identifier
'1234'
Because they are named tuples, reference tuples can be accessed with attributes
>>> t = ReferenceTuple.from_curie("chebi:1234")
>>> t.prefix
'chebi'
>>> t.identifier
'1234'
"""
prefix: str
identifier: str
@property
def curie(self) -> str:
"""Get the reference as a CURIE string.
:returns: A string representation of a compact URI (CURIE).
>>> ReferenceTuple("chebi", "1234").curie
'chebi:1234'
"""
return f"{self.prefix}:{self.identifier}"
[docs]
@classmethod
def from_curie(cls, curie: str, *, sep: str = ":") -> Self:
"""Parse a CURIE string and populate a reference tuple.
:param curie: A string representation of a compact URI (CURIE)
:param sep: The separator
:returns: A reference tuple
>>> ReferenceTuple.from_curie("chebi:1234")
ReferenceTuple(prefix='chebi', identifier='1234')
"""
prefix, identifier = _split(curie, sep=sep)
return cls(prefix, identifier)
# docstr-coverage:excused `overload`
@overload
def to_pydantic(self, *, name: str = ...) -> NamedReference: ...
# docstr-coverage:excused `overload`
@overload
def to_pydantic(self, *, name: None = ...) -> Reference: ...
[docs]
def to_pydantic(self, *, name: str | None = None) -> Reference | NamedReference:
"""Get a Pydantic model."""
if name is None:
return Reference(prefix=Prefix(self.prefix), identifier=self.identifier)
if not name:
raise ValueError(
f"tried to construct a pydantic named reference with a missing name from {self.curie}"
)
return NamedReference(prefix=Prefix(self.prefix), identifier=self.identifier, name=name)
[docs]
class Prefix(str):
"""A string that is validated by Pydantic as a CURIE prefix.
This class is a subclass of Python's built-in string class, so you can wrap any
string with it:
.. code-block:: python
from curies import Prefix
prefix = Prefix("CHEBI")
You can implicitly type annotate data with this class:
.. code-block:: python
from curies import Prefix
prefix_map: dict[Prefix, str] = {
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
You can more explicitly type annotate data with this class using Pydantic's `root
model
<https://docs.pydantic.dev/2.3/usage/models/#rootmodel-and-custom-root-types>`_:
.. code-block:: python
from pydantic import RootModel
from curies import Prefix
PrefixMap = RootModel[dict[Prefix, str]]
prefix_map = PrefixMap.model_validate(
{
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
).root
This pattern is common enough that it's included in :class:`curies.PrefixMap`.
When used inside a Pydantic model, this class knows how to do validation that the
prefix matches the regular expression for an XSD NCName. Here's an example usage
with Pydantic:
.. code-block:: python
from curies import Prefix
from pydantic import BaseModel
class ResourceInfo(BaseModel):
prefix: Prefix
name: str
model = ResourceInfo.model_validate(
{
"prefix": "CHEBI",
"name": "Chemical Entities of Biological Interest",
}
)
# raises a pydantic.ValidationError, because the prefix
# doesn't match the NCName pattern
ResourceInfo.model_validate(
{
"prefix": "$nope",
"name": "An invalid semantic space!",
}
)
This class implements a hook that uses Pydantic's "context" for validation that lets
you pass a :class:`Converter` to check for existence and standardization with
respect to the context in the converter:
.. code-block:: python
from curies import Prefix, get_obo_converter
from pydantic import BaseModel
class ResourceInfo(BaseModel):
prefix: Prefix
name: str
converter = get_obo_converter()
model = ResourceInfo.model_validate(
{
"prefix": "CHEBI",
"name": "Chemical Entities of Biological Interest",
},
context=converter,
)
# raises a pydantic.ValidationError, because the prefix
# is not registered in the OBO Foundry, and is therefore
# not part of the OBO converter
ResourceInfo.model_validate(
{
"prefix": "efo",
"name": "Experimental Factor Ontology",
},
context=converter,
)
# In case you need to pass more arbitrary
# context, you can also use a dict with the key
# "converter"
ResourceInfo.model_validate(
{
"prefix": "CHEBI",
"name": "Chemical Entities of Biological Interest",
},
context={
"converter": converter,
},
)
.. warning::
Serialization with YAML with :mod:`yaml` might not work as expected for classes
containing :class:`Prefix` instances, since they are subclasses of strings. Do
something like this:
.. code-block:: python
def _reference_representer(
dumper: SafeRepresenter, data: curies.Prefix
) -> yaml.ScalarNode:
return dumper.represent_str(str(data))
# if you're using yaml.safe_dump()
yaml.add_representer(curies.Prefix, _reference_representer, Dumper=yaml.SafeDumper)
# if you're using yaml.dump()
yaml.add_representer(curies.Prefix, _reference_representer, Dumper=yaml.Dumper)
"""
@classmethod
def __get_pydantic_core_schema__(
cls, source: type[Any], handler: GetCoreSchemaHandler
) -> core_schema.AfterValidatorFunctionSchema:
return core_schema.with_info_after_validator_function(
cls._validate,
# TODO consider if we should use strict NCNAME pattern
# here like ^$|^[a-zA-Z_][\w.-]*$. See also
# https://cthoyt.com/2023/01/11/bioregistry-w3c-compliance.html
core_schema.str_schema(strict=False),
)
@classmethod
def _validate(cls, __input_value: str, info: core_schema.ValidationInfo) -> Self:
converter = _converter_from_validation_info(info)
if converter is None:
return cls(__input_value)
return cls(converter.standardize_prefix(__input_value, strict=True))
[docs]
class PrefixMap(RootModel[dict[Prefix, str]]):
"""A simple prefix map.
This can be used to validate dictionaries:
.. code-block:: python
from curies import PrefixMap
prefix_map_model = PrefixMap.model_validate(
{
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
)
# note that you have to unpack it
prefix_map_dict = prefix_map_model.root
Similarly, a prefix map can be used as part of another Pydantic model like in:
.. code-block:: python
from pydantic import BaseModel
from curies import PrefixMap
class RDFContent(BaseModel):
prefix_map: PrefixMap
triples: list[tuple[str, str, str]]
rdf_content = RDFContent.model_validate(
{
"prefix_map": {
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
},
"triples": [
("CHEBI:1234", "RO:0000001", "CHEBI:5678"),
],
}
)
# note that you have to unpack the resulting prefix map
prefix_map = rdf_content.prefix_map.root
"""
[docs]
class Reference(BaseModel):
"""A reference to an entity in a given identifier space.
This class uses Pydantic to make it easier to build other more complex data types
with Pydantic that also uses a first- class notion of parsed reference (instead of
merely stringified CURIEs). Instances of this class can also be hashed because of
the "frozen" configuration from Pydantic (see
https://docs.pydantic.dev/latest/usage/model_config/ for more details).
A reference can be constructed several ways:
>>> Reference(prefix="chebi", identifier="1234")
Reference(prefix='chebi', identifier='1234')
>>> Reference.from_curie("chebi:1234")
Reference(prefix='chebi', identifier='1234')
A reference can also be constructued using Pydantic's parsing utilities, but keep in
mind if you're using Pydantic v1 or Pydantic v2.
A reference can be formatted as a CURIE string with the ``curie`` attribute
>>> Reference.from_curie("chebi:1234").curie
'chebi:1234'
References can't be sliced like reference tuples, but they can still be accessed
through attributes
>>> t = Reference.from_curie("chebi:1234")
>>> t.prefix
'chebi'
>>> t.identifier
'1234'
If you need a performance gain, you can get a :class:`ReferenceTuple` using the
``pair`` attribute:
>>> reference = Reference.from_curie("chebi:1234")
>>> reference.pair
ReferenceTuple(prefix='chebi', identifier='1234')
"""
prefix: Annotated[
Prefix,
Field(
description="The prefix used in a compact URI (CURIE).",
),
]
identifier: Annotated[
str, Field(description="The local unique identifier used in a compact URI (CURIE).")
]
model_config = ConfigDict(frozen=True)
@model_validator(mode="before")
def _parse_from_string(cls, values: str | dict[str, str]) -> dict[str, str]: # noqa:N805
if isinstance(values, str):
prefix, identifier = _split(values)
return {"prefix": prefix, "identifier": identifier}
return values
def __lt__(self, other: Reference) -> bool:
"""Sort the reference lexically first by prefix, then by identifier."""
return self.pair < other.pair
def __hash__(self) -> int:
return hash((self.prefix, self.identifier))
def __eq__(self, other: Any) -> bool:
return (
isinstance(other, Reference)
and self.prefix == other.prefix
and self.identifier == other.identifier
)
def __composite_values__(self) -> tuple[str, str]:
"""Return values appropriate for :func:`sqlalchemy.orm.composite`."""
return self.prefix, self.identifier
@property
def curie(self) -> str:
"""Get the reference as a CURIE string.
:returns: A string representation of a compact URI (CURIE).
>>> Reference(prefix="chebi", identifier="1234").curie
'chebi:1234'
"""
return f"{self.prefix}:{self.identifier}"
@property
def pair(self) -> ReferenceTuple:
"""Get the reference as a 2-tuple of prefix and identifier."""
return ReferenceTuple(self.prefix, self.identifier)
# note that it's important that this is explicitly
# Reference and not Self, since all subclasses should
# return only a reference.
[docs]
def without_name(self) -> Reference:
"""Return this reference, since it already has no name."""
return self
[docs]
def with_name(self, name: str) -> NamableReference:
"""Return this reference, with a name."""
return NamedReference(prefix=self.prefix, identifier=self.identifier, name=name)
[docs]
@classmethod
def from_curie(cls, curie: str, *, sep: str = ":", converter: Converter | None = None) -> Self:
"""Parse a CURIE string and populate a reference.
:param curie: A string representation of a compact URI (CURIE)
:param sep: The separator
:param converter: The converter to use as context when parsing
:returns: A reference object
>>> Reference.from_curie("chebi:1234")
Reference(prefix='chebi', identifier='1234')
"""
prefix, identifier = _split(curie, sep=sep)
return cls.model_validate({"prefix": prefix, "identifier": identifier}, context=converter)
[docs]
@classmethod
def from_reference(
cls, reference: Reference | ReferenceTuple, *, converter: Converter | None = None
) -> Self:
"""Parse a CURIE string and populate a reference.
:param reference: A pre-parsed reference
:param converter: The converter to use as context when parsing
:returns: A reference object
"""
return cls.model_validate(
{"prefix": reference.prefix, "identifier": reference.identifier}, context=converter
)
[docs]
class NamableReference(Reference):
"""A reference, maybe with a name."""
name: Annotated[
str | None,
Field(
description="The name of the entity referenced by this object's prefix and identifier, if exists.",
),
] = None
model_config = ConfigDict(frozen=True)
[docs]
@classmethod
def from_curie(
cls,
curie: str,
name: str | None = None,
*,
sep: str = ":",
converter: Converter | None = None,
) -> Self:
"""Parse a CURIE string and populate a reference.
:param curie: A string representation of a compact URI (CURIE)
:param name: The optional name of the reference
:param sep: The separator
:param converter: The converter to use as context when parsing
:returns: A reference object
>>> NamableReference.from_curie("chebi:1234")
NamableReference(prefix='chebi', identifier='1234', name=None)
>>> NamableReference.from_curie("chebi:1234", "6-methoxy-2-octaprenyl-1,4-benzoquinone")
NamableReference(prefix='chebi', identifier='1234', name='6-methoxy-2-octaprenyl-1,4-benzoquinone')
"""
prefix, identifier = _split(curie, sep=sep)
return cls.model_validate(
{"prefix": prefix, "identifier": identifier, "name": name}, context=converter
)
[docs]
@classmethod
def from_reference(
cls, reference: Reference | ReferenceTuple, *, converter: Converter | None = None
) -> Self:
"""Parse a CURIE string and populate a reference.
:param reference: A pre-parsed reference
:param converter: The converter to use as context when parsing
:returns: A reference object
"""
name = reference.name if isinstance(reference, NamableReference) else None
return cls.model_validate(
{"prefix": reference.prefix, "identifier": reference.identifier, "name": name},
context=converter,
)
[docs]
def without_name(self) -> Reference:
"""Return this reference without a name."""
return Reference(prefix=self.prefix, identifier=self.identifier)
[docs]
def with_name(self, name: str) -> Self:
"""Return this reference, with a name."""
return self.model_copy(update={"name": name})
[docs]
class NamedReference(NamableReference):
"""A reference with a name."""
name: Annotated[
str,
Field(
description="The name of the entity referenced by this object's prefix and identifier."
),
]
model_config = ConfigDict(frozen=True)
[docs]
@classmethod
def from_curie( # type:ignore
cls, curie: str, name: str, *, sep: str = ":", converter: Converter | None = None
) -> Self:
"""Parse a CURIE string and populate a reference.
:param curie: A string representation of a compact URI (CURIE)
:param name: The name of the reference
:param sep: The separator
:param converter: The converter to use as context when parsing
:returns: A reference object
>>> NamedReference.from_curie("chebi:1234", "6-methoxy-2-octaprenyl-1,4-benzoquinone")
NamedReference(prefix='chebi', identifier='1234', name='6-methoxy-2-octaprenyl-1,4-benzoquinone')
"""
prefix, identifier = _split(curie, sep=sep)
return cls.model_validate(
{"prefix": prefix, "identifier": identifier, "name": name}, context=converter
)
[docs]
@classmethod
def from_reference(
cls, reference: Reference | ReferenceTuple, *, converter: Converter | None = None
) -> Self:
"""Parse a CURIE string and populate a reference.
:param reference: A pre-parsed reference
:param converter: The converter to use as context when parsing
:returns: A reference object
:raises TypeError: if a reference that has no name field is passed (e.g., a
vanilla :class:`curies.Reference`)
"""
if not isinstance(reference, NamableReference):
raise TypeError(
f"tried to construct a named reference from a non-named reference: {reference}"
)
return cls.model_validate(
{
"prefix": reference.prefix,
"identifier": reference.identifier,
"name": reference.name,
},
context=converter,
)
[docs]
def with_name(self, name: str) -> Self:
"""Return this reference, with a name."""
return self.model_copy(update={"name": name})
RecordKey = tuple[str, str, str, str]
[docs]
class Record(BaseModel):
"""A record of some prefixes and their associated URI prefixes.
.. seealso::
https://github.com/cthoyt/curies/issues/70
"""
prefix: Annotated[
str,
Field(
title="CURIE prefix",
description="The canonical CURIE prefix, used in the reverse prefix map",
),
]
uri_prefix: Annotated[
str,
Field(
title="URI prefix",
description="The canonical URI prefix, used in the forward prefix map",
),
]
prefix_synonyms: list[str] = Field(default_factory=list, title="CURIE prefix synonyms")
uri_prefix_synonyms: list[str] = Field(default_factory=list, title="URI prefix synonyms")
pattern: Annotated[
str | None,
Field(
description="The regular expression pattern for entries in this semantic space. "
"Warning: this is an experimental feature.",
),
] = None
[docs]
@field_validator("prefix_synonyms") # type:ignore
@classmethod
def prefix_not_in_synonyms(cls, v: str, values: Mapping[str, Any]) -> str:
"""Check that the canonical prefix does not apper in the prefix synonym list."""
prefix = _get_field_validator_values(values, "prefix")
if prefix in v:
raise ValueError(f"Duplicate of canonical prefix `{prefix}` in prefix synonyms")
return v
[docs]
@field_validator("uri_prefix_synonyms") # type:ignore
@classmethod
def uri_prefix_not_in_synonyms(cls, v: str, values: Mapping[str, Any]) -> str:
"""Check that the canonical URI prefix does not apper in the URI prefix synonym list."""
uri_prefix = _get_field_validator_values(values, "uri_prefix")
if uri_prefix in v:
raise ValueError(
f"Duplicate of canonical URI prefix `{uri_prefix}` in URI prefix synonyms"
)
return v
@property
def _all_prefixes(self) -> list[str]:
return [self.prefix, *self.prefix_synonyms]
@property
def _all_uri_prefixes(self) -> list[str]:
return [self.uri_prefix, *self.uri_prefix_synonyms]
@property
def _key(self) -> RecordKey:
"""Get a hashable key."""
return (
self.prefix,
self.uri_prefix,
",".join(sorted(self.prefix_synonyms)),
",".join(sorted(self.uri_prefix_synonyms)),
)
# An explanation of RootModels in Pydantic V2 can be found on
# https://docs.pydantic.dev/latest/concepts/models/#rootmodel-and-custom-root-types
[docs]
class Records(RootModel[list[Record]]):
"""A list of records."""
def __iter__(self) -> Iterator[Record]: # type:ignore[override]
"""Iterate over records."""
return iter(self.root)
class DuplicateSummary(NamedTuple):
"""A triple representing two records that are duplicated, either based on a CURIE or URI prefix."""
record_1: Record
record_2: Record
prefix: str
[docs]
class DuplicateValueError(ValueError):
"""An error raised with constructing a converter with data containing duplicate values."""
def __init__(self, duplicates: list[DuplicateSummary]) -> None:
"""Initialize the error."""
self.duplicates = duplicates
def _str(self) -> str:
rv = ""
for duplicate in self.duplicates:
rv += f"\n{duplicate.prefix}:\n\t{duplicate.record_1}\n\t{duplicate.record_2}\n"
return rv
[docs]
class DuplicateURIPrefixes(DuplicateValueError):
"""An error raised with constructing a converter with data containing duplicate URI prefixes."""
def __str__(self) -> str:
return f"Duplicate URI prefixes:\n{self._str()}"
[docs]
class DuplicatePrefixes(DuplicateValueError):
"""An error raised with constructing a converter with data containing duplicate prefixes."""
def __str__(self) -> str:
return f"Duplicate prefixes:\n{self._str()}"
class ConversionError(ValueError):
"""An error raised on conversion."""
class ExpansionError(ConversionError):
"""An error raised on expansion if the prefix can't be looked up."""
class CompressionError(ConversionError):
"""An error raised on expansion if the URI prefix can't be matched."""
class StandardizationError(ValueError):
"""An error raised on standardization."""
class PrefixStandardizationError(StandardizationError):
"""An error raise when a prefix can't be standardized."""
class IdentifierStandardizationError(StandardizationError):
"""An error raise when an identifier can't be standardized."""
class CURIEStandardizationError(StandardizationError):
"""An error raise when a CURIE can't be standardized."""
class URIStandardizationError(StandardizationError):
"""An error raise when a URI can't be standardized."""
def _get_duplicate_uri_prefixes(records: list[Record]) -> list[DuplicateSummary]:
return [
DuplicateSummary(record_1, record_2, uri_prefix)
for record_1, record_2 in itt.combinations(records, 2)
for uri_prefix, up2 in itt.product(record_1._all_uri_prefixes, record_2._all_uri_prefixes)
if uri_prefix == up2
]
def _get_duplicate_prefixes(records: list[Record]) -> list[DuplicateSummary]:
return [
DuplicateSummary(record_1, record_2, prefix)
for record_1, record_2 in itt.combinations(records, 2)
for prefix, p2 in itt.product(record_1._all_prefixes, record_2._all_prefixes)
if prefix == p2
]
def _prepare(data: LocationOr[X]) -> X:
if isinstance(data, Path):
with data.open() as file:
return cast(X, json.load(file))
elif isinstance(data, str):
if any(data.startswith(p) for p in ("https://", "http://", "ftp://")):
return cast(X, _get_remote_json(data))
with open(data) as file:
return cast(X, json.load(file))
else:
return data
def _get_remote_json(url: str, timeout: int = 15) -> Any:
import urllib.request
with urllib.request.urlopen(url, timeout=timeout) as response: # noqa:S310
json_str_data = response.read().decode()
return json.loads(json_str_data)
[docs]
class Converter:
"""A cached prefix map data structure.
.. code-block::
# Construct a prefix map:
>>> converter = Converter.from_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
... "GO": "http://purl.obolibrary.org/obo/GO_",
... "OBO": "http://purl.obolibrary.org/obo/",
... }
... )
# Compression and Expansion:
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_1")
'CHEBI:1'
>>> converter.expand("CHEBI:1")
'http://purl.obolibrary.org/obo/CHEBI_1'
# Example with unparsable URI:
>>> converter.compress("http://example.com/missing:0000000")
# Example with missing prefix:
>>> converter.expand("missing:0000000")
"""
#: A prefix trie for efficient parsing of URIs
trie: Trie
#: A mapping from prefix to regular expression pattern. Not necessarily complete wrt the prefix map.
#:
#: .. warning:: patterns are an experimental feature
pattern_map: dict[str, str]
#: The list of records. Don't modify this directly
records: list[Record]
def __init__(
self, records: Iterable[Record] | None = None, *, delimiter: str = ":", strict: bool = True
) -> None:
"""Instantiate a converter.
:param records: A list of records. If you plan to build a converter
incrementally, pass an empty list or leave this blank.
:param strict: If true, raises issues on duplicate URI prefixes
:param delimiter: The delimiter used for CURIEs. Defaults to a colon.
:raises DuplicatePrefixes: if any records share any synonyms
:raises DuplicateURIPrefixes: if any records share any URI prefixes
"""
if records is None:
records = []
records = sorted(records, key=lambda r: r.prefix)
if strict:
duplicate_uri_prefixes = _get_duplicate_uri_prefixes(records)
if duplicate_uri_prefixes:
raise DuplicateURIPrefixes(duplicate_uri_prefixes)
duplicate_prefixes = _get_duplicate_prefixes(records)
if duplicate_prefixes:
raise DuplicatePrefixes(duplicate_prefixes)
self.delimiter = delimiter
self.records = records
self._prefix_to_record: dict[str, Record] = {}
self._prefix_ci_to_record: dict[str, Record] = {}
self._uri_prefix_to_record: dict[str, Record] = {}
self._uri_prefix_ci_to_record: dict[str, Record] = {}
self.trie = Trie()
self.pattern_map = {}
for record in records:
self._index(record)
@property
def prefix_map(self) -> Mapping[str, str]:
"""Get the non-URI-prefix-unique prefix map.."""
return {
prefix: record.uri_prefix for record in self.records for prefix in record._all_prefixes
}
@property
def reverse_prefix_map(self) -> Mapping[str, str]:
"""Get the non-URI-prefix-unique prefix map."""
return {
uri_prefix: record.prefix
for record in self.records
for uri_prefix in record._all_uri_prefixes
}
def __len__(self) -> int:
"""Count the number of records."""
return len(self.records)
def __iter__(self) -> Iterator[Record]:
"""Iterate over records."""
return iter(self.records)
@property
def bimap(self) -> Mapping[str, str]:
"""Get the bijective mapping between CURIE prefixes and URI prefixes."""
return {r.prefix: r.uri_prefix for r in self.records}
@property
def reverse_bimap(self) -> Mapping[str, str]:
"""Get the bijective mapping between URI CURIE prefixes and CURIE prefixes."""
return {r.uri_prefix: r.prefix for r in self.records}
def _match_record(
self, external: Record, case_sensitive: bool = True
) -> Mapping[str, list[str]]:
"""Match the given record to existing records."""
rv: defaultdict[str, list[str]] = defaultdict(list)
if case_sensitive:
if record := self._prefix_to_record.get(external.prefix):
rv[record.prefix].append(
f"primary prefix ({external.prefix}) match to {self._label(external.prefix == record.prefix)} prefix for {record.prefix}"
)
for prefix in external.prefix_synonyms:
if record := self._prefix_to_record.get(prefix):
rv[record.prefix].append(
f"secondary prefix ({prefix}) matched {self._label(prefix == record.prefix)} prefix for {record.prefix}"
)
if record := self._uri_prefix_to_record.get(external.uri_prefix):
rv[record.prefix].append(
f"primary URI prefix ({external.uri_prefix}) matched {self._label(record.uri_prefix == external.uri_prefix)} URI prefix for {record.prefix}"
)
for uri_prefix in external.uri_prefix_synonyms:
if record := self._uri_prefix_to_record.get(uri_prefix):
rv[record.prefix].append(
f"secondary URI prefix ({uri_prefix}) matched {self._label(record.uri_prefix == uri_prefix)} URI prefix for {record.prefix}"
)
else:
for prefix in external._all_prefixes:
if record := self._prefix_ci_to_record.get(prefix.casefold()):
rv[record.prefix].append("prefix case-insenstive match")
for uri_prefix in external._all_uri_prefixes:
if record := self._uri_prefix_ci_to_record.get(uri_prefix.casefold()):
rv[record.prefix].append("URI case-insenstive prefix match")
return dict(rv)
@staticmethod
def _label(x: bool) -> str:
return "primary" if x else "secondary"
[docs]
def add_record(
self, record: Record, *, case_sensitive: bool = True, merge: bool = False
) -> None:
"""Append a record to the converter."""
matched = self._match_record(record, case_sensitive=case_sensitive)
if len(matched) > 1:
msg = "".join(f"\n {m} -> {v}" for m, v in matched.items())
raise ValueError(f"new record has duplicates:{msg}")
if len(matched) == 1:
prefix, values = next(iter(matched.items()))
if not merge:
msg = "\n".join(f"- {v}" for v in values)
raise ValueError(
f"failed to add {record.prefix} because of overlaps. Full record:\n\n{record.model_dump_json(indent=2)}\n\nExplanation:\n{msg}\n"
)
existing_record = self._prefix_to_record[prefix]
self._merge(record, into=existing_record)
self._index(existing_record)
else:
# Append a new record
self.records.append(record)
self._index(record)
@staticmethod
def _merge(record: Record, into: Record) -> None:
xx = set(into._all_prefixes)
for prefix in record._all_prefixes:
if prefix not in xx:
into.prefix_synonyms.append(prefix)
into.prefix_synonyms.sort()
yy = set(into._all_uri_prefixes)
for uri_prefix in record._all_uri_prefixes:
if uri_prefix not in yy:
into.uri_prefix_synonyms.append(uri_prefix)
into.uri_prefix_synonyms.sort()
def _index(self, record: Record) -> None:
for prefix in record._all_prefixes:
self._index_prefix(prefix, record)
for uri_prefix in record._all_uri_prefixes:
self._index_uri_prefix(uri_prefix, record)
if record.pattern and record.prefix not in self.pattern_map:
self.pattern_map[record.prefix] = record.pattern
def _index_prefix(self, prefix: str, record: Record) -> None:
self._prefix_to_record[prefix] = record
self._prefix_ci_to_record[prefix.casefold()] = record
def _index_uri_prefix(self, uri_prefix: str, record: Record) -> None:
self._uri_prefix_to_record[uri_prefix] = record
self._uri_prefix_ci_to_record[uri_prefix.casefold()] = record
self.trie[uri_prefix] = record
[docs]
def add_prefix_synonym(
self, prefix: str, prefix_synonym: str, *, case_sensitive: bool = True
) -> None:
"""Add a prefix synonym to the record with the given prefix."""
if not case_sensitive:
raise NotImplementedError
try:
record = self._prefix_to_record[prefix]
except KeyError as e:
raise KeyError(
f"can not add prefix synoynm {prefix_synonym} to prefix {prefix} since {prefix} is not already indexed"
) from e
sr = self._prefix_to_record.get(prefix_synonym)
if sr is not None:
if sr.prefix != record.prefix:
raise ValueError(f"this prefix synonym is already taken by record for {sr.prefix}")
return None
self._index_prefix(prefix_synonym, record)
record.prefix_synonyms.append(prefix_synonym)
record.prefix_synonyms.sort()
[docs]
def add_uri_prefix_synonym(
self, prefix: str, uri_prefix_synonym: str, *, case_sensitive: bool = True
) -> None:
"""Add a URI synonym to the record with the given prefix."""
if not case_sensitive:
raise NotImplementedError
try:
record = self._prefix_to_record[prefix]
except KeyError as e:
raise KeyError(
f"can't add URI prefix synoynm {uri_prefix_synonym} to prefix {prefix} because "
f"{prefix} is not already indexed"
) from e
sr = self._uri_prefix_to_record.get(uri_prefix_synonym)
if sr is not None:
if sr.prefix != record.prefix:
raise ValueError(
f"can't add URI prefix synonym {uri_prefix_synonym} to {prefix} becauase "
f"it is already taken by record for {sr.prefix}"
)
return None
self._index_uri_prefix(uri_prefix_synonym, record)
record.uri_prefix_synonyms.append(uri_prefix_synonym)
record.uri_prefix_synonyms.sort()
[docs]
def add_prefix(
self,
prefix: str,
uri_prefix: str,
prefix_synonyms: Collection[str] | None = None,
uri_prefix_synonyms: Collection[str] | None = None,
*,
pattern: str | None = None,
case_sensitive: bool = True,
merge: bool = False,
) -> None:
"""Append a prefix to the converter.
:param prefix: The prefix to append, e.g., ``go``
:param uri_prefix: The URI prefix to append, e.g.,
``http://purl.obolibrary.org/obo/GO_``
:param prefix_synonyms: An optional collection of synonyms for the prefix such
as ``gomf``, ``gocc``, etc.
:param uri_prefix_synonyms: An optional collections of synonyms for the URI
prefix such as ``https://bioregistry.io/go:``,
``http://www.informatics.jax.org/searches/GO.cgi?id=GO:``, etc.
:param pattern: An optional pattern
:param case_sensitive: Should prefixes and URI prefixes be compared in a
case-sensitive manner when checking for uniqueness? Defaults to True.
:param merge: Should this record be merged into an existing record if it
uniquely maps to a single existing record? When false, will raise an error
if one or more existing records can be mapped. Defaults to false.
This can be used to add missing namespaces on-the-fly to an existing converter:
>>> import curies
>>> converter = curies.get_obo_converter()
>>> converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
>>> converter.expand("hgnc:1234")
'https://bioregistry.io/hgnc:1234'
>>> converter.expand("GO:0032571")
'http://purl.obolibrary.org/obo/GO_0032571'
This can also be used to incrementally build up a converter from scratch:
>>> import curies
>>> converter = curies.Converter(records=[])
>>> converter.add_prefix("hgnc", "https://bioregistry.io/hgnc:")
>>> converter.expand("hgnc:1234")
'https://bioregistry.io/hgnc:1234'
"""
record = Record(
prefix=prefix,
uri_prefix=uri_prefix,
prefix_synonyms=sorted(prefix_synonyms or []),
uri_prefix_synonyms=sorted(uri_prefix_synonyms or []),
pattern=pattern,
)
self.add_record(record, case_sensitive=case_sensitive, merge=merge)
[docs]
@classmethod
def from_extended_prefix_map(
cls, records: LocationOr[Iterable[Record | dict[str, Any]]], **kwargs: Any
) -> Converter:
"""Get a converter from a list of dictionaries by creating records out of them.
:param records: One of the following:
- An iterable of :class:`curies.Record` objects or dictionaries that will
get converted into record objects that together constitute an extended
prefix map
- A string containing a remote location of a JSON file containg an extended
prefix map
- A string or :class:`pathlib.Path` object corresponding to a local file
path to a JSON file containing an extended prefix map
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
An extended prefix map is a list of dictionaries containing four keys:
1. A ``prefix`` string
2. A ``uri_prefix`` string
3. An optional list of strings ``prefix_synonyms``
4. An optional list of strings ``uri_prefix_synonyms``
Across the whole list of dictionaries, there should be uniqueness within the
union of all ``prefix`` and ``prefix_synonyms`` as well as uniqueness within the
union of all ``uri_prefix`` and ``uri_prefix_synonyms``.
>>> epm = [
... {
... "prefix": "CHEBI",
... "prefix_synonyms": ["chebi", "ChEBI"],
... "uri_prefix": "http://purl.obolibrary.org/obo/CHEBI_",
... "uri_prefix_synonyms": [
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:"
... ],
... },
... {
... "prefix": "GO",
... "uri_prefix": "http://purl.obolibrary.org/obo/GO_",
... },
... ]
>>> converter = Converter.from_extended_prefix_map(epm)
Expand using the preferred/canonical prefix:
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
Expand using a prefix synonym:
>>> converter.expand("chebi:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
Compress using the preferred/canonical URI prefix:
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
Compressing using a URI prefix synonym:
>>> converter.compress("https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488")
'CHEBI:138488'
Example from a remote source:
>>> url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/contexts/bioregistry.epm.json"
>>> converter = Converter.from_extended_prefix_map(url)
"""
return cls(
records=[
record if isinstance(record, Record) else Record(**record)
for record in _prepare(records)
],
**kwargs,
)
[docs]
@classmethod
def from_priority_prefix_map(
cls, data: LocationOr[Mapping[str, list[str]]], **kwargs: Any
) -> Converter:
"""Get a converter from a priority prefix map.
:param data: A prefix map where the keys are prefixes (e.g., `chebi`) and the
values are lists of URI prefixes (e.g.,
``http://purl.obolibrary.org/obo/CHEBI_``) with the first element of the
list being the priority URI prefix for expansions.
:param kwargs: Keyword arguments to pass to the parent class's init
:returns: A converter
>>> priority_prefix_map = {
... "CHEBI": [
... "http://purl.obolibrary.org/obo/CHEBI_",
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:",
... ],
... "GO": ["http://purl.obolibrary.org/obo/GO_"],
... "obo": ["http://purl.obolibrary.org/obo/"],
... }
>>> converter = Converter.from_priority_prefix_map(priority_prefix_map)
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
>>> converter.compress("https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488")
'CHEBI:138488'
"""
return cls(
[
Record(
prefix=prefix, uri_prefix=uri_prefixes[0], uri_prefix_synonyms=uri_prefixes[1:]
)
for prefix, uri_prefixes in _prepare(data).items()
],
**kwargs,
)
[docs]
@classmethod
def from_prefix_map(cls, prefix_map: LocationOr[Mapping[str, str]], **kwargs: Any) -> Converter:
"""Get a converter from a simple prefix map.
:param prefix_map: One of the following:
- A mapping whose keys represent CURIE prefixes and values represent URI
prefixes
- A string containing a remote location of a JSON file containg a prefix map
- A string or :class:`pathlib.Path` object corresponding to a local file
path to a JSON file containing a prefix map
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
>>> converter = Converter.from_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
... "GO": "http://purl.obolibrary.org/obo/GO_",
... "OBO": "http://purl.obolibrary.org/obo/",
... }
... )
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
"""
return cls(
[
Record(prefix=prefix, uri_prefix=uri_prefix)
for prefix, uri_prefix in _prepare(prefix_map).items()
],
**kwargs,
)
[docs]
@classmethod
def from_reverse_prefix_map(
cls, reverse_prefix_map: LocationOr[Mapping[str, str]], **kwargs: Any
) -> Converter:
"""Get a converter from a reverse prefix map.
:param reverse_prefix_map: A mapping whose keys are URI prefixes and whose
values are the corresponding prefixes. This data structure allow for
multiple different URI formats to point to the same prefix.
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
>>> converter = Converter.from_reverse_prefix_map(
... {
... "http://purl.obolibrary.org/obo/CHEBI_": "CHEBI",
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=": "CHEBI",
... "http://purl.obolibrary.org/obo/MONDO_": "MONDO",
... }
... )
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
>>> converter.compress("https://www.ebi.ac.uk/chebi/searchId.do?chebiId=138488")
'CHEBI:138488'
Altenatively, get content from the internet like
>>> url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/contexts/bioregistry.rpm.json"
>>> converter = Converter.from_reverse_prefix_map(url)
>>> "chebi" in converter.prefix_map
"""
dd: defaultdict[str, list[str]] = defaultdict(list)
for uri_prefix, prefix in _prepare(reverse_prefix_map).items():
dd[prefix].append(uri_prefix)
records = []
for prefix, uri_prefixes in dd.items():
uri_prefix, *uri_prefix_synonyms = sorted(uri_prefixes, key=len)
records.append(
Record(
prefix=prefix, uri_prefix=uri_prefix, uri_prefix_synonyms=uri_prefix_synonyms
)
)
return cls(records, **kwargs)
[docs]
@classmethod
def from_jsonld(cls, data: LocationOr[dict[str, Any]], **kwargs: Any) -> Converter:
"""Get a converter from a JSON-LD object, which contains a prefix map in its ``@context`` key.
:param data: A JSON-LD object
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
Example from a remote context file:
>>> base = "https://raw.githubusercontent.com"
>>> url = f"{base}/biopragmatics/bioregistry/main/exports/contexts/semweb.context.jsonld"
>>> converter = Converter.from_jsonld(url)
>>> "rdf" in converter.prefix_map
.. seealso::
https://www.w3.org/TR/json-ld11/#the-context defines the ``@context`` aspect
of JSON-LD
"""
prefix_map = {}
for key, value in _prepare(data)["@context"].items():
# TODO how to handle key == "@base"?
if not key:
logger.warning(
"The JSON-LD specification says in https://www.w3.org/TR/json-ld/#terms that "
"keys are not allowed to be empty strings. The given @context object contained "
"an empty string as one of its keys"
)
continue
if key.startswith("@"):
continue
if isinstance(value, str):
prefix_map[key] = value
elif isinstance(value, dict) and value.get("@prefix") is True:
prefix_map[key] = value["@id"]
return cls.from_prefix_map(prefix_map, **kwargs)
[docs]
@classmethod
def from_jsonld_github(
cls, owner: str, repo: str, *path: str, branch: str = "main", **kwargs: Any
) -> Converter:
"""Construct a remote JSON-LD URL on GitHub then parse with :meth:`Converter.from_jsonld`.
:param owner: A github repository owner or organization (e.g.,
``biopragmatics``)
:param repo: The name of the repository (e.g., ``bioregistry``)
:param path: The file path in the GitHub repository to a JSON-LD context file.
:param branch: The branch from which the file should be downloaded. Defaults to
``main``, for old repositories this might need to be changed to ``master``.
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
:raises ValueError: If the given path doesn't end in a .jsonld file name
>>> converter = Converter.from_jsonld_github(
... "biopragmatics",
... "bioregistry",
... "exports",
... "contexts",
... "semweb.context.jsonld",
... )
>>> "rdf" in converter.prefix_map
True
"""
if not path or not path[-1].endswith(".jsonld"):
raise ValueError("final path argument should end with .jsonld")
rest = "/".join(path)
url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{rest}"
return cls.from_jsonld(url, **kwargs)
[docs]
@classmethod
def from_rdflib(
cls,
graph_or_manager: rdflib.Graph | rdflib.namespace.NamespaceManager,
**kwargs: Any,
) -> Converter:
"""Get a converter from an RDFLib graph or namespace manager.
:param graph_or_manager: A RDFLib graph or manager object
:param kwargs: Keyword arguments to pass to :meth:`from_prefix_map`
:returns: A converter
In the following example, a :class:`rdflib.Graph` is created, a namespace is
bound to it, then a converter is made:
>>> import rdflib, curies
>>> graph = rdflib.Graph()
>>> graph.bind("hgnc", "https://bioregistry.io/hgnc:")
>>> converter = curies.Converter.from_rdflib(graph)
>>> converter.expand("hgnc:1234")
'https://bioregistry.io/hgnc:1234'
This also works if you directly start with a
:class:`rdflib.namespace.NamespaceManager`:
>>> converter = curies.Converter.from_rdflib(graph.namespace_manager)
>>> converter.expand("hgnc:1234")
'https://bioregistry.io/hgnc:1234'
"""
# it's required to stringify namespace since it's a rdflib.URIRef
# object, which acts funny if not coerced into a string
prefix_map = {prefix: str(namespace) for prefix, namespace in graph_or_manager.namespaces()}
return cls.from_prefix_map(prefix_map, **kwargs)
[docs]
def bind_rdflib(
self,
graph_or_manager: rdflib.Graph | rdflib.namespace.NamespaceManager,
synonyms: bool = False,
) -> None:
"""Add the prefix map from this converter to a RDFlib graph or manager.
:param graph_or_manager: A RDFLib graph or manager object
:param synonyms: Should CURIE prefix synonyms be bound?
Binding a graph:
>>> import curies, rdflib
>>> converter = curies.get_obo_converter()
>>> graph = rdflib.Graph()
>>> converter.bind_rdflib(graph)
Binding a manager:
>>> import curies, rdflib, rdflib.namespace
>>> converter = curies.get_obo_converter()
>>> manager = rdflib.namespace.NamespaceManager(graph=rdflib.Graph())
>>> converter.bind_rdflib(manager)
"""
from rdflib import Namespace
for record in self.records:
namespace = Namespace(record.uri_prefix)
graph_or_manager.bind(record.prefix, namespace)
if synonyms:
for synonym in record.prefix_synonyms:
graph_or_manager.bind(synonym, namespace)
[docs]
@classmethod
def from_shacl(
cls,
graph: str | Path | rdflib.Graph,
format: str | None = None,
**kwargs: Any,
) -> Converter:
"""Get a converter from SHACL, either in a turtle f.
:param graph: A RDFLib graph, a Path, a string representing a file path, or a
string URL
:param format: The RDF format, if a file path is given
:param kwargs: Keyword arguments to pass to :meth:`Converter.__init__`
:returns: A converter
"""
if isinstance(graph, (str, Path)):
import rdflib
temporary_graph = rdflib.Graph()
temporary_graph.parse(location=Path(graph).resolve().as_posix(), format=format)
graph = temporary_graph
query = """\
SELECT ?curie_prefix ?uri_prefix ?pattern
WHERE {
?bnode1 sh:declare ?bnode2 .
?bnode2 sh:prefix ?curie_prefix .
?bnode2 sh:namespace ?uri_prefix .
OPTIONAL { ?bnode2 sh:pattern ?pattern . }
}
"""
results = cast(Iterable[tuple[str, str, str]], graph.query(query))
records = [
Record(prefix=str(prefix), uri_prefix=str(uri_prefix), pattern=pattern and str(pattern))
for prefix, uri_prefix, pattern in results
]
return cls(records, **kwargs)
[docs]
def get_prefixes(self, *, include_synonyms: bool = False) -> set[str]:
"""Get the set of prefixes covered by this converter.
:param include_synonyms: If true, include secondary prefixes.
:returns: A set of primary prefixes covered by the converter. If
``include_synonyms`` is set to ``True``, secondary prefixes (i.e., ones in
:data:`Record.prefix_synonyms` are also included
"""
rv = {record.prefix for record in self.records}
if include_synonyms:
rv.update(
prefix_synonym
for record in self.records
for prefix_synonym in record.prefix_synonyms
)
return rv
[docs]
def get_uri_prefixes(self, *, include_synonyms: bool = False) -> set[str]:
"""Get the set of URI prefixes covered by this converter.
:param include_synonyms: If true, include secondary prefixes.
:returns: A set of primary URI prefixes covered by the converter. If
``include_synonyms`` is set to ``True``, secondary URI prefixes (i.e., ones
in :data:`Record.uri_prefix_synonyms` are also included
"""
rv = {record.uri_prefix for record in self.records}
if include_synonyms:
rv.update(
uri_prefix_synonym
for record in self.records
for uri_prefix_synonym in record.uri_prefix_synonyms
)
return rv
[docs]
def is_uri(self, s: str) -> bool:
"""Check if the string can be parsed as a URI by this converter.
:param s: A string that might be a URI
:returns: If the string can be parsed as a URI by this converter. Note that some
valid URIs, when passed to this function, will result in False if their URI
prefixes are not registered with this converter.
>>> import curies
>>> converter = curies.get_obo_converter()
>>> converter.is_uri("http://purl.obolibrary.org/obo/GO_1234567")
True
>>> converter.is_uri("GO:1234567")
False
The following is a valid URI, but the prefix is not registered with the
converter based on the OBO Foundry prefix map, so it returns False.
>>> converter.is_uri("http://proteopedia.org/wiki/index.php/2gc4")
False
"""
return self.compress(s) is not None
# docstr-coverage:excused `overload`
@overload
def compress_or_standardize(
self, uri_or_curie: str, *, strict: Literal[True] = True, passthrough: bool = ...
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def compress_or_standardize(
self,
uri_or_curie: str,
*,
strict: Literal[False] = False,
passthrough: Literal[True] = True,
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def compress_or_standardize(
self,
uri_or_curie: str,
*,
strict: Literal[False] = False,
passthrough: Literal[False] = False,
) -> str | None: ...
[docs]
def compress_or_standardize(
self, uri_or_curie: str, *, strict: bool = False, passthrough: bool = False
) -> str | None:
"""Compress a URI or standardize a CURIE.
:param uri_or_curie: A string representing a compact URI (CURIE) or a URI.
:param strict: If true and the string is neither a URI that can be compressed
nor a CURIE that can be standardized, returns an error. Defaults to false.
:param passthrough: If true, strict is false, and the string is neither a URI
that can be compressed nor a CURIE that can be standardized, return the
input. Defaults to false.
:returns: If the string is a URI, and it can be compressed, returns the
corresponding CURIE. If the string is a CURIE, and it can be standardized,
returns the standard CURIE.
:raises CompressionError: If strict is true and the URI can't be compressed
>>> from curies import Converter, Record
>>> converter = Converter.from_extended_prefix_map(
... [
... Record(
... prefix="CHEBI",
... prefix_synonyms=["chebi"],
... uri_prefix="http://purl.obolibrary.org/obo/CHEBI_",
... uri_prefix_synonyms=["https://identifiers.org/chebi:"],
... ),
... ]
... )
>>> converter.compress_or_standardize("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
>>> converter.compress_or_standardize("https://identifiers.org/chebi:138488")
'CHEBI:138488'
>>> converter.compress_or_standardize("CHEBI:138488")
'CHEBI:138488'
>>> converter.compress_or_standardize("chebi:138488")
'CHEBI:138488'
>>> converter.compress_or_standardize("missing:0000000")
>>> converter.compress_or_standardize("https://example.com/missing:0000000")
"""
reference = self.parse(uri_or_curie, strict=False)
if reference is not None:
return self.format_curie(reference.prefix, reference.identifier)
if strict:
raise CompressionError(uri_or_curie)
if passthrough:
return uri_or_curie
return None
# docstr-coverage:excused `overload`
@overload
def parse(
self, str_or_uri_or_curie: str, *, strict: Literal[True] = True
) -> ReferenceTuple: ...
# docstr-coverage:excused `overload`
@overload
def parse(
self, str_or_uri_or_curie: str, *, strict: Literal[False] = False
) -> ReferenceTuple | None: ...
[docs]
def parse(self, str_or_uri_or_curie: str, *, strict: bool = False) -> ReferenceTuple | None:
"""Parse a string, URI, or CURIE."""
if self.is_uri(str_or_uri_or_curie):
return self.parse_uri(str_or_uri_or_curie, strict=strict) # type:ignore[no-any-return,call-overload]
if self.is_curie(str_or_uri_or_curie):
return self.parse_curie(str_or_uri_or_curie, strict=strict) # type:ignore[no-any-return,call-overload]
if strict:
raise CompressionError(str_or_uri_or_curie)
return None
[docs]
def compress_strict(self, uri: str) -> str:
"""Compress a URI to a CURIE, and raise an error of not possible."""
return self.compress(uri, strict=True)
# docstr-coverage:excused `overload`
@overload
def compress(
self, uri: str, *, strict: Literal[True] = True, passthrough: bool = ...
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def compress(
self, uri: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def compress(
self, uri: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
) -> str | None: ...
[docs]
def compress(self, uri: str, *, strict: bool = False, passthrough: bool = False) -> str | None:
"""Compress a URI to a CURIE, if possible.
:param uri: A string representing a valid uniform resource identifier (URI)
:param strict: If true and the URI can't be compressed, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the URI can't be compressed,
return the input. Defaults to false.
:returns: A compact URI if this converter could find an appropriate URI prefix,
otherwise none.
:raises CompressionError: If strict is set to true and the URI can't be
compressed
>>> from curies import Converter
>>> converter = Converter.from_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
... "GO": "http://purl.obolibrary.org/obo/GO_",
... "OBO": "http://purl.obolibrary.org/obo/",
... }
... )
>>> converter.compress("http://purl.obolibrary.org/obo/GO_0032571")
'GO:0032571'
>>> converter.compress("http://purl.obolibrary.org/obo/go.owl")
'OBO:go.owl'
>>> converter.compress("http://example.org/missing:0000000")
.. note::
If there are partially overlapping *URI prefixes* in this converter (e.g.,
``http://purl.obolibrary.org/obo/GO_`` for the prefix ``GO`` and
``http://purl.obolibrary.org/obo/`` for the prefix ``OBO``), the longest URI
prefix will always be matched. For example, parsing
``http://purl.obolibrary.org/obo/GO_0032571`` will return ``GO:0032571``
instead of ``OBO:GO_0032571``.
"""
reference: ReferenceTuple | None = self.parse_uri(uri)
if reference:
return self.format_curie(reference.prefix, reference.identifier)
if strict:
raise CompressionError(uri)
if passthrough:
return uri
return None
# docstr-coverage:excused `overload`
@overload
def parse_uri(
self, uri: str, *, strict: Literal[False] = ..., return_none: None = ...
) -> ReferenceTuple | None: ...
# docstr-coverage:excused `overload`
@overload
def parse_uri(
self,
uri: str,
*,
strict: Literal[True] = True,
return_none: None = ...,
) -> ReferenceTuple: ...
[docs]
def parse_uri(
self, uri: str, *, strict: bool = False, return_none: None = None
) -> ReferenceTuple | None:
"""Compress a URI to a CURIE pair.
:param uri: A string representing a valid uniform resource identifier (URI)
:param strict: If true and the URI can't be parsed, returns an error. Defaults
to false.
:param return_none: Opt into future type returning of a single None instead of a
pair of Nones
:returns: A CURIE pair if the URI could be parsed, otherwise a pair of None's
:raises CompressionError: if strict is set to true and the URI can't be parsed
:raises NotImplementedError: If you pass ``False`` to return_none
>>> from curies import Converter
>>> converter = Converter.from_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
... "GO": "http://purl.obolibrary.org/obo/GO_",
... }
... )
>>> converter.parse_uri("http://purl.obolibrary.org/obo/CHEBI_138488")
ReferenceTuple(prefix='CHEBI', identifier='138488')
>>> converter.parse_uri("http://example.org/missing:0000000")
"""
rv = self.trie.parse_uri(uri)
if rv is not None:
return rv
if strict:
raise CompressionError(uri) from None
if return_none is None:
return None
elif return_none is True:
raise NotImplementedError("return_none should not be passed as of curies v0.13.0")
else: # i.e., return_none=False, which isn't supported anymore.
raise NotImplementedError(RETURN_NONE_ERROR_TEXT)
[docs]
def is_curie(self, s: str) -> bool:
"""Check if the string can be parsed as a CURIE by this converter.
:param s: A string that might be a CURIE
:returns: If the string can be parsed as a CURIE by this converter. Note that
some valid CURIEs, when passed to this function, will result in False if
their prefixes are not registered with this converter.
>>> import curies
>>> converter = curies.get_obo_converter()
>>> converter.is_curie("GO:1234567")
True
>>> converter.is_curie("http://purl.obolibrary.org/obo/GO_1234567")
False
The following is a valid CURIE, but the prefix is not registered with the
converter based on the OBO Foundry prefix map, so it returns False.
>>> converter.is_curie("pdb:2gc4")
False
"""
try:
return self.expand(s) is not None
except ValueError:
return False
# docstr-coverage:excused `overload`
@overload
def expand_or_standardize(
self, curie_or_uri: str, *, strict: Literal[True] = True, passthrough: bool = ...
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def expand_or_standardize(
self,
curie_or_uri: str,
*,
strict: Literal[False] = False,
passthrough: Literal[True] = True,
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def expand_or_standardize(
self,
curie_or_uri: str,
*,
strict: Literal[False] = False,
passthrough: Literal[False] = False,
) -> str | None: ...
[docs]
def expand_or_standardize(
self, curie_or_uri: str, *, strict: bool = False, passthrough: bool = False
) -> str | None:
"""Expand a CURIE or standardize a URI.
:param curie_or_uri: A string representing a compact URI (CURIE) or a URI.
:param strict: If true and the string is neither a CURIE that can be expanded
nor a URI that can be standardized, returns an error. Defaults to false.
:param passthrough: If true, strict is false, and the string is neither a CURIE
that can be expanded nor a URI that can be standardized, return the input.
Defaults to false.
:returns: If the string is a CURIE, and it can be expanded, returns the
corresponding URI. If the string is a URI, and it can be standardized,
returns the standard URI.
:raises ExpansionError: If strict is true and the CURIE can't be expanded
>>> from curies import Converter, Record
>>> converter = Converter.from_extended_prefix_map(
... [
... Record(
... prefix="CHEBI",
... prefix_synonyms=["chebi"],
... uri_prefix="http://purl.obolibrary.org/obo/CHEBI_",
... uri_prefix_synonyms=["https://identifiers.org/chebi:"],
... ),
... ]
... )
>>> converter.expand_or_standardize("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.expand_or_standardize("chebi:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.expand_or_standardize("http://purl.obolibrary.org/obo/CHEBI_138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.expand_or_standardize("https://identifiers.org/chebi:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.expand_or_standardize("missing:0000000")
>>> converter.expand_or_standardize("https://example.com/missing:0000000")
"""
reference = self.parse(curie_or_uri, strict=False)
if reference is not None:
return self.expand_reference(reference, strict=strict, passthrough=passthrough) # type:ignore[no-any-return,call-overload]
if strict:
raise ExpansionError(curie_or_uri)
if passthrough:
return curie_or_uri
return None
[docs]
def expand_strict(self, curie: str) -> str:
"""Expand a CURIE to a URI, and raise an error of not possible."""
return self.expand(curie, strict=True)
# docstr-coverage:excused `overload`
@overload
def expand(
self, curie: str, *, strict: Literal[True] = True, passthrough: bool = ...
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def expand(
self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def expand(
self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
) -> str | None: ...
[docs]
def expand(self, curie: str, *, strict: bool = False, passthrough: bool = False) -> str | None:
"""Expand a CURIE to a URI, if possible.
:param curie: A string representing a compact URI (CURIE)
:param strict: If true and the CURIE can't be expanded, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the CURIE can't be expanded,
return the input. Defaults to false. If your strings can either be a CURIE
_or_ a URI, consider using :meth:`Converter.expand_or_standardize` instead.
:returns: A URI if this converter contains a URI prefix for the prefix in this
CURIE
:raises ExpansionError: If strict is true and the CURIE can't be expanded
>>> from curies import Converter
>>> converter = Converter.from_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
... "GO": "http://purl.obolibrary.org/obo/GO_",
... }
... )
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.expand("missing:0000000")
"""
reference = self.parse_curie(curie, strict=False)
if reference is not None:
return self.expand_reference(reference, strict=strict, passthrough=passthrough) # type:ignore[no-any-return,call-overload]
if strict:
raise ExpansionError(curie)
if passthrough:
return curie
return None
# docstr-coverage:excused `overload`
@overload
def expand_all(
self, curie: str, *, strict: Literal[False] = False
) -> Collection[str] | None: ...
# docstr-coverage:excused `overload`
@overload
def expand_all(self, curie: str, *, strict: Literal[True] = True) -> Collection[str]: ...
[docs]
def expand_all(self, curie: str, *, strict: bool = False) -> Collection[str] | None:
"""Expand a CURIE pair to all possible URIs.
:param curie: A string representing a compact URI
:param strict: If true and the prefix can't be expanded, returns an error.
Defaults to false.
:returns: A list of URIs that this converter can create for the given CURIE. The
first entry is the "standard" URI then others are based on URI prefix
synonyms. If the prefix is not registered to this converter, none is
returned.
:raises PrefixStandardizationError: if the prefix in the CURIE can not be looked
up
>>> priority_prefix_map = {
... "CHEBI": [
... "http://purl.obolibrary.org/obo/CHEBI_",
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:",
... ],
... }
>>> converter = Converter.from_priority_prefix_map(priority_prefix_map)
>>> converter.expand_all("CHEBI:138488")
['http://purl.obolibrary.org/obo/CHEBI_138488', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488']
>>> converter.expand_all("NOPE:NOPE") is None
True
"""
reference = self.parse_curie(curie, strict=False)
if reference is not None:
return self.expand_pair_all(reference.prefix, reference.identifier)
if strict:
raise PrefixStandardizationError(curie)
return None
# docstr-coverage:excused `overload`
@overload
def parse_curie(
self, curie: str, *, strict: Literal[False] = False
) -> ReferenceTuple | None: ...
# docstr-coverage:excused `overload`
@overload
def parse_curie(self, curie: str, *, strict: Literal[True] = True) -> ReferenceTuple: ...
[docs]
def parse_curie(self, curie: str, *, strict: bool = False) -> ReferenceTuple | None:
"""Parse and standardize a CURIE."""
prefix, identifier = _split(curie, sep=self.delimiter)
norm_prefix = self.standardize_prefix(prefix, strict=False)
if norm_prefix is None:
if strict:
raise PrefixStandardizationError(prefix)
return None
norm_identifier = self.standardize_identifier(norm_prefix, identifier)
if norm_identifier is None:
if strict:
raise IdentifierStandardizationError(curie)
return None
return ReferenceTuple(norm_prefix, norm_identifier)
# docstr-coverage:excused `overload`
@overload
def standardize_identifier(
self, standard_prefix: str, identifier: str, *, strict: Literal[True] = ...
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_identifier(
self, standard_prefix: str, identifier: str, *, strict: Literal[False] = ...
) -> str | None: ...
[docs]
def standardize_identifier(
self, standard_prefix: str, identifier: str, *, strict: bool = False
) -> str | None:
"""Standardize an identifier.
:param standard_prefix: This is a prefix that has already been standardized
using :meth:`standardize_prefix` in this converter
:param identifier: An unstandardized identifier
:param strict: If true, requires standardization to succeed or throws an error
:returns: A standardized identifier.
By default, this function is a no-op, meaning that it just returns the
identifier as is. You can override the :class:`Converter` class to implement
this method to do standardization (e.g., removing redundant prefixes) and to do
validation (e.g., checking against a regular expression).
"""
return identifier
# docstr-coverage:excused `overload`
@overload
def expand_reference(
self,
reference: ReferenceTuple | Reference,
*,
strict: Literal[True] = ...,
passthrough: bool = ...,
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def expand_reference(
self,
reference: ReferenceTuple | Reference,
*,
strict: Literal[False] = ...,
passthrough: bool = ...,
) -> str | None: ...
[docs]
def expand_reference(
self,
reference: ReferenceTuple | Reference,
*,
strict: bool = False,
passthrough: bool = False,
) -> str | None:
"""Expand a reference."""
uri_prefix = self.prefix_map.get(reference.prefix)
if uri_prefix is not None:
return uri_prefix + reference.identifier
if strict:
raise ExpansionError(reference.prefix)
if passthrough:
return self.format_curie(reference.prefix, reference.identifier)
return None
# docstr-coverage:excused `overload`
@overload
def expand_pair(
self, prefix: str, identifier: str, *, strict: Literal[True] = True, passthrough: bool = ...
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def expand_pair(
self,
prefix: str,
identifier: str,
*,
strict: Literal[False] = False,
passthrough: bool = ...,
) -> str | None: ...
[docs]
def expand_pair(
self, prefix: str, identifier: str, *, strict: bool = False, passthrough: bool = False
) -> str | None:
"""Expand a CURIE pair to the standard URI.
:param prefix: The prefix of the CURIE
:param identifier: The local unique identifier of the CURIE
:param strict: If true and the prefix can't be expanded, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the prefix can't be expanded,
return the input. Defaults to false.
:returns: A URI if this converter contains a URI prefix for the prefix in this
CURIE
>>> from curies import Converter
>>> converter = Converter.from_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... "MONDO": "http://purl.obolibrary.org/obo/MONDO_",
... "GO": "http://purl.obolibrary.org/obo/GO_",
... }
... )
>>> converter.expand_pair("CHEBI", "138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.expand_pair("missing", "0000000")
"""
return self.expand_reference( # type:ignore[no-any-return,call-overload]
ReferenceTuple(prefix, identifier), strict=strict, passthrough=passthrough
)
# docstr-coverage:excused `overload`
@overload
def expand_pair_all(
self, prefix: str, identifier: str, *, strict: Literal[True] = True
) -> Collection[str]: ...
# docstr-coverage:excused `overload`
@overload
def expand_pair_all(
self, prefix: str, identifier: str, *, strict: Literal[False] = False
) -> Collection[str] | None: ...
[docs]
def expand_pair_all(
self, prefix: str, identifier: str, *, strict: bool = False
) -> Collection[str] | None:
"""Expand a CURIE pair to all possible URIs.
:param prefix: The prefix of the CURIE
:param identifier: The local unique identifier of the CURIE
:param strict: If true and the prefix can't be expanded, returns an error.
Defaults to false.
:returns: A list of URIs that this converter can create for the given CURIE. The
first entry is the "standard" URI then others are based on URI prefix
synonyms. If the prefix is not registered to this converter, none is
returned.
:raises ExpansionError: if the prefix in the CURIE can not be looked up
>>> priority_prefix_map = {
... "CHEBI": [
... "http://purl.obolibrary.org/obo/CHEBI_",
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:",
... ],
... }
>>> converter = Converter.from_priority_prefix_map(priority_prefix_map)
>>> converter.expand_pair_all("CHEBI", "138488")
['http://purl.obolibrary.org/obo/CHEBI_138488', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488']
>>> converter.expand_pair_all("NOPE", "NOPE") is None
True
"""
record = self._prefix_to_record.get(prefix)
if record is not None:
rv = [record.uri_prefix + identifier]
for uri_prefix_synonyms in record.uri_prefix_synonyms:
rv.append(uri_prefix_synonyms + identifier)
return rv
if strict:
raise ExpansionError(prefix)
return None
# docstr-coverage:excused `overload`
@overload
def standardize_prefix(
self, prefix: str, *, strict: Literal[True] = True, passthrough: bool = False
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_prefix(
self, prefix: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_prefix(
self, prefix: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
) -> str | None: ...
[docs]
def standardize_prefix(
self, prefix: str, *, strict: bool = False, passthrough: bool = False
) -> str | None:
"""Standardize a prefix.
:param prefix: The prefix of the CURIE
:param strict: If true and the prefix can't be standardized, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the prefix can't be
standardized, return the input. Defaults to false.
:returns: The standardized version of this prefix wrt this converter. If the
prefix is not registered in this converter, returns none.
:raises PrefixStandardizationError: If strict is true and the prefix can't be
standardied
>>> from curies import Converter, Record
>>> converter = Converter.from_extended_prefix_map(
... [
... Record(prefix="CHEBI", prefix_synonyms=["chebi"], uri_prefix="..."),
... ]
... )
>>> converter.standardize_prefix("chebi")
'CHEBI'
>>> converter.standardize_prefix("CHEBI")
'CHEBI'
>>> converter.standardize_prefix("NOPE") is None
True
>>> converter.standardize_prefix("NOPE", passthrough=True)
'NOPE'
"""
record = self._prefix_to_record.get(prefix)
if record:
return record.prefix
if strict:
raise PrefixStandardizationError(prefix)
if passthrough:
return prefix
return None
# docstr-coverage:excused `overload`
@overload
def standardize_curie(
self, curie: str, *, strict: Literal[True] = True, passthrough: bool = False
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_curie(
self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_curie(
self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
) -> str | None: ...
[docs]
def standardize_curie(
self, curie: str, *, strict: bool = False, passthrough: bool = False
) -> str | None:
"""Standardize a CURIE.
:param curie: A string representing a compact URI (CURIE)
:param strict: If true and the CURIE can't be standardized, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the CURIE can't be
standardized, return the input. Defaults to false.
:returns: A standardized version of the CURIE in case a prefix synonym was used.
Note that this function is idempotent, i.e., if you give an already standard
CURIE, it will just return it as is. If the CURIE can't be parsed with
respect to the records in the converter, None is returned.
:raises CURIEStandardizationError: If strict is true and the CURIE can't be
standardized
>>> from curies import Converter, Record
>>> converter = Converter.from_extended_prefix_map(
... [
... Record(
... prefix="CHEBI",
... prefix_synonyms=["chebi"],
... uri_prefix="http://purl.obolibrary.org/obo/CHEBI_",
... ),
... ]
... )
>>> converter.standardize_curie("chebi:138488")
'CHEBI:138488'
>>> converter.standardize_curie("CHEBI:138488")
'CHEBI:138488'
>>> converter.standardize_curie("NOPE:NOPE") is None
True
>>> converter.standardize_curie("NOPE:NOPE", passthrough=True)
'NOPE:NOPE'
"""
rt = self.parse_curie(curie)
if rt is not None:
return self.format_curie(*rt)
if strict:
raise CURIEStandardizationError(curie)
if passthrough:
return curie
return None
# docstr-coverage:excused `overload`
@overload
def standardize_uri(
self, uri: str, *, strict: Literal[True] = True, passthrough: bool = False
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_uri(
self, uri: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
) -> str: ...
# docstr-coverage:excused `overload`
@overload
def standardize_uri(
self, uri: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
) -> str | None: ...
[docs]
def standardize_uri(
self, uri: str, *, strict: bool = False, passthrough: bool = False
) -> str | None:
"""Standardize a URI.
:param uri: A string representing a valid uniform resource identifier (URI)
:param strict: If true and the URI can't be standardized, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the URI can't be standardized,
return the input. Defaults to false.
:returns: A standardized version of the URI in case a URI prefix synonym was
used. Note that this function is idempotent, i.e., if you give an already
standard URI, it will just return it as is. If the URI can't be parsed with
respect to the records in the converter, None is returned.
:raises URIStandardizationError: If strict is true and the URI can't be
standardized
>>> from curies import Converter, Record
>>> converter = Converter.from_extended_prefix_map(
... [
... Record(
... prefix="CHEBI",
... uri_prefix="http://purl.obolibrary.org/obo/CHEBI_",
... uri_prefix_synonyms=[
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:",
... ],
... ),
... ]
... )
>>> converter.standardize_uri(
... "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488"
... )
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.standardize_uri("http://purl.obolibrary.org/obo/CHEBI_138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.standardize_uri("http://example.org/NOPE") is None
True
>>> converter.standardize_uri("http://example.org/NOPE", passthrough=True)
'http://example.org/NOPE'
"""
reference: ReferenceTuple | None = self.parse_uri(uri, strict=False)
if reference is not None:
# prefix is ensured to be in self.prefix_map because of successful parse
return self.prefix_map[reference.prefix] + reference.identifier
if strict:
raise URIStandardizationError(uri)
if passthrough:
return uri
return None
# docstr-coverage:excused `overload`
@overload
def standardize_reference(
self, reference: Reference, *, strict: Literal[True] = ...
) -> Reference: ...
# docstr-coverage:excused `overload`
@overload
def standardize_reference(
self, reference: Reference, *, strict: Literal[False] = ...
) -> Reference | None: ...
[docs]
def standardize_reference(
self, reference: Reference, *, strict: bool = False
) -> Reference | None:
"""Standardizes a reference."""
st_prefix = self.standardize_prefix(reference.prefix, strict=False)
if st_prefix is None and strict:
raise PrefixStandardizationError(reference.prefix)
st_identifier = self.standardize_identifier(st_prefix, reference.identifier, strict=False)
if st_identifier is None and strict:
raise IdentifierStandardizationError(reference.curie)
if st_prefix and st_identifier:
return reference.model_copy(update={"prefix": st_prefix, "identifier": st_identifier})
return None
[docs]
def pd_compress(
self,
df: pandas.DataFrame,
column: str | int,
target_column: None | str | int = None,
strict: bool = False,
passthrough: bool = False,
ambiguous: bool = False,
) -> None:
"""Convert all URIs in the given column to CURIEs.
:param df: A pandas DataFrame
:param column: The column in the dataframe containing URIs to convert to CURIEs.
:param target_column: The column to put the results in. Defaults to input
column.
:param strict: If true and the URI can't be compressed, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the URI can't be compressed,
return the input. Defaults to false.
:param ambiguous: If true, consider the column as containing either CURIEs or
URIs.
"""
if ambiguous:
func = partial(self.compress_or_standardize, strict=strict, passthrough=passthrough)
else:
func = partial(self.compress, strict=strict, passthrough=passthrough)
df[column if target_column is None else target_column] = df[column].map(func)
[docs]
def pd_expand(
self,
df: pandas.DataFrame,
column: str | int,
target_column: None | str | int = None,
strict: bool = False,
passthrough: bool = False,
ambiguous: bool = False,
) -> None:
"""Convert all CURIEs in the given column to URIs.
:param df: A pandas DataFrame
:param column: The column in the dataframe containing CURIEs to convert to URIs.
:param target_column: The column to put the results in. Defaults to input
column.
:param strict: If true and the CURIE can't be expanded, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the CURIE can't be expanded,
return the input. Defaults to false.
:param ambiguous: If true, consider the column as containing either CURIEs or
URIs.
"""
if ambiguous:
func = partial(self.expand_or_standardize, strict=strict, passthrough=passthrough)
else:
func = partial(self.expand, strict=strict, passthrough=passthrough)
df[column if target_column is None else target_column] = df[column].map(func)
[docs]
def pd_standardize_prefix(
self,
df: pandas.DataFrame,
*,
column: str | int,
target_column: None | str | int = None,
strict: bool = False,
passthrough: bool = False,
) -> None:
"""Standardize all prefixes in the given column.
:param df: A pandas DataFrame
:param column: The column in the dataframe containing prefixes to standardize.
:param target_column: The column to put the results in. Defaults to input
column.
:param strict: If true and any prefix can't be standardized, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and any prefix can't be
standardized, return the input. Defaults to false.
"""
func = partial(self.standardize_prefix, strict=strict, passthrough=passthrough)
df[column if target_column is None else target_column] = df[column].map(func)
[docs]
def pd_standardize_curie(
self,
df: pandas.DataFrame,
*,
column: str | int,
target_column: None | str | int = None,
strict: bool = False,
passthrough: bool = False,
) -> None:
r"""Standardize all CURIEs in the given column.
:param df: A pandas DataFrame
:param column: The column in the dataframe containing CURIEs to standardize.
:param target_column: The column to put the results in. Defaults to input
column.
:param strict: If true and any CURIE can't be standardized, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and any CURIE can't be
standardized, return the input. Defaults to false.
The Disease Ontology curates mappings to other semantic spaces and distributes
them in the tabular SSSOM format. However, they use a wide variety of
non-standard prefixes for referring to external vocabularies like SNOMED-CT. The
Bioregistry contains these synonyms to support reconciliation. The following
example shows how the SSSOM mappings dataframe can be loaded and this function
applied to the mapping ``object_id`` column (in place).
>>> import curies
>>> import pandas as pd
>>> commit = "faca4fc335f9a61902b9c47a1facd52a0d3d2f8b"
>>> url = f"https://raw.githubusercontent.com/mapping-commons/disease-mappings/{commit}/mappings/doid.sssom.tsv"
>>> df = pd.read_csv(url, sep="\t", comment="#")
>>> converter = curies.get_bioregistry_converter()
>>> converter.pd_standardize_curie(df, column="object_id")
"""
func = partial(self.standardize_curie, strict=strict, passthrough=passthrough)
df[column if target_column is None else target_column] = df[column].map(func)
[docs]
def pd_standardize_uri(
self,
df: pandas.DataFrame,
*,
column: str | int,
target_column: None | str | int = None,
strict: bool = False,
passthrough: bool = False,
) -> None:
"""Standardize all URIs in the given column.
:param df: A pandas DataFrame
:param column: The column in the dataframe containing URIs to standardize.
:param target_column: The column to put the results in. Defaults to input
column.
:param strict: If true and any URI can't be standardized, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and any URI can't be standardized,
return the input. Defaults to false.
"""
func = partial(self.standardize_uri, strict=strict, passthrough=passthrough)
df[column if target_column is None else target_column] = df[column].map(func)
[docs]
def file_compress(
self,
path: str | Path,
column: int,
*,
sep: str | None = None,
header: bool = True,
strict: bool = False,
passthrough: bool = False,
ambiguous: bool = False,
) -> None:
"""Convert all URIs in the given column of a CSV file to CURIEs.
:param path: A pandas DataFrame
:param column: The column in the dataframe containing URIs to convert to CURIEs.
:param sep: The delimiter of the CSV file, defaults to tab
:param header: Does the file have a header row?
:param strict: If true and the URI can't be compressed, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the URI can't be compressed,
return the input. Defaults to false.
:param ambiguous: If true, consider the column as containing either CURIEs or
URIs.
"""
if ambiguous:
func = partial(self.compress_or_standardize, strict=strict, passthrough=passthrough)
else:
func = partial(self.compress, strict=strict, passthrough=passthrough)
self._file_helper(func, path=path, column=column, sep=sep, header=header)
[docs]
def file_expand(
self,
path: str | Path,
column: int,
*,
sep: str | None = None,
header: bool = True,
strict: bool = False,
passthrough: bool = False,
ambiguous: bool = False,
) -> None:
"""Convert all CURIEs in the given column of a CSV file to URIs.
:param path: A pandas DataFrame
:param column: The column in the dataframe containing CURIEs to convert to URIs.
:param sep: The delimiter of the CSV file, defaults to tab
:param header: Does the file have a header row?
:param strict: If true and the CURIE can't be expanded, returns an error.
Defaults to false.
:param passthrough: If true, strict is false, and the CURIE can't be expanded,
return the input. Defaults to false.
:param ambiguous: If true, consider the column as containing either CURIEs or
URIs.
"""
if ambiguous:
func = partial(self.expand_or_standardize, strict=strict, passthrough=passthrough)
else:
func = partial(self.expand, strict=strict, passthrough=passthrough)
self._file_helper(func, path=path, column=column, sep=sep, header=header)
@staticmethod
def _file_helper(
func: Callable[[str], str | None],
path: str | Path,
column: int,
sep: str | None = None,
header: bool = True,
) -> None:
path = Path(path).expanduser().resolve()
rows = []
delimiter = sep or "\t"
with path.open() as file_in:
reader = csv.reader(file_in, delimiter=delimiter)
_header = next(reader) if header else None
for row in reader:
row[column] = func(row[column]) or ""
rows.append(row)
with path.open("w") as file_out:
writer = csv.writer(file_out, delimiter=delimiter)
if _header:
writer.writerow(_header)
writer.writerows(rows)
# docstr-coverage:excused `overload`
@overload
def get_record(self, prefix: str, *, strict: Literal[True] = True) -> Record: ...
# docstr-coverage:excused `overload`
@overload
def get_record(self, prefix: str, *, strict: Literal[False] = False) -> Record | None: ...
[docs]
def get_record(self, prefix: str, *, strict: bool = False) -> Record | None:
"""Get the record for the prefix."""
# TODO use self._prefix_to_record
for record in self.records:
if record.prefix == prefix or prefix in record.prefix_synonyms:
return record
if strict:
raise KeyError(f"could not find prefix: {prefix}")
return None
[docs]
def get_subconverter(self, prefixes: Iterable[str]) -> Converter:
r"""Get a converter with a subset of prefixes.
:param prefixes: A list of prefixes to keep from this converter. These can
correspond either to preferred CURIE prefixes or CURIE prefix synonyms.
:returns: A new, slimmed down converter
This functionality is useful for downstream applications like the following:
1. You load a comprehensive extended prefix map, e.g., from the Bioregistry
using :func:`curies.get_bioregistry_converter()`.
2. You load some data that conforms to this prefix map by convention. This is
often the case for semantic mappings stored in the `SSSOM format
<https://github.com/mapping-commons/sssom>`_.
3. You extract the list of prefixes *actually* used within your data
4. You subset the detailed extended prefix map to only include prefixes relevant
for your data
5. You make some kind of output of the subsetted extended prefix map to go with
your data. Effectively, this is a way of reconciling data. This is especially
effective when using the Bioregistry or other comprehensive extended prefix
maps.
Here's a concrete example of doing this (which also includes a bit of data
science) to do this on the SSSOM mappings from the `Disease Ontology
<https://disease-ontology.org/>`_ project.
>>> import curies
>>> import pandas as pd
>>> import itertools as itt
>>> commit = "faca4fc335f9a61902b9c47a1facd52a0d3d2f8b"
>>> url = f"https://raw.githubusercontent.com/mapping-commons/disease-mappings/{commit}/mappings/doid.sssom.tsv"
>>> df = pd.read_csv(url, sep="\t", comment="#")
>>> prefixes = {
... curies.Reference.from_curie(curie).prefix
... for column in ["subject_id", "predicate_id", "object_id"]
... for curie in df[column]
... }
>>> converter = curies.get_bioregistry_converter()
>>> slim_converter = converter.get_subconverter(prefixes)
"""
prefixes = set(prefixes)
records = [
record
for record in self.records
if any(prefix in prefixes for prefix in record._all_prefixes)
]
return Converter(records)
[docs]
def hash_triple(self, triple: Triple, *, negate: bool = False) -> str:
"""Hash a triple using :func:`curies.triples.hash_triple`, implementing https://ts4nfdi.github.io/mapping-sameness-identifier.
:param triple: A subject-predicate-object triple
:param negate: If true, considers the triple as "negative" and postpends a ``~``
to the hash
:returns: A hexadecimal digest of the SHA-256 hash of the space-joined expanded
URI triple
>>> import curies
>>> from curies import Triple, Converter
>>> converter = curies.load_prefix_map(
... {
... "mesh": "http://id.nlm.nih.gov/mesh/",
... "skos": "http://www.w3.org/2004/02/skos/core#",
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... }
... )
>>> triple = Triple(
... subject="mesh:C000089",
... predicate="skos:exactMatch",
... object="CHEBI:28646",
... )
>>> converter.hash_triple(triple)
'36a1f9244ea7641a90987c82f33c25c0c13712ee8f48207b2a0825f8a4e4e26a'
>>> converter.hash_triple(triple, negate=True)
'36a1f9244ea7641a90987c82f33c25c0c13712ee8f48207b2a0825f8a4e4e26a~'
"""
from .triples import hash_triple
return hash_triple(self, triple, negate=negate)
[docs]
def chain(converters: Sequence[Converter], *, case_sensitive: bool = True) -> Converter:
"""Chain several converters.
:param converters: A list or tuple of converters
:param case_sensitive: If false, will not allow case-sensitive duplicates
:returns: A converter that looks up one at a time in the other converters.
:raises ValueError: If there are no converters
Chain is the perfect tool if you want to override parts of an existing extended
prefix map. For example, if you want to use most of the Bioregistry, but you would
like to specify a custom URI prefix (e.g., using Identifiers.org), you can do the
following:
>>> import curies
>>> bioregistry_converter = curies.get_bioregistry_converter()
>>> overrides = curies.load_prefix_map({"pubmed": "https://identifiers.org/pubmed:"})
>>> converter = curies.chain([overrides, bioregistry_converter])
>>> converter.bimap["pubmed"]
'https://identifiers.org/pubmed:'
Similarly, this also works if you want to override a prefix. Keep in mind for this
to work with a simple prefix map, you need to make sure the URI prefix matches in
each converter, otherwise you will get duplicates:
>>> overrides = curies.load_prefix_map({"PMID": "https://www.ncbi.nlm.nih.gov/pubmed/"})
>>> converter = chain([overrides, bioregistry_converter])
>>> converter.bimap["PMID"]
'https://www.ncbi.nlm.nih.gov/pubmed/'
A safer way is to specify your override using an extended prefix map, which can tie
together prefix synonyms and URI prefix synonyms:
>>> import curies
>>> from curies import Converter, chain, get_bioregistry_converter
>>> overrides = curies.load_extended_prefix_map(
... [
... {
... "prefix": "PMID",
... "prefix_synonyms": ["pubmed", "PubMed"],
... "uri_prefix": "https://www.ncbi.nlm.nih.gov/pubmed/",
... "uri_prefix_synonyms": [
... "https://identifiers.org/pubmed:",
... "http://bio2rdf.org/pubmed:",
... ],
... },
... ]
... )
>>> converter = curies.chain([overrides, bioregistry_converter])
>>> converter.bimap["PMID"]
'https://www.ncbi.nlm.nih.gov/pubmed/'
Chain prioritizes based on the order given. Therefore, if two prefix maps having the
same prefix but different URI prefixes are given, the first is retained
>>> c1 = curies.load_prefix_map({"GO": "http://purl.obolibrary.org/obo/GO_"})
>>> c2 = curies.load_prefix_map({"GO": "https://identifiers.org/go:"})
>>> c3 = curies.chain([c1, c2])
>>> c3.prefix_map["GO"]
'http://purl.obolibrary.org/obo/GO_'
"""
converters = list(converters)
if not converters:
raise ValueError
if len(converters) == 1:
return converters[0]
rv = Converter()
for converter in converters:
for record in converter.records:
rv.add_record(record, case_sensitive=case_sensitive, merge=True)
return rv
[docs]
def load_prefix_map(prefix_map: LocationOr[Mapping[str, str]], **kwargs: Any) -> Converter:
"""Get a converter from a simple prefix map.
:param prefix_map: One of the following:
- A mapping whose keys represent CURIE prefixes and values represent URI
prefixes
- A string containing a remote location of a JSON file containg a prefix map
- A string or :class:`pathlib.Path` object corresponding to a local file path to
a JSON file containing a prefix map
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
>>> import curies
>>> converter = curies.load_prefix_map(
... {
... "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
... }
... )
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
"""
return Converter.from_prefix_map(prefix_map, **kwargs)
[docs]
def load_extended_prefix_map(
records: LocationOr[Iterable[Record | dict[str, Any]]], **kwargs: Any
) -> Converter:
"""Get a converter from a list of dictionaries by creating records out of them.
:param records: One of the following:
- An iterable of :class:`curies.Record` objects or dictionaries that will get
converted into record objects that together constitute an extended prefix map
- A string containing a remote location of a JSON file containg an extended
prefix map
- A string or :class:`pathlib.Path` object corresponding to a local file path to
a JSON file containing an extended prefix map
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
An extended prefix map is a list of dictionaries containing four keys:
1. A ``prefix`` string
2. A ``uri_prefix`` string
3. An optional list of strings ``prefix_synonyms``
4. An optional list of strings ``uri_prefix_synonyms``
Across the whole list of dictionaries, there should be uniqueness within the union
of all ``prefix`` and ``prefix_synonyms`` as well as uniqueness within the union of
all ``uri_prefix`` and ``uri_prefix_synonyms``.
>>> import curies
>>> epm = [
... {
... "prefix": "CHEBI",
... "prefix_synonyms": ["chebi", "ChEBI"],
... "uri_prefix": "http://purl.obolibrary.org/obo/CHEBI_",
... "uri_prefix_synonyms": ["https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:"],
... },
... {
... "prefix": "GO",
... "uri_prefix": "http://purl.obolibrary.org/obo/GO_",
... },
... ]
>>> converter = curies.load_extended_prefix_map(epm)
Expand using the preferred/canonical prefix:
>>> converter.expand("CHEBI:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
Expand using a prefix synonym:
>>> converter.expand("chebi:138488")
'http://purl.obolibrary.org/obo/CHEBI_138488'
Compress using the preferred/canonical URI prefix:
>>> converter.compress("http://purl.obolibrary.org/obo/CHEBI_138488")
'CHEBI:138488'
Compressing using a URI prefix synonym:
>>> converter.compress("https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:138488")
'CHEBI:138488'
Example from a remote source:
>>> url = "https://github.com/biopragmatics/bioregistry/raw/main/exports/contexts/bioregistry.epm.json"
>>> converter = curies.load_extended_prefix_map(url)
"""
return Converter.from_extended_prefix_map(records, **kwargs)
[docs]
def load_jsonld_context(data: LocationOr[dict[str, Any]], **kwargs: Any) -> Converter:
"""Get a converter from a JSON-LD object, which contains a prefix map in its ``@context`` key.
:param data: A JSON-LD object
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
Example from a remote context file:
>>> base = "https://raw.githubusercontent.com"
>>> url = f"{base}/biopragmatics/bioregistry/main/exports/contexts/semweb.context.jsonld"
>>> converter = Converter.from_jsonld(url)
>>> "rdf" in converter.prefix_map
"""
return Converter.from_jsonld(data, **kwargs)
[docs]
def load_shacl(data: LocationOr[rdflib.Graph], **kwargs: Any) -> Converter:
"""Get a converter from a JSON-LD object, which contains a prefix map in its ``@context`` key.
:param data: A path to an RDF file or a RDFlib graph
:param kwargs: Keyword arguments to pass to :meth:`curies.Converter.__init__`
:returns: A converter
"""
return Converter.from_shacl(data, **kwargs)
[docs]
def write_extended_prefix_map(converter: Converter, path: str | Path) -> None:
"""Write an extended prefix map as JSON to a file."""
path = _ensure_path(path)
path.write_text(
json.dumps(
[_record_to_dict(record) for record in converter.records],
indent=4,
sort_keys=True,
ensure_ascii=False,
)
)
def _record_to_dict(record: Record) -> Mapping[str, str | list[str]]:
"""Convert a record to a dict."""
rv: dict[str, str | list[str]] = {
"prefix": record.prefix,
"uri_prefix": record.uri_prefix,
}
if record.prefix_synonyms:
rv["prefix_synonyms"] = sorted(record.prefix_synonyms)
if record.uri_prefix_synonyms:
rv["uri_prefix_synonyms"] = sorted(record.uri_prefix_synonyms)
if record.pattern:
rv["pattern"] = record.pattern
return rv
def _ensure_path(path: str | Path) -> Path:
if isinstance(path, str):
path = Path(path).resolve()
return path
def _get_jsonld_context(
converter: Converter, *, expand: bool = False, include_synonyms: bool = False
) -> dict[str, Any]:
"""Get a JSON-LD context based on the converter."""
context = {}
for record in converter.records:
term = _get_expanded_term(record, expand=expand)
context[record.prefix] = term
if include_synonyms:
for prefix_synonym in record.prefix_synonyms:
context[prefix_synonym] = term
return {"@context": context}
[docs]
def write_jsonld_context(
converter: Converter,
path: str | Path,
*,
include_synonyms: bool = False,
expand: bool = False,
) -> None:
"""Write the converter's bijective map as a JSON-LD context to a file.
:param converter: The converter to export
:param path: The path to a file to write to
:param include_synonyms: If true, includes CURIE prefix synonyms. URI prefix
synonyms are not output.
:param expand: If False, output a dictionary-like ``@context`` element. If True, use
``@prefix`` and ``@id`` as keys for the CURIE prefix and URI prefix,
respectively, to maximize compatibility.
The following example shows writing a JSON-LD context:
.. code-block:: python
import curies
converter = curies.load_prefix_map(
{
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
)
curies.write_jsonld_context(converter, "example_context.json")
.. code-block:: json
{
"@context": {
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_"
}
}
Because some implementations of JSON-LD do not like URI prefixes that end with an
underscore ``_``, we can use the ``expand`` keyword to turn on more verbose JSON-LD
context output that contains explicit ``@prefix`` and ``@id`` annotations
.. code-block:: python
import curies
converter = curies.load_prefix_map(
{
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
)
curies.write_jsonld_context(converter, "example_context.json", expand=True)
.. code-block:: json
{
"@context": {
"CHEBI": {
"@id": "http://purl.obolibrary.org/obo/CHEBI_",
"@prefix": true
}
}
}
"""
path = _ensure_path(path)
obj = _get_jsonld_context(converter, include_synonyms=include_synonyms, expand=expand)
with path.open("w") as file:
json.dump(obj, file, indent=4, sort_keys=True)
def _get_expanded_term(record: Record, *, expand: bool) -> str | dict[str, Any]:
if not expand:
return record.uri_prefix
# Use expanded term definition described in https://www.w3.org/TR/json-ld11/#expanded-term-definition
rv = {"@prefix": True, "@id": record.uri_prefix}
# TODO add an @context inside here to somehow capture the pattern, if available
# if record.pattern:
# rv["@context"] = {
# "sh": "http://www.w3.org/ns/shacl#",
# "sh:pattern": record.pattern,
# }
return rv
[docs]
def write_shacl(
converter: Converter,
path: str | Path,
*,
include_synonyms: bool = False,
) -> None:
"""Write the converter's bijective map as SHACL in turtle RDF to a file.
:param converter: The converter to export
:param path: The path to a file to write to
:param include_synonyms: If true, includes CURIE prefix synonyms. URI prefix
synonyms are not output.
.. seealso::
https://www.w3.org/TR/shacl/#sparql-prefixes
.. code-block:: python
import curies
converter = curies.load_prefix_map(
{
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
)
curies.write_shacl(converter, "example_shacl.ttl")
.. code-block::
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
[
sh:declare
[ sh:prefix "CHEBI" ; sh:namespace "http://purl.obolibrary.org/obo/CHEBI_"^^xsd:anyURI ]
] .
"""
text = dedent(
"""\
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
[
sh:declare
{entries}
] .
"""
)
path = _ensure_path(path)
lines = []
for record in converter.records:
lines.append(_get_shacl_line(record.prefix, record.uri_prefix, pattern=record.pattern))
if include_synonyms:
for prefix_synonym in record.prefix_synonyms:
lines.append(
_get_shacl_line(prefix_synonym, record.uri_prefix, pattern=record.pattern)
)
path.write_text(text.format(entries=",\n".join(lines)))
[docs]
def write_tsv(
converter: Converter, path: str | Path, *, header: tuple[str, str] = ("prefix", "base")
) -> None:
"""Write a simple prefix map CSV file.
:param converter: The converter to export
:param path: The path to a file to write to
:param header: A 2-tuple of strings representing the header used in the file, where
the first element is the label for CURIE prefixes and the second element is the
label for URI prefixes
.. code-block:: python
import curies
converter = curies.load_prefix_map(
{
"CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
}
)
curies.write_tsv(converter, "example_context.tsv")
.. code-block::
prefix base
CHEBI http://purl.obolibrary.org/obo/CHEBI_
"""
import csv
path = _ensure_path(path)
with path.open("w") as csvfile:
writer = csv.writer(csvfile, delimiter="\t")
writer.writerow(header)
for record in converter.records:
writer.writerow((record.prefix, record.uri_prefix))
def _get_shacl_line(prefix: str, uri_prefix: str, pattern: str | None = None) -> str:
line = f' [ sh:prefix "{prefix}" ; sh:namespace "{uri_prefix}"^^xsd:anyURI '
if pattern:
pattern = pattern.replace("\\", "\\\\")
line += f'; sh:pattern "{pattern}"'
return line + " ]"
[docs]
def upgrade_prefix_map(prefix_map: Mapping[str, str]) -> list[Record]:
"""Convert a (potentially problematic) prefix map (i.e., not bijective) into a list of records.
A prefix map is bijective if it has no duplicate CURIE prefixes (i.e., keys in a
dictionary) and no duplicate URI prefixes (i.e., values in a dictionary). Because of
the way that dictionaries work in Python, we are always guaranteed that there are no
duplicate keys.
However, it is both possible and frequent to have duplicate values. This happens
because many semantic spaces have multiple synonymous CURIE prefixes. For example,
the `OBO in OWL <https://bioregistry.io/oboinowl>`_ vocabulary has two common,
interchangable prefixes: ``oio`` and ``oboInOwl`` (and the case variant
``oboinowl``). Therefore, a prefix map might contain the following parts that make
it non-bijective:
.. code-block:: json
{
"oio": "http://www.geneontology.org/formats/oboInOwl#",
"oboInOwl": "http://www.geneontology.org/formats/oboInOwl#"
}
This is bad because this prefix map can't be used to determinstically compress a
URI. For example, should ``http://www.geneontology.org/formats/oboInOwl#hasDbXref``
be compressed to ``oio:hasDbXref`` or ``oboInOwl:hasDbXref``? Neither is necessarily
incorrect, but the issue here is that there is not an explicit choice by the data
modeler, meaning that data compressed into CURIEs with this non-bijective map might
not be readily integrable with other datasets.
The best solution to this situation is not more code, but rather for the data
modeler to address the issue upstream in the following steps:
1. Choose the which of prefix synonyms is going to be the primary prefix. If you're
not sure, the `Bioregistry <https://bioregistry.io/>`_ is a comprehensive
registry of prefixes and their syonyms applicable in the semantic web and the
natural sciences. It gives a good suggestion of what the best prefix is. In the
OBO in OWL case, it suggests ``oboInOwl``.
2. Update all related data artifacts to only use that preferred prefix
3. Either 1) remove the other synonyms (in this example, ``oio``) from the prefix
map *or* 2) transition to using :ref:`epms`, a more modern data structure for
supporting URI and CURIE interconversion.
The first part of step 3 in this solution highlights one of the key shortcomings of
prefix maps themselves - they can't keep track of synonyms, which are often useful
in data integration, especially when a single prefix map is defined on the level of
a project or community. The extended prefix map is a simple data structure proposed
to address this.
- - -
This function is for people who are not in the position to make the sustainable fix,
and want to automate the assignment of which is the preferred prefix. It uses a
deterministic algorithm to choose from two or more CURIE prefixes that have the same
URI prefix and generate an extended prefix map in which they have bene collapsed
into a single record. More specitically, the algorithm is based on a case-sensitive
lexical sort of the prefixes. The first in the sort order becomes the primary prefix
and the others become synonyms in the resulting record.
:param prefix_map: A mapping whose keys represent CURIE prefixes and values
represent URI prefixes
:returns: A list of :class:`curies.Record` objects that together constitute an
extended prefix map
>>> from curies import Converter, upgrade_prefix_map
>>> pm = {"a": "https://example.com/a/", "b": "https://example.com/a/"}
>>> records = upgrade_prefix_map(pm)
>>> converter = Converter(records)
>>> converter.expand("a:1")
'https://example.com/a/1'
>>> converter.expand("b:1")
'https://example.com/a/1'
>>> converter.compress("https://example.com/a/1")
'a:1'
.. note::
Thanks to `Joe Flack <https://github.com/joeflack4>`_ for proposing this
algorithm `in this discussion
<https://github.com/mapping-commons/sssom-py/pull/485#discussion_r1451812733>`_.
"""
uri_prefix_to_curie_synonyms = defaultdict(list)
for curie_prefix, uri_prefix in prefix_map.items():
uri_prefix_to_curie_synonyms[uri_prefix].append(curie_prefix)
priority_prefix_map = {
uri_prefix: sorted(curie_prefixes)
for uri_prefix, curie_prefixes in uri_prefix_to_curie_synonyms.items()
}
return [
Record(prefix=prefix, prefix_synonyms=prefix_synonyms, uri_prefix=uri_prefix)
for uri_prefix, (prefix, *prefix_synonyms) in sorted(priority_prefix_map.items())
]
def _converter_from_validation_info(info: core_schema.ValidationInfo) -> Converter | None:
context = info.context or {}
if isinstance(context, Converter):
return context
elif isinstance(context, dict):
return context.get("converter")
else:
raise TypeError
class Trie(UserDict[str, Record]):
"""A partial implementation of a string trie."""
root: TrieNode
def __init__(self, initial_dict: dict[str, Record] | None = None) -> None:
"""Create a new string trie."""
super().__init__()
self.root = TrieNode()
if initial_dict:
for key, value in initial_dict.items():
self[key] = value
def __setitem__(self, key: str, item: Record) -> None:
self.root._ensure_node(key).value = item
def parse_uri(self, uri: str) -> ReferenceTuple | None:
"""Parse a URI into a prefix/identifier pair based prefixes in the trie."""
node: TrieNode = self.root
record: Record | None = self.root.value
max_non_null_index = -1
for i, character in enumerate(uri):
new_node = node.children.get(character)
if new_node is None:
break
if new_node.value is not None:
record = new_node.value
max_non_null_index = i
node = new_node
if record is None:
return None
identifier = uri[max_non_null_index + 1 :]
return ReferenceTuple(record.prefix, identifier)
def __contains__(self, key: Any) -> bool:
node = self.root._find_node(key)
return node is not None and node.value is not None
class TrieNode:
"""Trie node class."""
__slots__ = ("children", "value")
value: Record | None
children: dict[str, TrieNode]
def __init__(self, value: Record | None = None) -> None:
"""Initialize the node."""
self.value = value # this needs to be mutable, which is why this isn't a named tuple
self.children = {}
def _ensure_node(self, key: str) -> TrieNode:
node = self
for character in key:
next_node: TrieNode | None = node.children.get(character)
if next_node is None:
node = node.children.setdefault(character, TrieNode())
else:
node = next_node
return node
def _find_node(self, key: Sequence[str]) -> TrieNode | None:
node = self
for character in key:
next_node = node.children.get(character)
if next_node is None:
return None
node = next_node
return node