# Source code for curies.w3c

"""Validation based on W3C standards.

The World Wide Web Consortium (W3C) provides standards for
`prefixes <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_ (i.e., ``NCName``),
`CURIEs <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_, and
`IRIs <https://www.ietf.org/rfc/rfc3987.txt>`_, but they are
highly obfuscated and spread across many documents.
This module attempts to operationalize these standards, along with best practices
of documentation and testing.

.. seealso::

    Some other work towards operationalizing these standards:

    - https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py
    - https://github.com/dgerber/rfc3987/blob/gh-archived/rfc3987.py

"""

import re

# Names exported by a star-import of this module. The compiled ``*_RE`` objects
# and the ``_is_w3c_luid`` helper are deliberately not listed here.
__all__ = [
    "CURIE_PATTERN",
    "LOCAL_UNIQUE_IDENTIFIER_PATTERN",
    "NCNAME_PATTERN",
    "is_w3c_curie",
    "is_w3c_prefix",
]

NCNAME_PATTERN = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
"""A regex for prefixes, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.

.. code-block::

    prefix     := NCName
    NCName     := (Letter | '_') (NCNameChar)*
    NCNameChar := Letter | Digit | '.' | '-' | '_'

This string is not anchored, so that it can be embedded in larger patterns.
"""

# ``\Z`` (rather than ``$``) anchors at the true end of the string: ``$`` also
# matches just before a trailing newline, which would let "GO\n" pass as a prefix.
NCNAME_RE = re.compile(rf"^{NCNAME_PATTERN}\Z")

LOCAL_UNIQUE_IDENTIFIER_PATTERN = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
"""A regex for local unique identifiers in CURIEs, based on https://www.ietf.org/rfc/rfc3987.txt

This pattern was adapted from https://gist.github.com/niklasl/2506955, which sort of
implements RFC3987.

When applied to a whole string, the pattern accepts the empty string and any
whitespace-free string that does not begin with ``//``. This string is not
anchored, so that it can be embedded in larger patterns.
"""

# Anchored at both ends: the bare pattern has an alternative that can match the
# empty string, so an unanchored ``.match()`` would succeed on every input.
LOCAL_UNIQUE_IDENTIFIER_RE = re.compile(rf"^{LOCAL_UNIQUE_IDENTIFIER_PATTERN}\Z")

# The optional prefix is wrapped in a non-capturing group so that the ``?``
# applies to the whole NCName. Appended directly after the NCName pattern, the
# ``?`` would only turn that pattern's trailing ``*`` into a lazy ``*?``.
# Capturing groups are unchanged: group 1 is "prefix + colon", group 2 the reference.
_CURIE_BODY = rf"((?:{NCNAME_PATTERN})?:)?{LOCAL_UNIQUE_IDENTIFIER_PATTERN}"

CURIE_PATTERN = rf"^{_CURIE_BODY}$"
"""A regex for CURIEs, based on https://www.w3.org/TR/2010/NOTE-curie-20101216.

.. code-block::

    curie       :=   [ [ prefix ] ':' ] reference
    prefix      :=   NCName
    reference   :=   irelative-ref (as defined in `IRI <https://www.ietf.org/rfc/rfc3987.txt>`_)

`irelative-ref` is defined/documented in :data:`curies.w3c.LOCAL_UNIQUE_IDENTIFIER_PATTERN`.

.. note::

    Because the reference part may itself contain a colon, a string such as
    ``4cdn:test`` still matches this pattern as a prefix-less reference.
    Use :func:`curies.w3c.is_w3c_curie` when the prefix must be checked strictly.
"""

# Same expression as CURIE_PATTERN, but terminated with ``\Z`` so that a string
# with a trailing newline is not accepted by the compiled regex.
CURIE_RE = re.compile(rf"^{_CURIE_BODY}\Z")


def is_w3c_prefix(prefix: str) -> bool:
    """Return if the string is a valid prefix under the W3C specification.

    :param prefix: A string
    :return: If the string is a valid prefix under the W3C specification.

    Validation is implemented as a regular expression match against
    :data:`curies.w3c.NCNAME_PATTERN`, as defined by the W3C
    `here <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_.
    The entire string has to match, so a prefix followed by a trailing
    newline is not valid.

    Examples
    --------
    Strings containing numbers, letters, and underscores are valid prefixes.

    >>> is_w3c_prefix("GO")
    True

    The W3C specification states that the prefix '_' is reserved for use by
    languages that support RDF. For this reason, the prefix '_' SHOULD be
    avoided by authors.

    >>> is_w3c_prefix("_")
    True

    Strings starting with a number are not valid prefixes.

    >>> is_w3c_prefix("3dmet")
    False

    Strings containing a colon or other characters are invalid

    >>> is_w3c_prefix("GO:")
    False

    The empty string is not a valid prefix.

    >>> is_w3c_prefix("")
    False
    """
    # fullmatch() instead of match(): a pattern ending in ``$`` also matches
    # right before a trailing newline, which would wrongly accept "GO\n".
    return NCNAME_RE.fullmatch(prefix) is not None
def _is_w3c_luid(luid: str) -> bool:
    """Return if the string is a valid local unique identifier.

    :param luid: A string
    :return: True if the entire string matches
        :data:`curies.w3c.LOCAL_UNIQUE_IDENTIFIER_PATTERN`. This means the empty
        string and any whitespace-free string that does not start with ``//``
        are accepted; everything else is rejected.
    """
    # fullmatch() is required here: the pattern has an alternative that can match
    # the empty string, so a prefix-only match() would succeed for every input
    # (for example "a b"), and this check could never fail.
    return LOCAL_UNIQUE_IDENTIFIER_RE.fullmatch(luid) is not None
def is_w3c_curie(curie: str) -> bool:
    """Check whether a string is a CURIE according to the W3C specification.

    :param curie: The string to test.
    :return: ``True`` when the string passes the W3C CURIE checks, ``False`` otherwise.

    .. warning::

        This is not the same check as :meth:`curies.Converter.is_curie`, which tests
        whether a CURIE is valid under the extended prefix map held by a converter.
        In addition, the base converter is by default slightly more lenient than the
        W3C specification so that CURIEs such as the SMILES string
        ``smiles:CC(=O)NC([H])(C)C(=O)O`` can be used. Those are useful, but not
        technically valid because they contain brackets.

    Examples
    --------
    When the prefix is left out, the host language decides which default prefix applies.

    >>> is_w3c_curie(":test")
    True

    The specification says the following about an underscore prefix:

        The CURIE prefix '_' is reserved for use by languages that support RDF.
        For this reason, the prefix '_' SHOULD be avoided by authors.

    >>> is_w3c_curie("_:test")
    True

    A prefix has to begin with a letter or an underscore, so one that begins
    with a number makes the CURIE invalid.

    >>> is_w3c_curie("4cdn:test")
    False

    The empty string is explicitly not a valid CURIE.

    >>> is_w3c_curie("")
    False
    """
    # square brackets are never allowed anywhere in the string
    if any(bracket in curie for bracket in ("[", "]")):
        return False

    # an empty (or whitespace-only) string is invalid (for now)
    if not curie.strip():
        return False

    prefix, colon, identifier = curie.partition(":")

    # without a colon there is no prefix part at all, so the whole string
    # is the reference and is checked against the LUID pattern
    if not colon:
        return _is_w3c_luid(curie)

    # an empty prefix is fine in a CURIE (even though an NCName itself may not
    # be empty); a non-empty prefix has to be a valid NCName
    if prefix and not is_w3c_prefix(prefix):
        return False

    return _is_w3c_luid(identifier)