Classes
Citator
CiteURL's main feature: a collection of schemas, and the tools to apply them to text, to find all kinds of citations in a text.
Attributes:
Name | Type | Description |
---|---|---|
schemas |
list |
A list of schema objects that this citator will try to match against. |
generic_id |
str |
A common regex the citator will append to each schema when it is loaded, to recognize a simple citation to the most-recently cited source. |
__init__(self, yaml_paths=[], defaults=True, generic_id='\\b(Ib)?[Ii]d\\.(<\\/(i|em|u)>)?')
special
Calls load_yaml one or more times, to load the citator with schemas.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
defaults |
bool |
Whether to load CiteURL's default schemas |
True |
yaml_paths |
list |
paths to additional YAML files with schemas that should be loaded to supplement or replace the defaults. |
[] |
generic_id |
str |
a common regex to append to all schemas, to recognize a simple citation to the most-recently cited source. Detects "id." or "ibid." by default. To disable, set to None. |
'\\b(Ib)?[Ii]d\\.(<\\/(i|em|u)>)?' |
Source code in citeurl/__init__.py
def __init__(
    self,
    yaml_paths: list[str]=[],
    defaults: bool=True,
    generic_id: str=GENERIC_ID
):
    """
    Create a citator and populate it with schemas via load_yaml.

    Arguments:
        defaults: Whether to load CiteURL's default schemas. When
            true, they are loaded before anything in yaml_paths.
        yaml_paths: paths to additional YAML files with schemas that
            should be loaded, in the order given, to supplement or
            replace the defaults.
        generic_id: a common regex to append to all schemas, to
            recognize a simple citation to the most-recently cited
            source. Detects "id." or "ibid." by default. To
            disable, set to None.
    """
    # Both attributes must exist before the first load_yaml call:
    # load_yaml reads generic_id and appends to schemas.
    self.schemas: list = []
    self.generic_id: str = generic_id

    if defaults:
        self.load_yaml(DEFAULT_YAML_PATH)
    for extra_path in yaml_paths:
        self.load_yaml(extra_path)
insert_links(self, text, attrs={'class': 'citation'}, url_optional=False, link_detailed_ids=True, link_plain_ids=False, id_break_regex='L\\. ?Rev\\.|J\\. ?Law|\\. ?([Cc]ode|[Cc]onst)', id_break_indices=[])
Convenience method to return a copy of the given text, with citation hyperlinks inserted.
If you plan to do more than just insert links, it's better to get a list of citations with list_citations first, then insert those links with the module-wide insert_links function.
Source code in citeurl/__init__.py
def insert_links(
    self,
    text: str,
    attrs: dict={'class': 'citation'},
    url_optional: bool=False,
    link_detailed_ids: bool=True,
    link_plain_ids: bool=False,
    id_break_regex: str=DEFAULT_ID_BREAKS,
    id_break_indices: list=[]) -> str:
    """
    Convenience method to return a copy of the given text, with
    citation hyperlinks inserted.

    If you plan to do more than just insert links, it's better to
    get a list of citations with list_citations first, then insert
    those links with the module-wide insert_links function.

    Arguments:
        text: The text to scan for citations and add links to.
        attrs: HTML attributes passed on to the module-wide
            insert_links function.
        url_optional: Passed on to the module-wide insert_links
            function.
        link_detailed_ids: Passed on to the module-wide insert_links
            function.
        link_plain_ids: Passed on to the module-wide insert_links
            function.
        id_break_regex: Passed on to list_citations.
        id_break_indices: Passed on to list_citations.
    Returns:
        Whatever the module-wide insert_links function returns for
        the citations found in the text.
    """
    # Hand copies of the mutable arguments to the callees, so that
    # neither this method's shared default objects nor a caller's own
    # list/dict can be modified as a side effect of this call.
    citations = self.list_citations(
        text,
        id_break_regex=id_break_regex,
        id_break_indices=list(id_break_indices),
    )
    # Note: inside a method body this name resolves to the
    # module-level insert_links function, not to this method.
    return insert_links(
        citations,
        text,
        attrs=dict(attrs),
        link_detailed_ids=link_detailed_ids,
        link_plain_ids=link_plain_ids,
        url_optional=url_optional
    )
list_authorities(self, text)
Convenience method to list all the authorities cited in a given text.
If you plan to do more than list authorities, it's better to get a list of citations with list_citations, then list the unique authorities with the module-wide list_authorities function.
Source code in citeurl/__init__.py
def list_authorities(self, text: str) -> list:
    """
    Shortcut that lists every authority cited in the given text.

    It finds the citations with list_citations and hands them to the
    module-wide list_authorities function. If you need the citations
    for anything else as well, call those two steps yourself so the
    text is only scanned once.
    """
    # Inside a method body this name resolves to the module-level
    # list_authorities function, not to this method.
    return list_authorities(self.list_citations(text))
list_citations(self, text, id_forms=True, id_break_regex='L\\. ?Rev\\.|J\\. ?Law|\\. ?([Cc]ode|[Cc]onst)', id_break_indices=[])
Scan a text and return a list of all citations in it, in order of appearance.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
id_forms |
bool |
Whether to detect citations like "Id." and "Id. at 30." |
True |
id_break_regex |
str |
A pattern to look for in the text. Any occurrence of the pattern will interrupt a chain of "id." citations as if it were another citation. |
'L\\. ?Rev\\.|J\\. ?Law|\\. ?([Cc]ode|[Cc]onst)' |
id_break_indices |
list |
A list of positions in the text where "id." citations should be interrupted |
[] |
Returns:
Type | Description |
---|---|
list |
A list of citation objects, in order of appearance in the text. |
Source code in citeurl/__init__.py
def list_citations(self,
    text: str,
    id_forms: bool=True,
    id_break_regex: str=DEFAULT_ID_BREAKS,
    id_break_indices: list=[],
) -> list:
    """
    Scan a text and return a list of all citations in it, in
    order of appearance.

    Arguments:
        text: The text to scan.
        id_forms: Whether to detect citations like
            "Id." and "Id. at 30."
        id_break_regex: A pattern to look for in the text. Any
            occurrence of the pattern will interrupt a chain of
            "id." citations as if it were another citation.
        id_break_indices: A list of positions in the text
            where "id." citations should be interrupted. The list
            is only read, never modified.
    Returns:
        A list of citation objects, in order of appearance in the
        text.
    """
    # First, get full citations from every schema:
    citations = []
    for schema in self.schemas:
        citations += schema.get_citations(text)

    # Then, add the shortforms derived from those full citations
    shortform_cites = []
    for citation in citations:
        shortform_cites += citation._get_shortform_citations(text)
    citations = _sort_and_remove_overlaps(citations + shortform_cites)

    if not id_forms:  # no need to proceed
        return citations

    # Determine where to break chains of id. citations. The break
    # points are collected in a private set rather than appended to
    # the argument, so that neither the shared default list nor a
    # caller's list accumulates positions from this (or an earlier)
    # call.
    break_points = set(id_break_indices or ())
    # break at the start of every full or short citation
    break_points.update(citation.span[0] for citation in citations)
    if id_break_regex:  # also break at specified regexes
        break_points.update(
            match.start()
            for match in re.compile(id_break_regex).finditer(text)
        )
    break_points = sorted(break_points)

    # loop through all citations to find their id citations
    id_citations = []
    for citation in citations:
        # find the first break point after the end of this citation
        for position, index in enumerate(break_points):
            if index > citation.span[1]:
                end_point = index
                # drop the break points already passed, so the scan
                # for the next citation starts here
                break_points = break_points[position:]
                break
        else:
            # no break point after this citation
            end_point = None
            break_points = break_points[-1:]
        # get each citation's id citations until the break point
        id_citations += citation._get_id_citations(
            text, end_point=end_point
        )
    return _sort_and_remove_overlaps(citations + id_citations)
load_yaml(self, path, use_generic_id=True)
Import schemas from the specified YAML file into the citator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
str |
path to the YAML file to load |
required |
use_generic_id |
bool |
Whether to append the citator's generic_id citation format to the loaded schemas. |
True |
Source code in citeurl/__init__.py
def load_yaml(self, path: str, use_generic_id: bool=True):
    """
    Import schemas from the specified YAML file into the citator.

    Each top-level item of the YAML document becomes one Schema,
    which is appended to self.schemas.

    Arguments:
        path: path to the YAML file to load
        use_generic_id: Whether to append the citator's generic_id
            citation format to the loaded schemas.
    """
    yaml_text = Path(path).read_text()
    # An empty YAML document parses to None; treat it as a file that
    # simply contains no schemas.
    yaml_dict = safe_load(yaml_text) or {}

    # read each item in the YAML into a new schema
    for schema_name, schema_data in yaml_dict.items():
        # if regex is specified in singular form, convert it to a
        # list with one item, for sake of consistency with multiple-
        # regex schemas.
        for key in ('regex', 'broadRegex'):
            if key in schema_data:
                schema_data[key + 'es'] = [schema_data.pop(key)]

        # unrelated: if an individual regex is given as a list of
        # strings (convenient for reusing YAML anchors), concatenate
        # it to one string.
        for key in ('regexes', 'broadRegexes', 'idForms', 'shortForms'):
            if key not in schema_data:
                continue
            for i, regex in enumerate(schema_data[key]):
                if isinstance(regex, list):
                    schema_data[key][i] = ''.join(regex)

        # make the schema and add it to the citator, adding the
        # generic id-form citation if applicable
        new_schema = Schema(name=schema_name, **schema_data)
        if use_generic_id and self.generic_id:
            # Build a new list instead of appending in place: the
            # schema's idForms may be a list object shared with other
            # schemas (e.g. a shared default argument), and an
            # in-place append would then leak into all of them.
            new_schema.idForms = [
                *(new_schema.idForms or []),
                self.generic_id,
            ]
        self.schemas.append(new_schema)
lookup(self, query, broad=True)
Convenience method to get the first citation from the first matching schema, or None.
This is meant for cases where false positives are not an issue, so it uses broadRegex and case-insensitive matching by default.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
broad |
bool |
Whether to use case-insensitive regex matching and, if available, each schema's broadRegex. |
True |
query |
str |
The text to scan for a citation |
required |
Returns:
Type | Description |
---|---|
Citation |
A single citation object, or None |
Source code in citeurl/__init__.py
def lookup(self, query: str, broad: bool=True) -> Citation:
    """
    Return the first citation found by the first schema that finds
    one in the query, or None if no schema finds any.

    This is meant for cases where false positives are not an issue,
    so it uses broadRegex and case-insensitive matching by default.

    Arguments:
        broad: Whether to use case-insensitive regex matching and,
            if available, each schema's broadRegex.
        query: The text to scan for a citation
    Returns:
        A single citation object, or None
    """
    # Lazily ask each schema, in order, for its first citation; the
    # schemas after the first hit are never consulted.
    first_hits = (
        next(schema.get_citations(query, broad=broad), None)
        for schema in self.schemas
    )
    return next((hit for hit in first_hits if hit), None)
Schema
A pattern to recognize a single kind of citation and generate URLs from matches.
In most cases, it is more useful to use the Citator class to load schemas from YAML files and apply them en masse, rather than use the Schema class directly.
__init__(self, name, regexes, URL=None, broadRegexes=None, idForms=[], shortForms=[], defaults={}, operations=[], parent_citation=None, is_id=False)
special
Schema constructor. Primarily meant for use in loading YAML files and dynamically generating shortform schemas, but can be run directly if needed.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name |
str |
The name of this schema |
required |
regexes |
list |
A list of one or more regexes that this schema will match. Each regex should be provided as a string, and should include one or more named capture groups (i.e. "tokens") that will be used to generate the URL. |
required |
URL |
|
The template by which to generate URLs from citation matches. Placeholders in {curly braces} will be replaced by the value of the token with the same name, after that token has been processed by the schema. The URL template can be provided either as a string or as a list of strings to concatenate. In the latter case, if a list item contains a placeholder for which no value is set, the list item will be skipped. |
None |
defaults |
dict |
A dictionary of tokens and corresponding default values which should be set if the token's value is not otherwise set by a regex capture group. |
{} |
operations |
list |
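A list of operations to perform on the tokens,
in sequence, to transform them from `captured_tokens` to `processed_tokens`, the tokens that are used for URL generation. Each operation must specify a `token` for its input. It will also be used as the output of the operation, unless `output` is specified. If the specified input token is not set, the operation will be skipped. The supported operations are `case`, `sub`, `lookup`, `optionalLookup`, `lpad`, and `numberFormat`. The `case` operation outputs the input token, set to the specified capitalization, either 'upper', 'lower', or 'title'. The `sub` operation performs a regex substitution. It requires a list of two strings; the first is the regex to match in the input token, and the second is the text to replace each match with. The `lookup` operation tries to match the input against a series of dictionary keys (using case-insensitive regex), and set the output to the corresponding value. If the dictionary does not contain a matching key, the entire schema match will retroactively fail. `optionalLookup` works the same way, except that failed lookups will not cause the schema to fail, and will simply leave tokens unmodified. The `numberFormat` operation assumes that the input token is a number, either in digit form or Roman numerals. It outputs the same number, converted to the specified number format, either 'roman' or 'digit'. |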
[] |
shortForms |
list |
A list of regex templates to generate regexes that recognize short-forms of a parent long-form citation that has appeared earlier in the text. Any named section in {curly braces} will be replaced by
the value of the corresponding token from the parent
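citation. So if a schema detects a longform citation to
"372 U.S. 335" and has a shortform `{volume} {reporter} at (?P<pincite>\d+)`, it will generate the following regex: `372 U.S. at (?P<pincite>\d+)`. |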
[] |
idForms |
list |
Think "id.", not ID. Identical to shortForms, except that these regexes will only match until the next different citation or other interruption. |
[] |
parent_citation |
|
The citation, if any, that this schema was created as a shortform of. This argument is for dynamically-generated schemas, and there is usually no need to use it manually. |
None |
is_id |
|
Whether this schema represents an immediate repeat shortform citation like "id." or "id. at 30". Really only relevant for procedurally-generated schemas. |
False |
Source code in citeurl/__init__.py
def __init__(self,
    name: str,
    regexes: list[str],
    URL=None,
    broadRegexes: list[str]=None,
    idForms: list[str]=[],
    shortForms: list[str]=[],
    defaults: dict={},
    operations: list[dict]=[],
    parent_citation=None,
    is_id=False
):
    r"""
    Schema constructor. Primarily meant for use in loading YAML
    files and dynamically generating shortform schemas, but can be
    run directly if needed.

    Arguments:
        name: The name of this schema
        regexes: A list of one or more regexes that this schema will
            match. Each regex should be provided as a string, and
            should include one or more named capture groups
            (i.e. "tokens") that will be used to generate the URL.
        URL: The template by which to generate URLs from citation
            matches. Placeholders in {curly braces} will be replaced
            by the value of the token with the same name, after that
            token has been processed by the schema.
            The URL template can be provided either as a string
            or as a list of strings to concatenate. In the latter
            case, if a list item contains a placeholder for which
            no value is set, the list item will be skipped.
            If no URL is given, the schema gets no URL attribute
            at all.
        broadRegexes: Optional list of supplemental regexes, stored
            as given.
        defaults: A dictionary of tokens and corresponding default
            values which should be set if the token's value is not
            otherwise set by a regex capture group.
        operations: A list of operations to perform on the tokens,
            in sequence, to transform them from `captured_tokens` to
            `processed_tokens`, the tokens that are used for URL
            generation.
            Each operation must specify a `token` for its input. It
            will also be used as the output of the operation, unless
            `output` is specified. If the specified input token is
            not set, the operation will be skipped.
            The supported operations are `case`, `sub`, `lookup`,
            `optionalLookup`, `lpad`, and `numberFormat`.
            The `case` operation outputs the input token, set to the
            specified capitalization, either 'upper', 'lower', or
            'title'.
            The `sub` operation performs a regex substitution. It
            requires a list of two strings; the first is the regex
            to match in the input token, and the second is the text
            to replace each match with.
            The `lookup` operation tries to match the input against
            a series of dictionary keys (using case-insensitive
            regex), and set the output to the corresponding value.
            If the dictionary does not contain a matching key, the
            entire schema match will retroactively fail.
            `optionalLookup` works the same way, except that failed
            lookups will not cause the schema to fail, and will
            simply leave tokens unmodified.
            The `numberFormat` operation assumes that the input
            token is a number, either in digit form or Roman
            numerals. It outputs the same number, converted to the
            specified number format, either 'roman' or 'digit'.
        shortForms: A list of regex templates to generate regexes
            that recognize short-forms of a parent long-form
            citation that has appeared earlier in the text.
            Any named section in {curly braces} will be replaced by
            the value of the corresponding token from the parent
            citation. So if a schema detects a longform citation to
            "372 U.S. 335" and has a shortform `{volume} {reporter}
            at (?P<pincite>\d+)`, it will generate the following
            regex: `372 U.S. at (?P<pincite>\d+)`.
        idForms: Think "id.", not ID. Identical to shortForms,
            except that these regexes will only match until the
            next different citation or other interruption.
        parent_citation: The citation, if any, that this schema
            was created as a shortform of. This argument is
            for dynamically-generated schemas, and there is usually
            no need to use it manually.
        is_id: Whether this schema represents an immediate repeat
            shortform citation like "id." or "id. at 30". Really
            only relevant for procedurally-generated schemas.
    """
    # Basic values
    self.name: str = name
    self.regexes: list = regexes
    self.is_id: bool = is_id
    # The URL template is always stored as a list of strings. When
    # no URL is given, the attribute is deliberately left unset.
    if URL:
        self.URL: list = URL if isinstance(URL, list) else [URL]

    # Supplemental regexes
    self.broadRegexes: list = broadRegexes
    # The defaults in the signature are single shared objects, so
    # each schema stores its own copy. Otherwise, appending to one
    # schema's idForms (etc.) would silently change every other
    # schema that was also created without that argument.
    self.idForms: list = list(idForms) if idForms is not None else []
    self.shortForms: list = (
        list(shortForms) if shortForms is not None else []
    )

    # String operators
    self.defaults: dict = dict(defaults) if defaults is not None else {}
    self.operations: list = (
        list(operations) if operations is not None else []
    )

    # Extra data for shortform citations
    self.parent_citation: Citation = parent_citation

    # hack: prevent all regexes from matching mid-word
    # NOTE(review): the prefixed list is only bound to a local name
    # and never stored back on the schema, so this loop currently has
    # no effect on the patterns that are matched. It is left exactly
    # as it was, because actually applying the prefix would change
    # which citations match -- confirm the intent before changing it.
    for key in ['regexes', 'broadRegexes', 'idForms', 'shortForms']:
        regex_list = self.__dict__[key]
        if not regex_list:
            continue
        regex_list = list(map(lambda x: r'(\b|^)' + x, regex_list))

    # dictionaries of compiled regexes
    self._compiled_regexes: dict = {}
    self._compiled_broadRegexes: dict = {}
get_citations(self, text, broad=False, span=(0,))
Generator to return all citations the schema finds in text.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text |
str |
The text to scan for a citation |
required |
broad |
bool |
Whether to use case-insensitive regex matching and, if available, the schema's broadRegex. |
False |
span |
tuple |
A tuple of one or two values determining the start and end index of where in the text to search for citations. Defaults to (0,) to scan the entire text. |
(0,) |
Returns:
Type | Description |
---|---|
Iterable |
Generator that yields each citation the schema finds in the text, or None. |
Source code in citeurl/__init__.py
def get_citations(
    self,
    text: str,
    broad: bool=False,
    span: tuple=(0,)
) -> Iterable:
    """
    Generator that yields every citation this schema finds in text.

    All regex matches are collected first, one regex after another,
    and only then turned into citations.

    Arguments:
        text: The text to scan for a citation
        broad: Whether to use case-insensitive regex matching and,
            if available, the schema's broadRegex.
        span: A tuple of one or two values determining
            the start and end index of where in the text to search
            for citations. Defaults to (0,) to scan the entire text.
    Returns:
        Generator that yields each citation the schema finds in the
        text. It yields nothing if there are none.
    """
    found = []
    for position, _ in enumerate(self.regexes):
        pattern = self._compiled_re(position, broad)
        found.extend(pattern.finditer(text, *span))

    for match in found:
        try:
            citation = Citation(match, self)
        except KeyError:
            # skip citations where lookup failed
            continue
        if citation:
            yield citation
lookup(self, text, broad=True, span=(0,))
Returns the first citation it finds in the text, or None.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text |
str |
The text to scan for a citation. |
required |
broad |
bool |
Whether to use case-insensitive regex matching and, if available, the schema's broadRegex. |
True |
span |
tuple |
A tuple of one or two values determining the start and end index of where in the text to search for citations. Defaults to (0,) to scan the entire text. |
(0,) |
Returns:
Type | Description |
---|---|
Citation |
The first citation this schema finds in the scanned text, or None. |
Source code in citeurl/__init__.py
def lookup(
    self,
    text: str,
    broad: bool=True,
    span: tuple=(0,)
) -> 'Citation':
    """
    Returns the first citation it finds in the text, or None.

    Arguments:
        text: The text to scan for a citation.
        broad: Whether to use case-insensitive regex matching
            and, if available, the schema's broadRegex.
        span: A tuple of one or two values determining
            the start and end index of where in the text to search
            for citations. Defaults to (0,) to scan the entire text.
    Returns:
        The first citation this schema finds in the scanned text,
        or None. None is also returned if scanning fails with an
        ordinary exception.
    """
    try:
        # the default argument covers the "no citation found" case
        return next(self.get_citations(text, broad=broad, span=span), None)
    except Exception:
        # Keep the lookup best-effort for ordinary errors, but (unlike
        # a bare except) let KeyboardInterrupt and SystemExit through.
        return None
Citation
A single citation found in text.
Attributes:
Name | Type | Description |
---|---|---|
text |
str |
The text of the citation itself, like "42 USC § 1988(b)" |
span |
tuple |
The beginning and end positions of this citation in the source text. |
schema |
Schema |
The schema which recognized this citation |
tokens |
dict |
Dictionary of the named capture groups from the regex this citation matched. For "id." and "shortform" citations, this includes tokens carried over from the parent citation. |
processed_tokens |
dict |
Dictionary of tokens after they have been modified via the schema's processes. |
URL |
str |
The URL where a user can read this citation online |
__init__(self, match, schema)
special
For internal use. There should be no need to create citations by means other than a Citator or Schema object.
Source code in citeurl/__init__.py
def __init__(self, match: re.Match, schema):
    """
    Build a citation from a regex match and the schema that made it.

    For internal use. There should be no need to create citations
    by means other than a Citator or Schema object.
    """
    self.span: tuple = match.span()
    self.schema: Schema = schema
    self.text: str = match.group(0)

    captured = match.groupdict()
    parent = schema.parent_citation
    if parent:
        # idForm and shortForm citations start from the parent
        # citation's tokens; every group of the matched regex then
        # overrides the inherited value of the same name.
        self.tokens: dict = {**parent.tokens, **captured}
    else:
        self.tokens: dict = captured

    self.processed_tokens: dict = self.schema._process_tokens(self.tokens)
    self.URL: str = self._get_url()
get_link(self, attrs={'class': 'citation'})
Return citation's link element, with given attributes
Source code in citeurl/__init__.py
def get_link(self, attrs: dict={'class': 'citation'}):
    """
    Return citation's link element, with given attributes.

    Arguments:
        attrs: HTML attributes for the link. The dictionary is only
            read, never modified. Its 'href', if any, is replaced
            by the citation's URL, or dropped when the citation has
            no URL.
    Returns:
        An '<a ...>' element as a string, wrapping the citation's
        text. Attribute values and the text are inserted as-is,
        without HTML escaping.
    """
    # Work on a copy: writing to the argument would change the shared
    # default dict (leaking one citation's href into later calls) or
    # the caller's own dict.
    link_attrs = dict(attrs)
    if self.URL:
        link_attrs['href'] = self.URL
    else:
        # there may be no 'href' to remove, so don't use del
        link_attrs.pop('href', None)
    attr_str = ''.join(
        f' {key!s}="{value!s}"' for key, value in link_attrs.items()
    )
    return f'<a{attr_str}>{self.text!s}</a>'
Authority
A single source cited one or more times in a text.
Attributes:
Name | Type | Description |
---|---|---|
defining_tokens |
dict |
A dictionary of tokens that define this authority, such that any citations with incompatible token values will not match it. Note that this uses processed_tokens (those which have been modified by the schema's operations). |
schema |
Schema |
The schema which found all the citations to this authority |
citations |
list |
The list of all the citations that refer to this authority. |
base_citation |
|
A citation object representing the hypothetical generic citation to this authority. |
name |
str |
The text of base_citation |
__init__(self, first_cite, allowed_differences=[])
special
Define an authority by providing a single long-form citation, and the list of tokens which, if present in the citation, should be discarded from the definition of the authority.
Generates a base_citation to represent the generic instance of this authority.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
first_cite |
|
A long-form citation object representing the first and archetypal citation to this authority. The first_cite will be added as the first entry in the authority's citation list, and it will be used as the basis to generate the authority's base_cite. |
required |
allowed_differences |
list |
A list of tokens whose values can differ among citations to the same authority |
[] |
Source code in citeurl/__init__.py
def __init__(self, first_cite, allowed_differences: list=[]):
    """
    Define an authority by providing a single long-form citation,
    and the list of tokens which, if present in the citation, should
    be discarded from the definition of the authority.

    Generates a base_citation to represent the generic instance of
    this authority.

    Arguments:
        first_cite: A citation object representing the first and
            archetypal citation to this authority. The
            first_cite will be added as the first entry in the
            authority's citation list, and it will be used as the
            basis to generate the authority's base_citation.
        allowed_differences: A list of tokens whose values can
            differ among citations to the same authority
    """
    long_cite = first_cite._original_cite()
    self.schema: Schema = long_cite.schema
    self.citations: list = [first_cite]

    # List the token values that distinguish this authority from
    # others in the same schema. This uses processed tokens, not
    # raw, so that a citation to "50 U.S. 5" will match
    # a citation to "50 U. S. 5", etc. Unset tokens and tokens that
    # are allowed to differ are left out.
    self.defining_tokens: dict = {
        token: value
        for token, value in first_cite.processed_tokens.items()
        if value is not None and token not in allowed_differences
    }

    # Next, derive a base citation to represent this authority.
    # If the first_citation to this authority isn't a longform, use
    # whatever longform it's a child of. If deriving the base
    # citation fails with a TypeError, fall back to first_cite itself.
    try:
        self.base_citation = self._derive_base_citation(long_cite)
    except TypeError:
        self.base_citation = first_cite

    # Set other instance variables
    self.name: str = self.base_citation.text
    self.URL: str = self.base_citation.URL

    # finally, give the first citation a reference to this authority
    first_cite.authority = self
include(self, citation)
Adds the citation to this authority's list of citations. Also adds the `authority` tag to the citation, referring back to this authority.
Source code in citeurl/__init__.py
def include(self, citation):
    """
    Register a citation as one more reference to this authority.

    The citation is given an `authority` attribute pointing back to
    this authority, and is appended to this authority's list of
    citations.
    """
    citation.authority = self
    self.citations.append(citation)
matches(self, citation)
Checks whether a given citation matches the schema and defining tokens of this authority.
Source code in citeurl/__init__.py
def matches(self, citation) -> bool:
    """
    Tell whether a citation refers to this authority.

    That is the case when the citation's schema has the same name as
    this authority's schema, and each of this authority's defining
    tokens is present in the citation's processed tokens with an
    equal value. Additional tokens on the citation are ignored.
    """
    if self.schema.name != citation.schema.name:
        return False
    tokens = citation.processed_tokens
    return not any(
        key not in tokens or tokens[key] != value
        for key, value in self.defining_tokens.items()
    )