updates
This commit is contained in:
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,23 @@
|
||||
about_resource: _pyahocorasick.py
|
||||
download_url: https://github.com/WojciechMula/pyahocorasick/tree/ec2fb9cb393f571fd4316ea98ed7b65992f16127/py
|
||||
name: pyahocorasick-python
|
||||
version: ec2fb9
|
||||
|
||||
homepage_url: https://github.com/WojciechMula/pyahocorasick
|
||||
license_expression: public-domain
|
||||
|
||||
copyright: originally authored by Wojciech Mula, modified by the license_expression authors.
|
||||
|
||||
notes: this is a vendored subset of the full pyahocorasick containing only the pure
|
||||
python part with an implementation modified to return non-overlapping matches and
|
||||
non-matches.
|
||||
It has many limitations and in particular it does not pickle well and is much slower
|
||||
than the full C-based implementation but is convenient to use as a vendored, pure
|
||||
Python library.
|
||||
|
||||
owner: nexB Inc.
|
||||
author: Wojciech Mula http://0x80.pl/
|
||||
|
||||
vcs_tool: git
|
||||
vcs_repository: https://github.com/WojciechMula/pyahocorasick.git
|
||||
|
||||
@@ -0,0 +1,649 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-scancode-public-domain
|
||||
# See https://github.com/aboutcode-org/license-expression for support or download.
|
||||
# See https://aboutcode.org for more information about nexB OSS projects.
|
||||
#
|
||||
"""
|
||||
Aho-Corasick string search algorithm in pure Python
|
||||
|
||||
Original Author: Wojciech Muła, wojciech_mula@poczta.onet.pl
|
||||
WWW : http://0x80.pl
|
||||
License : public domain
|
||||
|
||||
This is the pure Python Aho-Corasick automaton from pyahocorasick modified for
|
||||
use in the license_expression library for advanced tokenization:
|
||||
|
||||
- add support for unicode strings.
|
||||
- case insensitive search using sequence of words and not characters
|
||||
- improve returned results with the actual start,end and matched string.
|
||||
- support returning non-matched parts of a string
|
||||
"""
|
||||
|
||||
from collections import deque
|
||||
from collections import OrderedDict
|
||||
import logging
|
||||
import re
|
||||
|
||||
TRACE = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def logger_debug(*args):
|
||||
pass
|
||||
|
||||
|
||||
if TRACE:
|
||||
|
||||
def logger_debug(*args):
|
||||
return logger.debug(" ".join(isinstance(a, str) and a or repr(a) for a in args))
|
||||
|
||||
import sys
|
||||
|
||||
logging.basicConfig(stream=sys.stdout)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# used to distinguish from None
|
||||
nil = object()
|
||||
|
||||
|
||||
class TrieNode(object):
    """
    A single node of the Trie, also used as a state of the Aho-Corasick
    automaton once failure links have been computed.
    """

    __slots__ = ["token", "output", "fail", "children"]

    def __init__(self, token, output=nil):
        # the token (one item of a tokenized key string) that leads to this node
        self.token = token

        # the Aho-Corasick "output" of this node: "nil" when no key ends at
        # this node, otherwise the object stored for the key that ends here
        # (it is returned with a match).
        self.output = output

        # failure link followed by the automaton search procedure; it stays
        # "nil" until the automaton is built
        self.fail = nil

        # mapping of token -> child node
        self.children = {}

    def __repr__(self):
        # show the output only for nodes that carry one
        if self.output is nil:
            return "TrieNode(%r)" % self.token
        return "TrieNode(%r, %r)" % (self.token, self.output)
|
||||
|
||||
|
||||
class Trie(object):
    """
    A Trie and Aho-Corasick automaton. This behaves more or less like a mapping
    of key->value where a key is a string that is tokenized in lowercased words,
    spaces and parens. This is the main entry point.

    Typical usage is to add() all the keys, call make_automaton() once and
    then search strings with iter() or tokenize().
    """

    def __init__(self):
        """
        Initialize a new, empty Trie.
        """
        self.root = TrieNode("")

        # set of the unique tokens of all the keys added to the trie. It is
        # updated on each addition and is needed to create the first level
        # failure links when building the automaton.
        self._known_tokens = set()

        # Flag set to True once a Trie has been converted to an Aho-Corasick automaton
        self._converted = False

    def add(self, tokens_string, value=None):
        """
        Add a new tokens_string and its associated value to the trie. If the
        tokens_string already exists in the Trie, its value is replaced with
        the provided value, typically a Token object. If a value is not
        provided (e.g. it is None), the tokens_string is used as value.

        A tokens_string is any string. It is tokenized when added to the Trie.
        An empty, non-string or whitespace-only tokens_string is ignored.

        Raise an Exception if the Trie was already converted to an automaton.
        """
        if self._converted:
            raise Exception(
                "This Trie has been converted to an Aho-Corasick automaton and cannot be modified."
            )

        if not tokens_string or not isinstance(tokens_string, str):
            return

        tokens = [t for t in get_tokens(tokens_string) if t.strip()]
        if not tokens:
            # a whitespace-only key has no token: storing it would attach an
            # output to the root node and yield spurious matches when searching
            return

        # keep track of the set of tokens added to the trie: these are needed
        # to create the first level children failure links of the automaton
        self._known_tokens.update(tokens)

        node = self.root
        for token in tokens:
            try:
                node = node.children[token]
            except KeyError:
                child = TrieNode(token)
                node.children[token] = child
                node = child

        # only a missing (None) value defaults to the key: any other value,
        # including a falsy one such as 0 or "", is stored as provided
        node.output = (tokens_string, tokens_string if value is None else value)

    def __get_node(self, tokens_string):
        """
        Return the final node reached when walking the Trie with the tokens of
        this tokens_string, or None if the trie does not contain this sequence
        of tokens. Private function.
        """
        if not tokens_string or not isinstance(tokens_string, str):
            return None

        tokens = [t for t in get_tokens(tokens_string) if t.strip()]
        if not tokens:
            return None

        root = self.root
        node = root
        for token in tokens:
            child = node.children.get(token)
            # make_automaton() adds root -> root self-loops for the known
            # tokens that do not start any key: these are not real Trie edges
            if child is None or child is root:
                return None
            node = child
        return node

    def get(self, tokens_string, default=nil):
        """
        Return the output found associated with a `tokens_string` as a tuple of
        (key string, value). If there is no such tokens_string in the Trie,
        return the default value (other than nil). If `default` is not provided
        or is `nil`, raise a KeyError.
        """
        node = self.__get_node(tokens_string)
        output = nil if node is None else node.output

        if output is not nil:
            return output
        if default is nil:
            raise KeyError(tokens_string)
        return default

    def keys(self):
        """
        Return an iterator of all the keys stored in this trie.
        """
        return (key for key, _ in self.items())

    def values(self):
        """
        Return an iterator of all the values associated with keys stored in
        this trie.
        """
        return (value for _, value in self.items())

    def items(self):
        """
        Return an iterator of (key, value) tuples for all the keys stored in
        this trie, in depth-first order.
        """
        items = []

        # depth-first, pre-order walk using an explicit stack
        stack = [self.root]
        while stack:
            node = stack.pop()
            if node.output is not nil:
                key, value = node.output[0], node.output[1]
                items.append((key, value))

            # skip the self-loops that the automaton adds on the root node
            children = [child for child in node.children.values() if child is not node]
            # push in reverse order such that children are visited in their
            # insertion order
            stack.extend(reversed(children))

        return iter(items)

    def exists(self, tokens_string):
        """
        Return True if the key is present in this trie.
        """
        node = self.__get_node(tokens_string)
        return node is not None and node.output is not nil

    def is_prefix(self, tokens_string):
        """
        Return True if tokens_string is a prefix of any existing tokens_string
        in the trie.
        """
        return self.__get_node(tokens_string) is not None

    def make_automaton(self):
        """
        Convert this trie to an Aho-Corasick automaton by computing the failure
        links of each node.

        Note that this is an error to add new keys to a Trie once it has been
        converted to an Automaton. Calling this method again on an already
        converted Trie does nothing.
        """
        if self._converted:
            # the root now has self-loops: processing these again would queue
            # the root itself and make it its own failure link, which would
            # make the search procedure loop forever
            return

        queue = deque()
        root = self.root

        # 1. create root children for each known token (e.g. all the unique
        # tokens from all the added keys), failing to root.
        # And build a queue of these
        for token in self._known_tokens:
            if token in root.children:
                node = root.children[token]
                # e.g. f(s) = 0, Aho-Corasick-wise
                node.fail = root
                queue.append(node)
            else:
                # a known token that does not start any key loops on the root
                root.children[token] = root

        # 2. using the queue of all possible top level tokens, walk the trie
        # breadth-first and add failure links to nodes as needed
        while queue:
            current_node = queue.popleft()
            for node in current_node.children.values():
                queue.append(node)
                state = current_node.fail
                # this always stops at the latest on the root, which has a
                # child for every known token
                while node.token not in state.children:
                    state = state.fail
                node.fail = state.children.get(node.token, root)

        # Mark the trie as converted so it cannot be modified anymore
        self._converted = True

    def iter(self, tokens_string, include_unmatched=False, include_space=False):
        """
        Yield Token objects for matched strings by performing the Aho-Corasick
        search procedure. The search follows the failure links computed by
        make_automaton().

        If `include_unmatched` is True, also yield a Token with a None value
        for each token of `tokens_string` that is not part of a match. If
        `include_space` is False, whitespace tokens are skipped entirely.

        The Token start and end positions in the searched string are such that
        the matched string is "tokens_string[start:end+1]". And the start is
        computed from the end_index collected by the Aho-Corasick search
        procedure such that
        "start=end_index - n + 1" where n is the length of a matched string.

        The Token.value is an object associated with a matched string.

        For example:
        >>> a = Trie()
        >>> a.add('BCDEF')
        >>> a.add('CDE')
        >>> a.add('DEFGH')
        >>> a.add('EFGH')
        >>> a.add('KL')
        >>> a.make_automaton()
        >>> tokens_string = 'a bcdef ghij kl m'
        >>> strings = Token.sort(a.iter(tokens_string))
        >>> expected = [
        ...     Token(2, 6, u'bcdef', u'BCDEF'),
        ...     Token(13, 14, u'kl', u'KL')
        ... ]

        >>> strings == expected
        True

        >>> list(a.iter('')) == []
        True

        >>> list(a.iter(' ')) == []
        True
        """
        if not tokens_string:
            return

        tokens = get_tokens(tokens_string)
        state = self.root

        if TRACE:
            logger_debug("Trie.iter() with:", repr(tokens_string))
            logger_debug(" tokens:", tokens)

        # index in tokens_string of the last character of the current token
        end_pos = -1
        for token_string in tokens:
            end_pos += len(token_string)
            if TRACE:
                logger_debug()
                logger_debug("token_string", repr(token_string))
                logger_debug(" end_pos", end_pos)

            if not include_space and not token_string.strip():
                if TRACE:
                    logger_debug(" include_space skipped")
                continue

            if token_string not in self._known_tokens:
                # an unknown token cannot be part of any key: restart from root
                state = self.root
                if TRACE:
                    logger_debug(" unmatched")
                if include_unmatched:
                    n = len(token_string)
                    start_pos = end_pos - n + 1
                    tok = Token(
                        start=start_pos,
                        end=end_pos,
                        string=tokens_string[start_pos : end_pos + 1],
                        value=None,
                    )
                    if TRACE:
                        logger_debug(" unmatched tok:", tok)
                    yield tok
                continue

            yielded = False

            # follow the failure links until a state has a transition for
            # this token
            while token_string not in state.children:
                state = state.fail

            # we have a matching starting token_string
            state = state.children.get(token_string, self.root)

            # report the output of this state and of all the states reachable
            # through its chain of failure links
            match = state
            while match is not nil:
                if match.output is not nil:
                    matched_string, output_value = match.output
                    if TRACE:
                        logger_debug(" type output", repr(output_value), type(matched_string))
                    n = len(matched_string)
                    start_pos = end_pos - n + 1
                    if TRACE:
                        logger_debug(" start_pos", start_pos)
                    yield Token(
                        start_pos, end_pos, tokens_string[start_pos : end_pos + 1], output_value
                    )
                    yielded = True
                match = match.fail

            if not yielded and include_unmatched:
                if TRACE:
                    logger_debug(" unmatched but known token")
                n = len(token_string)
                start_pos = end_pos - n + 1
                tok = Token(start_pos, end_pos, tokens_string[start_pos : end_pos + 1], None)
                if TRACE:
                    logger_debug(" unmatched tok 2:", tok)
                yield tok

        logger_debug()

    def tokenize(self, string, include_unmatched=True, include_space=False):
        """
        Tokenize a string for matched and unmatched sub-sequences and return a
        list of non-overlapping Token objects performing a modified
        Aho-Corasick search procedure:

        - return both matched and unmatched sub-sequences.
        - do not return matches with positions that are contained or overlap with
          another match:
          - discard smaller matches contained in a larger match.
          - when there is overlap (but not containment), the matches are sorted by
            start and biggest length and then:
            - we return the largest match of two overlapping matches
            - if they have the same length, keep the match starting the earliest
              and discard the other match.

        Each Token contains the start and end position, the corresponding string
        and an associated value object.

        For example:
        >>> a = Trie()
        >>> a.add('BCDEF')
        >>> a.add('CDE')
        >>> a.add('DEFGH')
        >>> a.add('EFGH')
        >>> a.add('KL')
        >>> a.make_automaton()
        >>> string = 'a bcdef ghij kl'
        >>> tokens = list(a.tokenize(string, include_space=True))

        >>> expected = [
        ...     Token(0, 0, u'a', None),
        ...     Token(1, 1, u' ', None),
        ...     Token(2, 6, u'bcdef', u'BCDEF'),
        ...     Token(7, 7, u' ', None),
        ...     Token(8, 11, u'ghij', None),
        ...     Token(12, 12, u' ', None),
        ...     Token(13, 14, u'kl', u'KL')
        ... ]
        >>> tokens == expected
        True
        """
        tokens = list(
            self.iter(string, include_unmatched=include_unmatched, include_space=include_space)
        )
        if TRACE:
            logger_debug("tokenize.tokens:", tokens)
        if not include_space:
            tokens = [t for t in tokens if t.string.strip()]
        return filter_overlapping(tokens)
|
||||
|
||||
|
||||
def filter_overlapping(tokens):
    """
    Return a new list from an iterable of `tokens` discarding contained and
    overlapping Tokens using these rules:

    - skip a token fully contained in another token.
    - keep the biggest, left-most token of two overlapping tokens and skip the other

    For example:
    >>> tokens = [
    ...     Token(0, 0, 'a'),
    ...     Token(1, 5, 'bcdef'),
    ...     Token(2, 4, 'cde'),
    ...     Token(3, 7, 'defgh'),
    ...     Token(4, 7, 'efgh'),
    ...     Token(8, 9, 'ij'),
    ...     Token(10, 13, 'klmn'),
    ...     Token(11, 15, 'lmnop'),
    ...     Token(16, 16, 'q'),
    ... ]

    >>> expected = [
    ...     Token(0, 0, 'a'),
    ...     Token(1, 5, 'bcdef'),
    ...     Token(8, 9, 'ij'),
    ...     Token(11, 15, 'lmnop'),
    ...     Token(16, 16, 'q'),
    ... ]

    >>> filtered = list(filter_overlapping(tokens))
    >>> filtered == expected
    True

    A longer token that replaces a shorter overlapping token is itself compared
    with the tokens that follow:
    >>> tokens = [Token(0, 3, 'abcd'), Token(2, 7, 'cdefgh'), Token(6, 9, 'ghij')]
    >>> filter_overlapping(tokens) == [Token(2, 7, 'cdefgh')]
    True
    """
    tokens = Token.sort(tokens)

    # compare pair of tokens in the sorted sequence: current and next
    i = 0
    while i < len(tokens) - 1:
        curr_deleted = False
        j = i + 1
        while j < len(tokens):
            curr_tok = tokens[i]
            next_tok = tokens[j]

            logger_debug("curr_tok, i, next_tok, j:", curr_tok, i, next_tok, j)
            # disjoint tokens: break, there is nothing to do
            if next_tok.is_after(curr_tok):
                logger_debug(" break to next", curr_tok)
                break

            # contained token: discard the contained token
            if next_tok in curr_tok:
                logger_debug(" del next_tok contained:", next_tok)
                del tokens[j]
                continue

            # overlap: Keep the longest token and skip the smallest overlapping
            # tokens. In case of length tie: keep the left most
            if curr_tok.overlap(next_tok):
                if len(curr_tok) >= len(next_tok):
                    logger_debug(" del next_tok smaller overlap:", next_tok)
                    del tokens[j]
                    continue
                else:
                    logger_debug(" del curr_tok smaller overlap:", curr_tok)
                    del tokens[i]
                    curr_deleted = True
                    break
            j += 1

        # when the current token was deleted, the token that replaced it now
        # sits at index i and must in turn be compared with the following
        # tokens: advancing i here would let overlapping tokens through
        if not curr_deleted:
            i += 1
    return tokens
|
||||
|
||||
|
||||
class Token(object):
    """
    A Token is used to track the tokenization of an expression with its
    start and end as index position in the original string and other attributes:

    - `start` and `end` are zero-based index in the original string S such that
      S[start:end+1] will yield `string`.
    - `string` is the matched substring from the original string for this Token.
    - `value` is the corresponding object for this token as one of:
      - a LicenseSymbol object
      - a "Keyword" object (and, or, with, left and right parens)
      - None if this is a space.
    """

    __slots__ = (
        "start",
        "end",
        "string",
        "value",
    )

    def __init__(self, start, end, string="", value=None):
        self.start = start
        self.end = end
        self.string = string
        self.value = value

    def __repr__(self):
        return (
            self.__class__.__name__ + "(%(start)r, %(end)r, %(string)r, %(value)r)" % self.as_dict()
        )

    def as_dict(self):
        """
        Return an ordered mapping of attribute name -> value for this token,
        in the order of the `__slots__`.
        """
        return OrderedDict([(s, getattr(self, s)) for s in self.__slots__])

    def __len__(self):
        # start and end are both inclusive positions
        return self.end - self.start + 1

    def __eq__(self, other):
        return isinstance(other, Token) and (
            self.start == other.start
            and self.end == other.end
            and self.string == other.string
            and self.value == other.value
        )

    def __hash__(self):
        # consistent with __eq__: based on the same four attributes
        tup = self.start, self.end, self.string, self.value
        return hash(tup)

    @classmethod
    def sort(cls, tokens):
        """
        Return a new sorted sequence of tokens given a sequence of tokens. The
        primary sort is on start and the secondary sort is on longer lengths.
        Therefore if two tokens have the same start, the longer token will sort
        first.

        For example:
        >>> tokens = [Token(0, 0), Token(5, 5), Token(1, 1), Token(2, 4), Token(2, 5)]
        >>> expected = [Token(0, 0), Token(1, 1), Token(2, 5), Token(2, 4), Token(5, 5)]
        >>> expected == Token.sort(tokens)
        True
        """

        def key(s):
            return (
                s.start,
                -len(s),
            )

        return sorted(tokens, key=key)

    def is_after(self, other):
        """
        Return True if this token is after the other token.

        For example:
        >>> Token(1, 2).is_after(Token(5, 6))
        False
        >>> Token(5, 6).is_after(Token(5, 6))
        False
        >>> Token(2, 3).is_after(Token(1, 2))
        False
        >>> Token(5, 6).is_after(Token(3, 4))
        True
        """
        return self.start > other.end

    def is_before(self, other):
        """
        Return True if this token is before the other token.

        For example:
        >>> Token(1, 2).is_before(Token(5, 6))
        True
        >>> Token(4, 5).is_before(Token(5, 6))
        False
        """
        return self.end < other.start

    def __contains__(self, other):
        """
        Return True if this token contains the other token.

        For example:
        >>> Token(5, 7) in Token(5, 7)
        True
        >>> Token(6, 8) in Token(5, 7)
        False
        >>> Token(6, 6) in Token(4, 8)
        True
        >>> Token(3, 9) in Token(4, 8)
        False
        >>> Token(4, 8) in Token(3, 9)
        True
        """
        return self.start <= other.start and other.end <= self.end

    def overlap(self, other):
        """
        Return True if this token and the other token overlap, e.g., if they
        share at least one position. This is symmetric: a.overlap(b) is the
        same as b.overlap(a).

        For example:
        >>> Token(1, 2).overlap(Token(5, 6))
        False
        >>> Token(5, 6).overlap(Token(5, 6))
        True
        >>> Token(4, 5).overlap(Token(5, 6))
        True
        >>> Token(4, 5).overlap(Token(5, 7))
        True
        >>> Token(4, 5).overlap(Token(6, 7))
        False
        >>> Token(5, 6).overlap(Token(4, 7))
        True
        >>> Token(4, 7).overlap(Token(5, 6))
        True
        """
        # two inclusive ranges share a position unless one ends before the
        # other starts. Testing only whether the other's start or end falls in
        # this token would miss the case where the other strictly contains it.
        return self.start <= other.end and other.start <= self.end
|
||||
|
||||
|
||||
# Pattern recognizing three kinds of tokens: runs of text (anything that is
# neither whitespace nor a paren), runs of whitespace and single parens.
_tokenizer = re.compile(
    r"""
    (?P<text>[^\s\(\)]+)
     |
    (?P<space>\s+)
     |
    (?P<parens>[\(\)])
    """,
    re.VERBOSE | re.MULTILINE | re.UNICODE,
)


def get_tokens(tokens_string):
    """
    Return a list of lowercased token strings obtained by splitting
    `tokens_string` on spaces and parens. The runs of spaces and the parens
    are themselves returned as tokens.
    """
    lowered = tokens_string.lower()
    # every character belongs to exactly one of the three alternatives and no
    # alternative can match an empty string, so the successive matches are the
    # tokens of the whole string, in order
    return [found.group() for found in _tokenizer.finditer(lowered)]
|
||||
@@ -0,0 +1,395 @@
|
||||
Attribution 4.0 International
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons Corporation ("Creative Commons") is not a law firm and
|
||||
does not provide legal services or legal advice. Distribution of
|
||||
Creative Commons public licenses does not create a lawyer-client or
|
||||
other relationship. Creative Commons makes its licenses and related
|
||||
information available on an "as-is" basis. Creative Commons gives no
|
||||
warranties regarding its licenses, any material licensed under their
|
||||
terms and conditions, or any related information. Creative Commons
|
||||
disclaims all liability for damages resulting from their use to the
|
||||
fullest extent possible.
|
||||
|
||||
Using Creative Commons Public Licenses
|
||||
|
||||
Creative Commons public licenses provide a standard set of terms and
|
||||
conditions that creators and other rights holders may use to share
|
||||
original works of authorship and other material subject to copyright
|
||||
and certain other rights specified in the public license below. The
|
||||
following considerations are for informational purposes only, are not
|
||||
exhaustive, and do not form part of our licenses.
|
||||
|
||||
Considerations for licensors: Our public licenses are
|
||||
intended for use by those authorized to give the public
|
||||
permission to use material in ways otherwise restricted by
|
||||
copyright and certain other rights. Our licenses are
|
||||
irrevocable. Licensors should read and understand the terms
|
||||
and conditions of the license they choose before applying it.
|
||||
Licensors should also secure all rights necessary before
|
||||
applying our licenses so that the public can reuse the
|
||||
material as expected. Licensors should clearly mark any
|
||||
material not subject to the license. This includes other CC-
|
||||
licensed material, or material used under an exception or
|
||||
limitation to copyright. More considerations for licensors:
|
||||
wiki.creativecommons.org/Considerations_for_licensors
|
||||
|
||||
Considerations for the public: By using one of our public
|
||||
licenses, a licensor grants the public permission to use the
|
||||
licensed material under specified terms and conditions. If
|
||||
the licensor's permission is not necessary for any reason--for
|
||||
example, because of any applicable exception or limitation to
|
||||
copyright--then that use is not regulated by the license. Our
|
||||
licenses grant only permissions under copyright and certain
|
||||
other rights that a licensor has authority to grant. Use of
|
||||
the licensed material may still be restricted for other
|
||||
reasons, including because others have copyright or other
|
||||
rights in the material. A licensor may make special requests,
|
||||
such as asking that all changes be marked or described.
|
||||
Although not required by our licenses, you are encouraged to
|
||||
respect those requests where reasonable. More considerations
|
||||
for the public:
|
||||
wiki.creativecommons.org/Considerations_for_licensees
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons Attribution 4.0 International Public License
|
||||
|
||||
By exercising the Licensed Rights (defined below), You accept and agree
|
||||
to be bound by the terms and conditions of this Creative Commons
|
||||
Attribution 4.0 International Public License ("Public License"). To the
|
||||
extent this Public License may be interpreted as a contract, You are
|
||||
granted the Licensed Rights in consideration of Your acceptance of
|
||||
these terms and conditions, and the Licensor grants You such rights in
|
||||
consideration of benefits the Licensor receives from making the
|
||||
Licensed Material available under these terms and conditions.
|
||||
|
||||
|
||||
Section 1 -- Definitions.
|
||||
|
||||
a. Adapted Material means material subject to Copyright and Similar
|
||||
Rights that is derived from or based upon the Licensed Material
|
||||
and in which the Licensed Material is translated, altered,
|
||||
arranged, transformed, or otherwise modified in a manner requiring
|
||||
permission under the Copyright and Similar Rights held by the
|
||||
Licensor. For purposes of this Public License, where the Licensed
|
||||
Material is a musical work, performance, or sound recording,
|
||||
Adapted Material is always produced where the Licensed Material is
|
||||
synched in timed relation with a moving image.
|
||||
|
||||
b. Adapter's License means the license You apply to Your Copyright
|
||||
and Similar Rights in Your contributions to Adapted Material in
|
||||
accordance with the terms and conditions of this Public License.
|
||||
|
||||
c. Copyright and Similar Rights means copyright and/or similar rights
|
||||
closely related to copyright including, without limitation,
|
||||
performance, broadcast, sound recording, and Sui Generis Database
|
||||
Rights, without regard to how the rights are labeled or
|
||||
categorized. For purposes of this Public License, the rights
|
||||
specified in Section 2(b)(1)-(2) are not Copyright and Similar
|
||||
Rights.
|
||||
|
||||
d. Effective Technological Measures means those measures that, in the
|
||||
absence of proper authority, may not be circumvented under laws
|
||||
fulfilling obligations under Article 11 of the WIPO Copyright
|
||||
Treaty adopted on December 20, 1996, and/or similar international
|
||||
agreements.
|
||||
|
||||
e. Exceptions and Limitations means fair use, fair dealing, and/or
|
||||
any other exception or limitation to Copyright and Similar Rights
|
||||
that applies to Your use of the Licensed Material.
|
||||
|
||||
f. Licensed Material means the artistic or literary work, database,
|
||||
or other material to which the Licensor applied this Public
|
||||
License.
|
||||
|
||||
g. Licensed Rights means the rights granted to You subject to the
|
||||
terms and conditions of this Public License, which are limited to
|
||||
all Copyright and Similar Rights that apply to Your use of the
|
||||
Licensed Material and that the Licensor has authority to license.
|
||||
|
||||
h. Licensor means the individual(s) or entity(ies) granting rights
|
||||
under this Public License.
|
||||
|
||||
i. Share means to provide material to the public by any means or
|
||||
process that requires permission under the Licensed Rights, such
|
||||
as reproduction, public display, public performance, distribution,
|
||||
dissemination, communication, or importation, and to make material
|
||||
available to the public including in ways that members of the
|
||||
public may access the material from a place and at a time
|
||||
individually chosen by them.
|
||||
|
||||
j. Sui Generis Database Rights means rights other than copyright
|
||||
resulting from Directive 96/9/EC of the European Parliament and of
|
||||
the Council of 11 March 1996 on the legal protection of databases,
|
||||
as amended and/or succeeded, as well as other essentially
|
||||
equivalent rights anywhere in the world.
|
||||
|
||||
k. You means the individual or entity exercising the Licensed Rights
|
||||
under this Public License. Your has a corresponding meaning.
|
||||
|
||||
|
||||
Section 2 -- Scope.
|
||||
|
||||
a. License grant.
|
||||
|
||||
1. Subject to the terms and conditions of this Public License,
|
||||
the Licensor hereby grants You a worldwide, royalty-free,
|
||||
non-sublicensable, non-exclusive, irrevocable license to
|
||||
exercise the Licensed Rights in the Licensed Material to:
|
||||
|
||||
a. reproduce and Share the Licensed Material, in whole or
|
||||
in part; and
|
||||
|
||||
b. produce, reproduce, and Share Adapted Material.
|
||||
|
||||
2. Exceptions and Limitations. For the avoidance of doubt, where
|
||||
Exceptions and Limitations apply to Your use, this Public
|
||||
License does not apply, and You do not need to comply with
|
||||
its terms and conditions.
|
||||
|
||||
3. Term. The term of this Public License is specified in Section
|
||||
6(a).
|
||||
|
||||
4. Media and formats; technical modifications allowed. The
|
||||
Licensor authorizes You to exercise the Licensed Rights in
|
||||
all media and formats whether now known or hereafter created,
|
||||
and to make technical modifications necessary to do so. The
|
||||
Licensor waives and/or agrees not to assert any right or
|
||||
authority to forbid You from making technical modifications
|
||||
necessary to exercise the Licensed Rights, including
|
||||
technical modifications necessary to circumvent Effective
|
||||
Technological Measures. For purposes of this Public License,
|
||||
simply making modifications authorized by this Section 2(a)
|
||||
(4) never produces Adapted Material.
|
||||
|
||||
5. Downstream recipients.
|
||||
|
||||
a. Offer from the Licensor -- Licensed Material. Every
|
||||
recipient of the Licensed Material automatically
|
||||
receives an offer from the Licensor to exercise the
|
||||
Licensed Rights under the terms and conditions of this
|
||||
Public License.
|
||||
|
||||
b. No downstream restrictions. You may not offer or impose
|
||||
any additional or different terms or conditions on, or
|
||||
apply any Effective Technological Measures to, the
|
||||
Licensed Material if doing so restricts exercise of the
|
||||
Licensed Rights by any recipient of the Licensed
|
||||
Material.
|
||||
|
||||
6. No endorsement. Nothing in this Public License constitutes or
|
||||
may be construed as permission to assert or imply that You
|
||||
are, or that Your use of the Licensed Material is, connected
|
||||
with, or sponsored, endorsed, or granted official status by,
|
||||
the Licensor or others designated to receive attribution as
|
||||
provided in Section 3(a)(1)(A)(i).
|
||||
|
||||
b. Other rights.
|
||||
|
||||
1. Moral rights, such as the right of integrity, are not
|
||||
licensed under this Public License, nor are publicity,
|
||||
privacy, and/or other similar personality rights; however, to
|
||||
the extent possible, the Licensor waives and/or agrees not to
|
||||
assert any such rights held by the Licensor to the limited
|
||||
extent necessary to allow You to exercise the Licensed
|
||||
Rights, but not otherwise.
|
||||
|
||||
2. Patent and trademark rights are not licensed under this
|
||||
Public License.
|
||||
|
||||
3. To the extent possible, the Licensor waives any right to
|
||||
collect royalties from You for the exercise of the Licensed
|
||||
Rights, whether directly or through a collecting society
|
||||
under any voluntary or waivable statutory or compulsory
|
||||
licensing scheme. In all other cases the Licensor expressly
|
||||
reserves any right to collect such royalties.
|
||||
|
||||
|
||||
Section 3 -- License Conditions.
|
||||
|
||||
Your exercise of the Licensed Rights is expressly made subject to the
|
||||
following conditions.
|
||||
|
||||
a. Attribution.
|
||||
|
||||
1. If You Share the Licensed Material (including in modified
|
||||
form), You must:
|
||||
|
||||
a. retain the following if it is supplied by the Licensor
|
||||
with the Licensed Material:
|
||||
|
||||
i. identification of the creator(s) of the Licensed
|
||||
Material and any others designated to receive
|
||||
attribution, in any reasonable manner requested by
|
||||
the Licensor (including by pseudonym if
|
||||
designated);
|
||||
|
||||
ii. a copyright notice;
|
||||
|
||||
iii. a notice that refers to this Public License;
|
||||
|
||||
iv. a notice that refers to the disclaimer of
|
||||
warranties;
|
||||
|
||||
v. a URI or hyperlink to the Licensed Material to the
|
||||
extent reasonably practicable;
|
||||
|
||||
b. indicate if You modified the Licensed Material and
|
||||
retain an indication of any previous modifications; and
|
||||
|
||||
c. indicate the Licensed Material is licensed under this
|
||||
Public License, and include the text of, or the URI or
|
||||
hyperlink to, this Public License.
|
||||
|
||||
2. You may satisfy the conditions in Section 3(a)(1) in any
|
||||
reasonable manner based on the medium, means, and context in
|
||||
which You Share the Licensed Material. For example, it may be
|
||||
reasonable to satisfy the conditions by providing a URI or
|
||||
hyperlink to a resource that includes the required
|
||||
information.
|
||||
|
||||
3. If requested by the Licensor, You must remove any of the
|
||||
information required by Section 3(a)(1)(A) to the extent
|
||||
reasonably practicable.
|
||||
|
||||
4. If You Share Adapted Material You produce, the Adapter's
|
||||
License You apply must not prevent recipients of the Adapted
|
||||
Material from complying with this Public License.
|
||||
|
||||
|
||||
Section 4 -- Sui Generis Database Rights.
|
||||
|
||||
Where the Licensed Rights include Sui Generis Database Rights that
|
||||
apply to Your use of the Licensed Material:
|
||||
|
||||
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
|
||||
to extract, reuse, reproduce, and Share all or a substantial
|
||||
portion of the contents of the database;
|
||||
|
||||
b. if You include all or a substantial portion of the database
|
||||
contents in a database in which You have Sui Generis Database
|
||||
Rights, then the database in which You have Sui Generis Database
|
||||
Rights (but not its individual contents) is Adapted Material; and
|
||||
|
||||
c. You must comply with the conditions in Section 3(a) if You Share
|
||||
all or a substantial portion of the contents of the database.
|
||||
|
||||
For the avoidance of doubt, this Section 4 supplements and does not
|
||||
replace Your obligations under this Public License where the Licensed
|
||||
Rights include other Copyright and Similar Rights.
|
||||
|
||||
|
||||
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
|
||||
|
||||
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
|
||||
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
|
||||
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
|
||||
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
|
||||
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
|
||||
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
||||
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
|
||||
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
|
||||
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
|
||||
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
|
||||
|
||||
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
|
||||
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
|
||||
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
|
||||
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
|
||||
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
|
||||
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
|
||||
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
|
||||
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
|
||||
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
|
||||
|
||||
c. The disclaimer of warranties and limitation of liability provided
|
||||
above shall be interpreted in a manner that, to the extent
|
||||
possible, most closely approximates an absolute disclaimer and
|
||||
waiver of all liability.
|
||||
|
||||
|
||||
Section 6 -- Term and Termination.
|
||||
|
||||
a. This Public License applies for the term of the Copyright and
|
||||
Similar Rights licensed here. However, if You fail to comply with
|
||||
this Public License, then Your rights under this Public License
|
||||
terminate automatically.
|
||||
|
||||
b. Where Your right to use the Licensed Material has terminated under
|
||||
Section 6(a), it reinstates:
|
||||
|
||||
1. automatically as of the date the violation is cured, provided
|
||||
it is cured within 30 days of Your discovery of the
|
||||
violation; or
|
||||
|
||||
2. upon express reinstatement by the Licensor.
|
||||
|
||||
For the avoidance of doubt, this Section 6(b) does not affect any
|
||||
right the Licensor may have to seek remedies for Your violations
|
||||
of this Public License.
|
||||
|
||||
c. For the avoidance of doubt, the Licensor may also offer the
|
||||
Licensed Material under separate terms or conditions or stop
|
||||
distributing the Licensed Material at any time; however, doing so
|
||||
will not terminate this Public License.
|
||||
|
||||
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
|
||||
License.
|
||||
|
||||
|
||||
Section 7 -- Other Terms and Conditions.
|
||||
|
||||
a. The Licensor shall not be bound by any additional or different
|
||||
terms or conditions communicated by You unless expressly agreed.
|
||||
|
||||
b. Any arrangements, understandings, or agreements regarding the
|
||||
Licensed Material not stated herein are separate from and
|
||||
independent of the terms and conditions of this Public License.
|
||||
|
||||
|
||||
Section 8 -- Interpretation.
|
||||
|
||||
a. For the avoidance of doubt, this Public License does not, and
|
||||
shall not be interpreted to, reduce, limit, restrict, or impose
|
||||
conditions on any use of the Licensed Material that could lawfully
|
||||
be made without permission under this Public License.
|
||||
|
||||
b. To the extent possible, if any provision of this Public License is
|
||||
deemed unenforceable, it shall be automatically reformed to the
|
||||
minimum extent necessary to make it enforceable. If the provision
|
||||
cannot be reformed, it shall be severed from this Public License
|
||||
without affecting the enforceability of the remaining terms and
|
||||
conditions.
|
||||
|
||||
c. No term or condition of this Public License will be waived and no
|
||||
failure to comply consented to unless expressly agreed to by the
|
||||
Licensor.
|
||||
|
||||
d. Nothing in this Public License constitutes or may be interpreted
|
||||
as a limitation upon, or waiver of, any privileges and immunities
|
||||
that apply to the Licensor or You, including from the legal
|
||||
processes of any jurisdiction or authority.
|
||||
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons is not a party to its public
|
||||
licenses. Notwithstanding, Creative Commons may elect to apply one of
|
||||
its public licenses to material it publishes and in those instances
|
||||
will be considered the “Licensor.” The text of the Creative Commons
|
||||
public licenses is dedicated to the public domain under the CC0 Public
|
||||
Domain Dedication. Except for the limited purpose of indicating that
|
||||
material is shared under a Creative Commons public license or as
|
||||
otherwise permitted by the Creative Commons policies published at
|
||||
creativecommons.org/policies, Creative Commons does not authorize the
|
||||
use of the trademark "Creative Commons" or any other trademark or logo
|
||||
of Creative Commons without its prior written consent including,
|
||||
without limitation, in connection with any unauthorized modifications
|
||||
to any of its public licenses or any other arrangements,
|
||||
understandings, or agreements concerning use of licensed material. For
|
||||
the avoidance of doubt, this paragraph does not form part of the
|
||||
public licenses.
|
||||
|
||||
Creative Commons may be contacted at creativecommons.org.
|
||||
@@ -0,0 +1,8 @@
|
||||
about_resource: scancode-licensedb-index.json
|
||||
download_url: https://raw.githubusercontent.com/aboutcode-org/scancode-licensedb/1dfa89ae348338b23a359c4c6b23e39c128a41e5/docs/index.json
|
||||
spdx_license_list_version: 3.27
|
||||
name: scancode-licensedb-index.json
|
||||
license_expression: cc-by-4.0
|
||||
copyright: Copyright (c) nexB Inc. and others.
|
||||
homepage_url: https://scancode-licensedb.aboutcode.org/
|
||||
notes: Last updated on July 22, 2025
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user