updates
@@ -0,0 +1,51 @@
# Natural Language Toolkit: Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
NLTK Metrics

Classes and methods for scoring processing modules.
"""

from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.aline import align
from nltk.metrics.association import (
    BigramAssocMeasures,
    ContingencyMeasures,
    NgramAssocMeasures,
    QuadgramAssocMeasures,
    TrigramAssocMeasures,
)
from nltk.metrics.confusionmatrix import ConfusionMatrix
from nltk.metrics.distance import (
    binary_distance,
    custom_distance,
    edit_distance,
    edit_distance_align,
    fractional_presence,
    interval_distance,
    jaccard_distance,
    masi_distance,
    presence,
)
from nltk.metrics.paice import Paice
from nltk.metrics.scores import (
    accuracy,
    approxrand,
    f_measure,
    log_likelihood,
    precision,
    recall,
)
from nltk.metrics.segmentation import ghd, pk, windowdiff
from nltk.metrics.spearman import (
    ranks_from_scores,
    ranks_from_sequence,
    spearman_correlation,
)
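A minimal usage sketch of the helpers re-exported above (assumes NLTK is installed; the inputs are invented):

>>> from nltk.metrics import accuracy, edit_distance, masi_distance
>>> round(accuracy("DET NN VB".split(), "DET NN NN".split()), 3)
0.667
>>> edit_distance("rain", "shine")
3
>>> masi_distance({1, 2}, {1, 2, 3, 4})
0.665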
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,467 @@
# Natural Language Toolkit: Agreement Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tom Lippincott <tom@cs.columbia.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
Implementations of inter-annotator agreement coefficients surveyed by Artstein
and Poesio (2007), Inter-Coder Agreement for Computational Linguistics.

An agreement coefficient calculates the amount that annotators agreed on label
assignments beyond what is expected by chance.

In defining the AnnotationTask class, we use naming conventions similar to the
paper's terminology. There are three types of objects in an annotation task:

    the coders (variables "c" and "C")
    the items to be annotated (variables "i" and "I")
    the potential categories to be assigned (variables "k" and "K")

Additionally, it is often the case that we don't want to treat two different
labels as complete disagreement, and so the AnnotationTask constructor can also
take a distance metric as a final argument. Distance metrics are simply
functions that take two arguments, and return a value between 0.0 and 1.0
indicating the distance between them. If not supplied, the default is binary
comparison between the arguments.

The simplest way to initialize an AnnotationTask is with a list of triples,
each containing a coder's assignment for one object in the task:

    task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...])

Note that the data list needs to contain the same number of triples for each
individual coder, containing category values for the same set of items.

Alpha (Krippendorff 1980)
Kappa (Cohen 1960)
S (Bennett, Alpert and Goldstein 1954)
Pi (Scott 1955)


TODO: Describe handling of multiple coders and missing data

Expected results from the Artstein and Poesio survey paper:

>>> from nltk.metrics.agreement import AnnotationTask
>>> import os.path
>>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
>>> t.avg_Ao()
0.88
>>> round(t.pi(), 5)
0.79953
>>> round(t.S(), 2)
0.82

This would have returned a wrong value (0.0) in @785fb79 as coders are in
the wrong order. Subsequently, all values for pi(), S(), and kappa() would
have been wrong as they are computed with avg_Ao().

>>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')])
>>> t2.avg_Ao()
1.0

The following, of course, also works.

>>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')])
>>> t3.avg_Ao()
1.0

"""
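An illustrative sketch of the distance-metric hook described above, using set-valued labels with the MASI distance (coder and item names invented; the 0.501 value follows from MASI's partial-credit weights):

>>> from nltk.metrics.agreement import AnnotationTask
>>> from nltk.metrics.distance import masi_distance
>>> data = [('c1', 'i1', frozenset({'stat'})),
...         ('c2', 'i1', frozenset({'stat', 'othr'})),
...         ('c1', 'i2', frozenset({'othr'})),
...         ('c2', 'i2', frozenset({'othr'}))]
>>> t = AnnotationTask(data=data, distance=masi_distance)
>>> round(t.alpha(), 3)
0.501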
import logging
from itertools import groupby
from operator import itemgetter

from nltk.internals import deprecated
from nltk.metrics.distance import binary_distance
from nltk.probability import ConditionalFreqDist, FreqDist

log = logging.getLogger(__name__)


class AnnotationTask:
    """Represents an annotation task, i.e. people assign labels to items.

    Notation tries to match notation in Artstein and Poesio (2007).

    In general, coders and items can be represented as any hashable object.
    Integers, for example, are fine, though strings are more readable.
    Labels must support the distance functions applied to them, so e.g.
    a string-edit-distance makes no sense if your labels are integers,
    whereas interval distance needs numeric values. A notable case of this
    is the MASI metric, which requires Python sets.
    """

    def __init__(self, data=None, distance=binary_distance):
        """Initialize an annotation task.

        The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
        each representing a coder's labeling of an item:
        ``(coder,item,label)``

        The distance argument is a function taking two arguments (labels) and producing a numerical distance.
        The distance from a label to itself should be zero:
        ``distance(l,l) = 0``
        """
        self.distance = distance
        self.I = set()
        self.K = set()
        self.C = set()
        self.data = []
        if data is not None:
            self.load_array(data)

    def __str__(self):
        return "\r\n".join(
            map(
                lambda x: "%s\t%s\t%s"
                % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
                self.data,
            )
        )

    def load_array(self, array):
        """Load a sequence of annotation results, appending to any data already loaded.

        The argument is a sequence of 3-tuples, each representing a coder's labeling of an item:
        (coder,item,label)
        """
        for coder, item, labels in array:
            self.C.add(coder)
            self.K.add(labels)
            self.I.add(item)
            self.data.append({"coder": coder, "labels": labels, "item": item})

    def agr(self, cA, cB, i, data=None):
        """Agreement between two coders on a given item"""
        data = data or self.data
        # cfedermann: we don't know what combination of coder/item will come
        # first in x; to avoid StopIteration problems due to assuming an order
        # cA,cB, we allow either for k1 and then look up the missing as k2.
        k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i)
        if k1["coder"] == cA:
            k2 = next(x for x in data if x["coder"] == cB and x["item"] == i)
        else:
            k2 = next(x for x in data if x["coder"] == cA and x["item"] == i)

        ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
        log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
        log.debug(
            'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
        )
        return ret

    def Nk(self, k):
        return float(sum(1 for x in self.data if x["labels"] == k))

    def Nik(self, i, k):
        return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))

    def Nck(self, c, k):
        return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))

    @deprecated("Use Nk, Nik or Nck instead")
    def N(self, k=None, i=None, c=None):
        """Implements the "n-notation" used in Artstein and Poesio (2007)"""
        if k is not None and i is None and c is None:
            ret = self.Nk(k)
        elif k is not None and i is not None and c is None:
            ret = self.Nik(i, k)
        elif k is not None and c is not None and i is None:
            ret = self.Nck(c, k)
        else:
            raise ValueError(
                f"You must pass k, plus at most one of i or c! (k={k!r},i={i!r},c={c!r})"
            )
        log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
        return ret

    def _grouped_data(self, field, data=None):
        data = data or self.data
        return groupby(sorted(data, key=itemgetter(field)), itemgetter(field))

    def Ao(self, cA, cB):
        """Observed agreement between two coders on all items."""
        data = self._grouped_data(
            "item", (x for x in self.data if x["coder"] in (cA, cB))
        )
        ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
            self.I
        )
        log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
        return ret

    def _pairwise_average(self, function):
        """
        Calculates the average of function results for each coder pair
        """
        total = 0
        n = 0
        s = self.C.copy()
        for cA in self.C:
            s.remove(cA)
            for cB in s:
                total += function(cA, cB)
                n += 1
        ret = total / n
        return ret

    def avg_Ao(self):
        """Average observed agreement across all coders and items."""
        ret = self._pairwise_average(self.Ao)
        log.debug("Average observed agreement: %f", ret)
        return ret

    def Do_Kw_pairwise(self, cA, cB, max_distance=1.0):
        """The observed disagreement for the weighted kappa coefficient."""
        total = 0.0
        data = (x for x in self.data if x["coder"] in (cA, cB))
        for i, itemdata in self._grouped_data("item", data):
            # we should have two items; distance doesn't care which comes first
            total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])

        ret = total / (len(self.I) * max_distance)
        log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
        return ret

    def Do_Kw(self, max_distance=1.0):
        """Averaged over all labelers"""
        ret = self._pairwise_average(
            lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)
        )
        log.debug("Observed disagreement: %f", ret)
        return ret

    # Agreement Coefficients
    def S(self):
        """Bennett, Alpert and Goldstein 1954"""
        Ae = 1.0 / len(self.K)
        ret = (self.avg_Ao() - Ae) / (1.0 - Ae)
        return ret

    def pi(self):
        """Scott 1955; here, multi-pi.
        Equivalent to K from Siegel and Castellan (1988).
        """
        total = 0.0
        label_freqs = FreqDist(x["labels"] for x in self.data)
        for k, f in label_freqs.items():
            total += f**2
        Ae = total / ((len(self.I) * len(self.C)) ** 2)
        return (self.avg_Ao() - Ae) / (1 - Ae)

    def Ae_kappa(self, cA, cB):
        Ae = 0.0
        nitems = float(len(self.I))
        label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
        for k in label_freqs.conditions():
            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
        return Ae

    def kappa_pairwise(self, cA, cB):
        """Cohen's kappa for a single pair of coders."""
        Ae = self.Ae_kappa(cA, cB)
        ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae)
        log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae)
        return ret

    def kappa(self):
        """Cohen 1960
        Averages naively over kappas for each coder pair.
        """
        return self._pairwise_average(self.kappa_pairwise)

    def multi_kappa(self):
        """Davies and Fleiss 1982
        Averages over observed and expected agreements for each coder pair.
        """
        Ae = self._pairwise_average(self.Ae_kappa)
        return (self.avg_Ao() - Ae) / (1.0 - Ae)

    def Disagreement(self, label_freqs):
        total_labels = sum(label_freqs.values())
        pairs = 0.0
        for j, nj in label_freqs.items():
            for l, nl in label_freqs.items():
                pairs += float(nj * nl) * self.distance(l, j)
        return 1.0 * pairs / (total_labels * (total_labels - 1))

    def alpha(self):
        """Krippendorff 1980"""
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, alpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError("Cannot calculate alpha, only one coder and item present!")

        total_disagreement = 0.0
        total_ratings = 0
        all_valid_labels_freq = FreqDist([])

        total_do = 0.0  # Total observed disagreement for all items.
        for i, itemdata in self._grouped_data("item"):
            label_freqs = FreqDist(x["labels"] for x in itemdata)
            labels_count = sum(label_freqs.values())
            if labels_count < 2:
                # Ignore the item.
                continue
            all_valid_labels_freq += label_freqs
            total_do += self.Disagreement(label_freqs) * labels_count

        if len(all_valid_labels_freq.keys()) == 1:
            log.debug("Only one valid annotation value, alpha returning 1.")
            return 1

        do = total_do / sum(all_valid_labels_freq.values())

        de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
        k_alpha = 1.0 - do / de

        return k_alpha

    def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
        """Cohen 1968"""
        total = 0.0
        label_freqs = ConditionalFreqDist(
            (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
        )
        for j in self.K:
            for l in self.K:
                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
        De = total / (max_distance * pow(len(self.I), 2))
        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
        Do = self.Do_Kw_pairwise(cA, cB)
        ret = 1.0 - (Do / De)
        return ret

    def weighted_kappa(self, max_distance=1.0):
        """Cohen 1968"""
        return self._pairwise_average(
            lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)
        )
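A worked toy example of the coefficient family defined above (labels invented; the values can be checked by hand from the formulas):

>>> t = AnnotationTask(data=[
...     ("c1", "1", "v1"), ("c2", "1", "v1"),
...     ("c1", "2", "v1"), ("c2", "2", "v2"),
...     ("c1", "3", "v2"), ("c2", "3", "v2"),
... ])
>>> round(t.avg_Ao(), 3)  # two of the three items match
0.667
>>> round(t.S(), 3)       # corrects by uniform chance agreement 1/len(K) = 0.5
0.333
>>> round(t.kappa(), 3)   # corrects by per-coder label frequencies (Ae = 4/9)
0.4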
if __name__ == "__main__":
    import optparse
    import re

    from nltk.metrics import distance

    # process command-line arguments
    parser = optparse.OptionParser()
    parser.add_option(
        "-d",
        "--distance",
        dest="distance",
        default="binary_distance",
        help="distance metric to use",
    )
    parser.add_option(
        "-a",
        "--agreement",
        dest="agreement",
        default="kappa",
        help="agreement coefficient to calculate",
    )
    parser.add_option(
        "-e",
        "--exclude",
        dest="exclude",
        action="append",
        default=[],
        help="coder names to exclude (may be specified multiple times)",
    )
    parser.add_option(
        "-i",
        "--include",
        dest="include",
        action="append",
        default=[],
        help="coder names to include, same format as exclude",
    )
    parser.add_option(
        "-f",
        "--file",
        dest="file",
        help="file to read labelings from, each line with three columns: 'labeler item labels'",
    )
    parser.add_option(
        "-v",
        "--verbose",
        dest="verbose",
        default="0",
        help="how much debugging to print on stderr (0-4)",
    )
    parser.add_option(
        "-c",
        "--columnsep",
        dest="columnsep",
        default="\t",
        help="char/string that separates the three columns in the file, defaults to tab",
    )
    parser.add_option(
        "-l",
        "--labelsep",
        dest="labelsep",
        default=",",
        help="char/string that separates labels (if labelers can assign more than one), defaults to comma",
    )
    parser.add_option(
        "-p",
        "--presence",
        dest="presence",
        default=None,
        help="convert each labeling into 1 or 0, based on presence of LABEL",
    )
    parser.add_option(
        "-T",
        "--thorough",
        dest="thorough",
        default=False,
        action="store_true",
        help="calculate agreement for every subset of the annotators",
    )
    (options, remainder) = parser.parse_args()

    if not options.file:
        parser.print_help()
        exit()

    logging.basicConfig(level=50 - 10 * int(options.verbose))

    # read in data from the specified file
    data = []
    with open(options.file) as infile:
        for l in infile:
            toks = l.split(options.columnsep)
            coder, object_, labels = (
                toks[0],
                str(toks[1:-1]),
                frozenset(toks[-1].strip().split(options.labelsep)),
            )
            if (
                (options.include == options.exclude)
                or (len(options.include) > 0 and coder in options.include)
                or (len(options.exclude) > 0 and coder not in options.exclude)
            ):
                data.append((coder, object_, labels))

    if options.presence:
        task = AnnotationTask(
            data, getattr(distance, options.distance)(options.presence)
        )
    else:
        task = AnnotationTask(data, getattr(distance, options.distance))

    if options.thorough:
        pass
    else:
        print(getattr(task, options.agreement)())

    logging.shutdown()
Backend/venv/lib/python3.12/site-packages/nltk/metrics/aline.py (1597 lines, new file)
File diff suppressed because it is too large
@@ -0,0 +1,476 @@
# Natural Language Toolkit: Ngram Association Measures
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
"""

import math as _math
from abc import ABCMeta, abstractmethod
from functools import reduce

_log2 = lambda x: _math.log2(x)
_ln = _math.log

_product = lambda s: reduce(lambda x, y: x * y, s)

_SMALL = 1e-20

try:
    from scipy.stats import fisher_exact
except ImportError:

    def fisher_exact(*_args, **_kwargs):
        raise NotImplementedError


### Indices to marginals arguments:

NGRAM = 0
"""Marginals index for the ngram count"""

UNIGRAMS = -2
"""Marginals index for a tuple of each unigram count"""

TOTAL = -1
"""Marginals index for the number of words in the data"""


class NgramAssocMeasures(metaclass=ABCMeta):
    """
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    """

    _n = 0

    @staticmethod
    @abstractmethod
    def _contingency(*marginals):
        """Calculates values of a contingency table from marginal values."""
        raise NotImplementedError(
            "The contingency table is not available in the general ngram case"
        )

    @staticmethod
    @abstractmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values."""
        raise NotImplementedError(
            "The contingency table is not available in the general ngram case"
        )

    @classmethod
    def _expected_values(cls, cont):
        """Calculates expected values for a contingency table."""
        n_all = sum(cont)
        bits = [1 << i for i in range(cls._n)]

        # For each contingency table cell
        for i in range(len(cont)):
            # Yield the expected value
            yield (
                _product(
                    sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
                    for j in bits
                )
                / (n_all ** (cls._n - 1))
            )

    @staticmethod
    def raw_freq(*marginals):
        """Scores ngrams by their frequency"""
        return marginals[NGRAM] / marginals[TOTAL]

    @classmethod
    def student_t(cls, *marginals):
        """Scores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        """
        return (
            marginals[NGRAM]
            - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
        ) / (marginals[NGRAM] + _SMALL) ** 0.5

    @classmethod
    def chi_sq(cls, *marginals):
        """Scores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        """
        cont = cls._contingency(*marginals)
        exps = cls._expected_values(cont)
        return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))

    @staticmethod
    def mi_like(*marginals, **kwargs):
        """Scores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        """
        return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
            marginals[UNIGRAMS]
        )

    @classmethod
    def pmi(cls, *marginals):
        """Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        """
        return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
            _product(marginals[UNIGRAMS])
        )

    @classmethod
    def likelihood_ratio(cls, *marginals):
        """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
        cont = cls._contingency(*marginals)
        return 2 * sum(
            obs * _ln(obs / (exp + _SMALL) + _SMALL)
            for obs, exp in zip(cont, cls._expected_values(cont))
        )

    @classmethod
    def poisson_stirling(cls, *marginals):
        """Scores ngrams using the Poisson-Stirling measure."""
        exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
        return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)

    @classmethod
    def jaccard(cls, *marginals):
        """Scores ngrams using the Jaccard index."""
        cont = cls._contingency(*marginals)
        return cont[0] / sum(cont[:-1])


class BigramAssocMeasures(NgramAssocMeasures):
    """
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
    - n_ix counts ``(w1, *)``
    - n_xi counts ``(*, w2)``
    - n_xx counts ``(*, *)``, i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    """

    _n = 2

    @staticmethod
    def _contingency(n_ii, n_ix_xi_tuple, n_xx):
        """Calculates values of a bigram contingency table from marginal values."""
        (n_ix, n_xi) = n_ix_xi_tuple
        n_oi = n_xi - n_ii
        n_io = n_ix - n_ii
        return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)

    @staticmethod
    def _marginals(n_ii, n_oi, n_io, n_oo):
        """Calculates values of contingency table marginals from its values."""
        return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)

    @staticmethod
    def _expected_values(cont):
        """Calculates expected values for a contingency table."""
        n_xx = sum(cont)
        # For each contingency table cell
        for i in range(4):
            yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx

    @classmethod
    def phi_sq(cls, *marginals):
        """Scores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        """
        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

        return (n_ii * n_oo - n_io * n_oi) ** 2 / (
            (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
        )

    @classmethod
    def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
        """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        """
        (n_ix, n_xi) = n_ix_xi_tuple
        return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)

    @classmethod
    def fisher(cls, *marginals):
        """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        """

        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
        return pvalue

    @staticmethod
    def dice(n_ii, n_ix_xi_tuple, n_xx):
        """Scores bigrams using Dice's coefficient."""
        (n_ix, n_xi) = n_ix_xi_tuple
        return 2 * n_ii / (n_ix + n_xi)
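A sketch of scoring a single bigram from its marginals (counts invented: the bigram occurs 20 times, its two words 40 and 25 times, in a corpus of 10000 words):

>>> bam = BigramAssocMeasures
>>> bam.raw_freq(20, (40, 25), 10000)
0.002
>>> round(bam.pmi(20, (40, 25), 10000), 4)   # log2(20 * 10000 / (40 * 25))
7.6439
>>> round(bam.dice(20, (40, 25), 10000), 4)  # 2 * 20 / (40 + 25)
0.6154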
class TrigramAssocMeasures(NgramAssocMeasures):
    """
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
    - n_ixx counts ``(w1, *, *)``
    - n_xxx counts ``(*, *, *)``, i.e. any trigram
    """

    _n = 3

    @staticmethod
    def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
        """Calculates values of a trigram contingency table (or cube) from
        marginal values.

        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        """
        (n_iix, n_ixi, n_xii) = n_iix_tuple
        (n_ixx, n_xix, n_xxi) = n_ixx_tuple
        n_oii = n_xii - n_iii
        n_ioi = n_ixi - n_iii
        n_iio = n_iix - n_iii
        n_ooi = n_xxi - n_iii - n_oii - n_ioi
        n_oio = n_xix - n_iii - n_oii - n_iio
        n_ioo = n_ixx - n_iii - n_ioi - n_iio
        n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo

        return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)

    @staticmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values.

        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        """
        n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
        return (
            n_iii,
            (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
            (
                n_iii + n_ioi + n_iio + n_ioo,
                n_iii + n_oii + n_iio + n_oio,
                n_iii + n_oii + n_ioi + n_ooi,
            ),
            sum(contingency),
        )
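As the doctests above show, ``_contingency`` and ``_marginals`` are inverses; the same round trip in plain code:

>>> cont = TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
>>> TrigramAssocMeasures._marginals(*cont)
(1, (1, 1, 1), (1, 73, 1), 2000)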
class QuadgramAssocMeasures(NgramAssocMeasures):
    """
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        quadgram_score_fn(n_iiii,
                          (n_iiix, n_iixi, n_ixii, n_xiii),
                          (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                          (n_ixxx, n_xixx, n_xxix, n_xxxi),
                          n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
    - n_ixxi counts ``(w1, *, *, w4)``
    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
    """

    _n = 4

    @staticmethod
    def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
        """Calculates values of a quadgram contingency table from
        marginal values.
        """
        (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
        (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
        n_oiii = n_xiii - n_iiii
        n_ioii = n_ixii - n_iiii
        n_iioi = n_iixi - n_iiii
        n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
        n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
        n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
        n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
        n_iiio = n_iiix - n_iiii
        n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
        n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
        n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
        n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
        n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
        n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
        n_oooo = (
            n_xxxx
            - n_iiii
            - n_oiii
            - n_ioii
            - n_iioi
            - n_ooii
            - n_oioi
            - n_iooi
            - n_oooi
            - n_iiio
            - n_oiio
            - n_ioio
            - n_ooio
            - n_iioo
            - n_oioo
            - n_iooo
        )

        return (
            n_iiii,
            n_oiii,
            n_ioii,
            n_ooii,
            n_iioi,
            n_oioi,
            n_iooi,
            n_oooi,
            n_iiio,
            n_oiio,
            n_ioio,
            n_ooio,
            n_iioo,
            n_oioo,
            n_iooo,
            n_oooo,
        )

    @staticmethod
    def _marginals(*contingency):
        """Calculates values of contingency table marginals from its values.

        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        """
        (
            n_iiii,
            n_oiii,
            n_ioii,
            n_ooii,
            n_iioi,
            n_oioi,
            n_iooi,
            n_oooi,
            n_iiio,
            n_oiio,
            n_ioio,
            n_ooio,
            n_iioo,
            n_oioo,
            n_iooo,
            n_oooo,
        ) = contingency

        n_iiix = n_iiii + n_iiio
        n_iixi = n_iiii + n_iioi
        n_ixii = n_iiii + n_ioii
        n_xiii = n_iiii + n_oiii

        n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
        n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
        n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
        n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
        n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
        n_xiix = n_iiii + n_oiii + n_iiio + n_oiio

        n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
        n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
        n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
        n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi

        n_all = sum(contingency)

        return (
            n_iiii,
            (n_iiix, n_iixi, n_ixii, n_xiii),
            (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
            (n_ixxx, n_xixx, n_xxix, n_xxxi),
            n_all,
        )


class ContingencyMeasures:
    """Wraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    """

    def __init__(self, measures):
        """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
        self.__class__.__name__ = "Contingency" + measures.__class__.__name__
        for k in dir(measures):
            if k.startswith("__"):
                continue
            v = getattr(measures, k)
            if not k.startswith("_"):
                v = self._make_contingency_fn(measures, v)
            setattr(self, k, v)

    @staticmethod
    def _make_contingency_fn(measures, old_fn):
        """From an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        """

        def res(*contingency):
            return old_fn(*measures._marginals(*contingency))

        res.__doc__ = old_fn.__doc__
        res.__name__ = old_fn.__name__
        return res
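A sketch of the wrapper in use, reusing the invented bigram counts from earlier: the four contingency cells ``(n_ii, n_oi, n_io, n_oo) = (20, 5, 20, 9955)`` carry the same information as the marginals ``(20, (25, 40), 10000)``, so the wrapped measure agrees with the marginal form:

>>> cont = ContingencyMeasures(BigramAssocMeasures)
>>> round(cont.pmi(20, 5, 20, 9955), 4)
7.6439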
@@ -0,0 +1,351 @@
# Natural Language Toolkit: Confusion Matrices
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         Tom Aarsen <>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.probability import FreqDist


class ConfusionMatrix:
    """
    The confusion matrix between a list of reference values and a
    corresponding list of test values. Entry *[r,t]* of this
    matrix is a count of the number of times that the reference value
    *r* corresponds to the test value *t*. E.g.:

        >>> from nltk.metrics import ConfusionMatrix
        >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
        >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
        >>> cm = ConfusionMatrix(ref, test)
        >>> print(cm['NN', 'NN'])
        3

    Note that the diagonal entries *Ri=Tj* of this matrix
    correspond to correct values; and the off-diagonal entries
    correspond to incorrect values.
    """

    def __init__(self, reference, test, sort_by_count=False):
        """
        Construct a new confusion matrix from a list of reference
        values and a corresponding list of test values.

        :type reference: list
        :param reference: An ordered list of reference values.
        :type test: list
        :param test: A list of values to compare against the
            corresponding reference values.
        :raise ValueError: If ``reference`` and ``test`` do not have
            the same length.
        """
        if len(reference) != len(test):
            raise ValueError("Lists must have the same length.")

        # Get a list of all values.
        if sort_by_count:
            ref_fdist = FreqDist(reference)
            test_fdist = FreqDist(test)

            def key(v):
                return -(ref_fdist[v] + test_fdist[v])

            values = sorted(set(reference + test), key=key)
        else:
            values = sorted(set(reference + test))

        # Construct a value->index dictionary
        indices = {val: i for (i, val) in enumerate(values)}

        # Make a confusion matrix table.
        confusion = [[0 for _ in values] for _ in values]
        max_conf = 0  # Maximum confusion
        for w, g in zip(reference, test):
            confusion[indices[w]][indices[g]] += 1
            max_conf = max(max_conf, confusion[indices[w]][indices[g]])

        #: A list of all values in ``reference`` or ``test``.
        self._values = values
        #: A dictionary mapping values in ``self._values`` to their indices.
        self._indices = indices
        #: The confusion matrix itself (as a list of lists of counts).
        self._confusion = confusion
        #: The greatest count in ``self._confusion`` (used for printing).
        self._max_conf = max_conf
        #: The total number of values in the confusion matrix.
        self._total = len(reference)
        #: The number of correct (on-diagonal) values in the matrix.
        self._correct = sum(confusion[i][i] for i in range(len(values)))

    def __getitem__(self, li_lj_tuple):
        """
        :return: The number of times that value ``li`` was expected and
            value ``lj`` was given.
        :rtype: int
        """
        (li, lj) = li_lj_tuple
        i = self._indices[li]
        j = self._indices[lj]
        return self._confusion[i][j]

    def __repr__(self):
        return f"<ConfusionMatrix: {self._correct}/{self._total} correct>"

    def __str__(self):
        return self.pretty_format()

    def pretty_format(
        self,
        show_percents=False,
        values_in_chart=True,
        truncate=None,
        sort_by_count=False,
    ):
        """
        :return: A multi-line string representation of this confusion matrix.
        :type truncate: int
        :param truncate: If specified, then only show the specified
            number of values. Any sorting (e.g., sort_by_count)
            will be performed before truncation.
        :param sort_by_count: If true, then sort by the count of each
            label in the reference data. I.e., labels that occur more
            frequently in the reference data will be towards the left
            edge of the matrix, and labels that occur less frequently
            will be towards the right edge.

        @todo: add marginals?
        """
        confusion = self._confusion

        values = self._values
        if sort_by_count:
            values = sorted(
                values, key=lambda v: -sum(self._confusion[self._indices[v]])
            )

        if truncate:
            values = values[:truncate]

        if values_in_chart:
            value_strings = ["%s" % val for val in values]
        else:
            value_strings = [str(n + 1) for n in range(len(values))]

        # Construct a format string for row values
        valuelen = max(len(val) for val in value_strings)
        value_format = "%" + repr(valuelen) + "s | "
        # Construct a format string for matrix entries
        if show_percents:
            entrylen = 6
            entry_format = "%5.1f%%"
            zerostr = "     ."
        else:
            entrylen = len(repr(self._max_conf))
            entry_format = "%" + repr(entrylen) + "d"
            zerostr = " " * (entrylen - 1) + "."

        # Write the column values.
        s = ""
        for i in range(valuelen):
            s += (" " * valuelen) + " |"
            for val in value_strings:
                if i >= valuelen - len(val):
                    s += val[i - valuelen + len(val)].rjust(entrylen + 1)
                else:
                    s += " " * (entrylen + 1)
            s += " |\n"

        # Write a dividing line
        s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))

        # Write the entries.
        for val, li in zip(value_strings, values):
            i = self._indices[li]
            s += value_format % val
            for lj in values:
                j = self._indices[lj]
                if confusion[i][j] == 0:
                    s += zerostr
                elif show_percents:
                    s += entry_format % (100.0 * confusion[i][j] / self._total)
                else:
                    s += entry_format % confusion[i][j]
                if i == j:
                    prevspace = s.rfind(" ")
                    s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
                else:
                    s += " "
            s += "|\n"

        # Write a dividing line
        s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))

        # Write a key
        s += "(row = reference; col = test)\n"
        if not values_in_chart:
            s += "Value key:\n"
            for i, value in enumerate(values):
                s += "%6d: %s\n" % (i + 1, value)

        return s

    def key(self):
        values = self._values
        str = "Value key:\n"
        indexlen = len(repr(len(values) - 1))
        key_format = "  %" + repr(indexlen) + "d: %s\n"
        str += "".join([key_format % (i, values[i]) for i in range(len(values))])
        return str

    def recall(self, value):
        """Given a value in the confusion matrix, return the recall
        that corresponds to this value. The recall is defined as:

        - *r* = true positive / (true positive + false negative)

        and can loosely be considered the ratio of how often ``value``
        was predicted correctly relative to how often ``value`` was
        the true result.

        :param value: value used in the ConfusionMatrix
        :return: the recall corresponding to ``value``.
        :rtype: float
        """
        # Number of times `value` was correct, and also predicted
        TP = self[value, value]
        # Number of times `value` was correct
        TP_FN = sum(self[value, pred_value] for pred_value in self._values)
        if TP_FN == 0:
            return 0.0
        return TP / TP_FN

    def precision(self, value):
        """Given a value in the confusion matrix, return the precision
        that corresponds to this value. The precision is defined as:

        - *p* = true positive / (true positive + false positive)

        and can loosely be considered the ratio of how often ``value``
        was predicted correctly relative to the number of predictions
        for ``value``.

        :param value: value used in the ConfusionMatrix
        :return: the precision corresponding to ``value``.
        :rtype: float
        """
        # Number of times `value` was correct, and also predicted
        TP = self[value, value]
        # Number of times `value` was predicted
        TP_FP = sum(self[real_value, value] for real_value in self._values)
        if TP_FP == 0:
            return 0.0
        return TP / TP_FP

    def f_measure(self, value, alpha=0.5):
        """
        Given a value used in the confusion matrix, return the f-measure
        that corresponds to this value. The f-measure is the harmonic mean
        of the ``precision`` and ``recall``, weighted by ``alpha``.
        In particular, given the precision *p* and recall *r* defined by:

        - *p* = true positive / (true positive + false positive)
        - *r* = true positive / (true positive + false negative)

        The f-measure is:

        - *1/(alpha/p + (1-alpha)/r)*

        With ``alpha = 0.5``, this reduces to:

        - *2pr / (p + r)*

        :param value: value used in the ConfusionMatrix
        :param alpha: Ratio of the cost of false negatives compared to false
            positives. Defaults to 0.5, where the costs are equal.
        :type alpha: float
        :return: the F-measure corresponding to ``value``.
        :rtype: float
        """
        p = self.precision(value)
        r = self.recall(value)
        if p == 0.0 or r == 0.0:
            return 0.0
        return 1.0 / (alpha / p + (1 - alpha) / r)

    def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False):
        """
        Tabulate the **recall**, **precision** and **f-measure**
        for each value in this confusion matrix.

        >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split()
        >>> test = "DET VB VB DET NN NN NN IN DET NN".split()
        >>> cm = ConfusionMatrix(reference, test)
        >>> print(cm.evaluate())
        Tag | Prec.  | Recall | F-measure
        ----+--------+--------+-----------
        DET | 1.0000 | 1.0000 | 1.0000
         IN | 1.0000 | 1.0000 | 1.0000
         JJ | 0.0000 | 0.0000 | 0.0000
         NN | 0.7500 | 0.7500 | 0.7500
         VB | 0.5000 | 1.0000 | 0.6667
        <BLANKLINE>

        :param alpha: Ratio of the cost of false negatives compared to false
            positives, as used in the f-measure computation. Defaults to 0.5,
            where the costs are equal.
        :type alpha: float
        :param truncate: If specified, then only show the specified
            number of values. Any sorting (e.g., sort_by_count)
            will be performed before truncation. Defaults to None
        :type truncate: int, optional
        :param sort_by_count: Whether to sort the outputs on frequency
            in the reference label. Defaults to False.
        :type sort_by_count: bool, optional
        :return: A tabulated recall, precision and f-measure string
        :rtype: str
        """
        tags = self._values

        # Apply keyword parameters
        if sort_by_count:
            tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]]))
        if truncate:
            tags = tags[:truncate]

        tag_column_len = max(max(len(tag) for tag in tags), 3)

        # Construct the header
        s = (
            f"{' ' * (tag_column_len - 3)}Tag | Prec.  | Recall | F-measure\n"
            f"{'-' * tag_column_len}-+--------+--------+-----------\n"
        )

        # Construct the body
        for tag in tags:
            s += (
                f"{tag:>{tag_column_len}} | "
                f"{self.precision(tag):<6.4f} | "
                f"{self.recall(tag):<6.4f} | "
                f"{self.f_measure(tag, alpha=alpha):.4f}\n"
            )

        return s


def demo():
    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
    test = "DET VB VB DET NN NN NN IN DET NN".split()
    print("Reference =", reference)
    print("Test =", test)
    print("Confusion matrix:")
    print(ConfusionMatrix(reference, test))
    print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))

    print(ConfusionMatrix(reference, test).recall("VB"))


if __name__ == "__main__":
    demo()
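A worked check of the precision/recall definitions above, on the same reference/test lists as the demo:

>>> ref = "DET NN VB DET JJ NN NN IN DET NN".split()
>>> test = "DET VB VB DET NN NN NN IN DET NN".split()
>>> cm = ConfusionMatrix(ref, test)
>>> cm.precision("NN")  # 3 of the 4 NN predictions are correct
0.75
>>> cm.recall("VB")     # the one true VB is predicted as VB
1.0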
@@ -0,0 +1,508 @@
# Natural Language Toolkit: Distance Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         Tom Lippincott <tom@cs.columbia.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
Distance Metrics.

Compute the distance between two items (usually strings).
As metrics, they must satisfy the following three requirements:

1. d(a, a) = 0
2. d(a, b) >= 0
3. d(a, c) <= d(a, b) + d(b, c)
"""

import operator
import warnings


def _edit_dist_init(len1, len2):
    lev = []
    for i in range(len1):
        lev.append([0] * len2)  # initialize 2D array to zero
    for i in range(len1):
        lev[i][0] = i  # column 0: 0,1,2,3,4,...
    for j in range(len2):
        lev[0][j] = j  # row 0: 0,1,2,3,4,...
    return lev


def _last_left_t_init(sigma):
    return {c: 0 for c in sigma}


def _edit_dist_step(
    lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False
):
    c1 = s1[i - 1]
    c2 = s2[j - 1]

    # skipping a character in s1
    a = lev[i - 1][j] + 1
    # skipping a character in s2
    b = lev[i][j - 1] + 1
    # substitution
    c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0)

    # transposition
    d = c + 1  # never picked by default
    if transpositions and last_left > 0 and last_right > 0:
        d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1

    # pick the cheapest
    lev[i][j] = min(a, b, c, d)


def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
    """
    Calculate the Levenshtein edit-distance between two strings.
    The edit distance is the number of characters that need to be
    substituted, inserted, or deleted, to transform s1 into s2. For
    example, transforming "rain" to "shine" requires three steps,
    consisting of two substitutions and one insertion:
    "rain" -> "sain" -> "shin" -> "shine". These operations could have
    been done in other orders, but at least three steps are needed.

    Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
    because sometimes it makes sense to assign greater penalties to
    substitutions.

    This also optionally allows transposition edits (e.g., "ab" -> "ba"),
    though this is disabled by default.

    :param s1, s2: The strings to be analysed
    :param transpositions: Whether to allow transposition edits
    :type s1: str
    :type s2: str
    :type substitution_cost: int
    :type transpositions: bool
    :rtype: int
    """
    # set up a 2-D array
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)

    # retrieve alphabet
    sigma = set()
    sigma.update(s1)
    sigma.update(s2)

    # set up table to remember positions of last seen occurrence in s1
    last_left_t = _last_left_t_init(sigma)

    # iterate over the array
    # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code
    # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    for i in range(1, len1 + 1):
        last_right_buf = 0
        for j in range(1, len2 + 1):
            last_left = last_left_t[s2[j - 1]]
            last_right = last_right_buf
            if s1[i - 1] == s2[j - 1]:
                last_right_buf = j
            _edit_dist_step(
                lev,
                i,
                j,
                s1,
                s2,
                last_left,
                last_right,
                substitution_cost=substitution_cost,
                transpositions=transpositions,
            )
        last_left_t[s1[i - 1]] = i
    return lev[len1][len2]
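An illustrative sketch of the flag described in the docstring: ``transpositions=True`` turns plain Levenshtein distance into Damerau-Levenshtein distance, where an adjacent swap costs one edit instead of two:

>>> edit_distance("rain", "shine")
3
>>> edit_distance("abc", "acb")
2
>>> edit_distance("abc", "acb", transpositions=True)
1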
def _edit_dist_backtrace(lev):
    i, j = len(lev) - 1, len(lev[0]) - 1
    alignment = [(i, j)]

    while (i, j) != (0, 0):
        directions = [
            (i - 1, j - 1),  # substitution
            (i - 1, j),  # skip s1
            (i, j - 1),  # skip s2
        ]

        direction_costs = (
            (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
            for i, j in directions
        )
        _, (i, j) = min(direction_costs, key=operator.itemgetter(0))

        alignment.append((i, j))
    return list(reversed(alignment))


def edit_distance_align(s1, s2, substitution_cost=1):
    """
    Calculate the minimum Levenshtein edit-distance based alignment
    mapping between two strings. The alignment finds the mapping
    from string s1 to s2 that minimizes the edit distance cost.
    For example, mapping "rain" to "shine" would involve 2
    substitutions, 2 matches and an insertion resulting in
    the following mapping:
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
    NB: (0, 0) is the start state without any letters associated
    See more: https://web.stanford.edu/class/cs124/lec/med.pdf

    In case of multiple valid minimum-distance alignments, the
    backtrace has the following operation precedence:

    1. Substitute s1 and s2 characters
    2. Skip s1 character
    3. Skip s2 character

    The backtrace is carried out in reverse string order.

    This function does not support transposition.

    :param s1, s2: The strings to be aligned
    :type s1: str
    :type s2: str
    :type substitution_cost: int
    :rtype: List[Tuple(int, int)]
    """
    # set up a 2-D array
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)

    # iterate over the array
    for i in range(len1):
        for j in range(len2):
            _edit_dist_step(
                lev,
                i + 1,
                j + 1,
                s1,
                s2,
                0,
                0,
                substitution_cost=substitution_cost,
                transpositions=False,
            )

    # backtrace to find alignment
    alignment = _edit_dist_backtrace(lev)
    return alignment


def binary_distance(label1, label2):
    """Simple equality test.

    0.0 if the labels are identical, 1.0 if they are different.

    >>> from nltk.metrics import binary_distance
    >>> binary_distance(1,1)
    0.0

    >>> binary_distance(1,3)
    1.0
    """

    return 0.0 if label1 == label2 else 1.0


def jaccard_distance(label1, label2):
    """Distance metric comparing set-similarity."""
    return (len(label1.union(label2)) - len(label1.intersection(label2))) / len(
        label1.union(label2)
    )


def masi_distance(label1, label2):
    """Distance metric that takes into account partial agreement when multiple
    labels are assigned.

    >>> from nltk.metrics import masi_distance
    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
    0.665

    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
    for Semantic and Pragmatic Annotation.
    """

    len_intersection = len(label1.intersection(label2))
    len_union = len(label1.union(label2))
    len_label1 = len(label1)
    len_label2 = len(label2)
    if len_label1 == len_label2 and len_label1 == len_intersection:
        m = 1
    elif len_intersection == min(len_label1, len_label2):
        m = 0.67
    elif len_intersection > 0:
        m = 0.33
    else:
        m = 0

    return 1 - len_intersection / len_union * m


def interval_distance(label1, label2):
    """Krippendorff's interval distance metric

    >>> from nltk.metrics import interval_distance
    >>> interval_distance(1,10)
    81

    Krippendorff 1980, Content Analysis: An Introduction to its Methodology
    """

    try:
        return pow(label1 - label2, 2)
    #        return pow(list(label1)[0]-list(label2)[0],2)
    except:
        print("non-numeric labels not supported with interval distance")


def presence(label):
    """Higher-order function to test presence of a given label"""

    return lambda x, y: 1.0 * ((label in x) == (label in y))
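A sketch of the factory above: ``presence("stat")`` compares two label sets only on whether they contain ``"stat"``. Note that it returns 1.0 on agreement, i.e. it behaves as a similarity score rather than a distance like the other metrics in this module:

>>> agree = presence("stat")
>>> agree({"stat", "othr"}, {"stat"})
1.0
>>> agree({"stat"}, {"othr"})
0.0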
def fractional_presence(label):
    return (
        lambda x, y: abs((1.0 / len(x)) - (1.0 / len(y))) * (label in x and label in y)
        or 0.0 * (label not in x and label not in y)
        or abs(1.0 / len(x)) * (label in x and label not in y)
        or (1.0 / len(y)) * (label not in x and label in y)
    )


def custom_distance(file):
    data = {}
    with open(file) as infile:
        for l in infile:
            labelA, labelB, dist = l.strip().split("\t")
            labelA = frozenset([labelA])
            labelB = frozenset([labelB])
            data[frozenset([labelA, labelB])] = float(dist)
    return lambda x, y: data[frozenset([x, y])]
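The file read by ``custom_distance`` holds one tab-separated ``labelA<TAB>labelB<TAB>distance`` row per label pair; hypothetical contents might look like:

    hot	warm	0.25
    hot	cold	1.0

Since the loader wraps each label in a ``frozenset``, the returned lookup function appears to expect frozenset-wrapped labels, matching how the ``__main__`` block of agreement.py builds its data.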
def jaro_similarity(s1, s2):
|
||||
"""
|
||||
Computes the Jaro similarity between 2 sequences from:
|
||||
|
||||
Matthew A. Jaro (1989). Advances in record linkage methodology
|
||||
as applied to the 1985 census of Tampa Florida. Journal of the
|
||||
American Statistical Association. 84 (406): 414-20.
|
||||
|
||||
The Jaro distance between is the min no. of single-character transpositions
|
||||
required to change one word into another. The Jaro similarity formula from
|
||||
https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :
|
||||
|
||||
``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/s_2 + (m-t)/m)``
|
||||
|
||||
where
|
||||
- `|s_i|` is the length of string `s_i`
|
||||
- `m` is the no. of matching characters
|
||||
- `t` is the half no. of possible transpositions.
|
||||
"""
|
||||
# First, store the length of the strings
|
||||
# because they will be re-used several times.
|
||||
len_s1, len_s2 = len(s1), len(s2)
|
||||
|
||||
# The upper bound of the distance for being a matched character.
|
||||
match_bound = max(len_s1, len_s2) // 2 - 1
|
||||
|
||||
# Initialize the counts for matches and transpositions.
|
||||
matches = 0 # no.of matched characters in s1 and s2
|
||||
transpositions = 0 # no. of transpositions between s1 and s2
|
||||
flagged_1 = [] # positions in s1 which are matches to some character in s2
|
||||
flagged_2 = [] # positions in s2 which are matches to some character in s1
|
||||
|
||||
# Iterate through sequences, check for matches and compute transpositions.
|
||||
for i in range(len_s1): # Iterate through each character.
|
||||
upperbound = min(i + match_bound, len_s2 - 1)
|
||||
lowerbound = max(0, i - match_bound)
|
||||
for j in range(lowerbound, upperbound + 1):
|
||||
if s1[i] == s2[j] and j not in flagged_2:
|
||||
matches += 1
|
||||
flagged_1.append(i)
|
||||
flagged_2.append(j)
|
||||
break
|
||||
flagged_2.sort()
|
||||
for i, j in zip(flagged_1, flagged_2):
|
||||
if s1[i] != s2[j]:
|
||||
transpositions += 1
|
||||
|
||||
if matches == 0:
|
||||
return 0
|
||||
else:
|
||||
return (
|
||||
1
|
||||
/ 3
|
||||
* (
|
||||
matches / len_s1
|
||||
+ matches / len_s2
|
||||
+ (matches - transpositions // 2) / matches
|
||||
)
|
||||
)
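

# A minimal usage sketch added for illustration (not part of the NLTK
# source): "dwayne"/"duane" share m = 4 matching characters and no
# transpositions, so jaro_sim = 1/3 * (4/6 + 4/5 + 4/4) = 0.822...
def _jaro_similarity_example():
    assert round(jaro_similarity("dwayne", "duane"), 3) == 0.822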


def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
    """
    The Jaro-Winkler similarity is an extension of the Jaro similarity in:

        William E. Winkler. 1990. String Comparator Metrics and Enhanced
        Decision Rules in the Fellegi-Sunter Model of Record Linkage.
        Proceedings of the Section on Survey Research Methods.
        American Statistical Association: 354-359.

    such that:

        jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )

    where,

        - jaro_sim is the output from the Jaro Similarity,
          see jaro_similarity()
        - l is the length of the common prefix at the start of the strings
            - this implementation provides an upperbound for the l value
              to keep the prefixes. A common value of this upperbound is 4.
        - p is the constant scaling factor to overweigh common prefixes.
          The Jaro-Winkler similarity will fall within the [0, 1] bound,
          given that max(p) <= 0.25; the default is p=0.1 in Winkler (1990)

    Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
    from "Table 5 Comparison of String Comparators Rescaled between 0 and 1"

    >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
    ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
    ... ("dixon", "dickson"), ("billy", "susan")]

    >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
    >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]

    One way to match the values on the Winkler's paper is to provide a different
    p scaling factor for different pairs of strings, e.g.

    >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]

    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore

    Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
    "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names"

    >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
    ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
    ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
    ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
    ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
    ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
    ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
    ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]

    >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
    ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
    ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]

    >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
    ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
    ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]

    One way to match the values on the Winkler's paper is to provide a different
    p scaling factor for different pairs of strings, e.g.

    >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
    ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
    ...     if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
    ...         continue  # Skip bad examples from the paper.
    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore

    This test-case proves that the output of Jaro-Winkler similarity depends on
    the product l * p and not on the product max_l * p. Here the product max_l * p > 1,
    however the product l * p <= 1

    >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
    0.88
    """
    # To ensure that the output of the Jaro-Winkler similarity falls between
    # [0, 1], the product l * p needs to fall between [0, 1] as well.
    if not 0 <= max_l * p <= 1:
        warnings.warn(
            "The product `max_l * p` might not fall between [0,1]. "
            "Jaro-Winkler similarity might not be between 0 and 1."
        )

    # Compute the Jaro similarity.
    jaro_sim = jaro_similarity(s1, s2)

    # Compute the length of the common prefix, capped at max_l.
    l = 0
    # zip() will automatically loop until the end of the shorter string.
    for s1_i, s2_i in zip(s1, s2):
        if s1_i == s2_i:
            l += 1
        else:
            break
        if l == max_l:
            break
    # Return the similarity value as described in the docstring.
    return jaro_sim + (l * p * (1 - jaro_sim))


def demo():
    string_distance_examples = [
        ("rain", "shine"),
        ("abcdef", "acbdef"),
        ("language", "lnaguaeg"),
        ("language", "lnaugage"),
        ("language", "lngauage"),
    ]
    for s1, s2 in string_distance_examples:
        print(f"Edit distance btwn '{s1}' and '{s2}':", edit_distance(s1, s2))
        print(
            f"Edit dist with transpositions btwn '{s1}' and '{s2}':",
            edit_distance(s1, s2, transpositions=True),
        )
        print(f"Jaro similarity btwn '{s1}' and '{s2}':", jaro_similarity(s1, s2))
        print(
            f"Jaro-Winkler similarity btwn '{s1}' and '{s2}':",
            jaro_winkler_similarity(s1, s2),
        )
        print(
            f"Jaro-Winkler distance btwn '{s1}' and '{s2}':",
            1 - jaro_winkler_similarity(s1, s2),
        )
    s1 = {1, 2, 3, 4}
    s2 = {3, 4, 5}
    print("s1:", s1)
    print("s2:", s2)
    print("Binary distance:", binary_distance(s1, s2))
    print("Jaccard distance:", jaccard_distance(s1, s2))
    print("MASI distance:", masi_distance(s1, s2))


if __name__ == "__main__":
    demo()
Backend/venv/lib/python3.12/site-packages/nltk/metrics/paice.py
@@ -0,0 +1,389 @@
# Natural Language Toolkit: Paice's Evaluation Statistics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Lauri Hallila <laurihallila@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""Computes Paice's performance statistics for evaluating stemming algorithms.

What is required:
 - A dictionary of words grouped by their real lemmas
 - A dictionary of words grouped by stems from a stemming algorithm

When these are given, Understemming Index (UI), Overstemming Index (OI),
Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted.

References:

Chris D. Paice (1994). An evaluation method for stemming algorithms.
In Proceedings of SIGIR, 42--50.
"""

from math import sqrt


def get_words_from_dictionary(lemmas):
    """
    Get original set of words used for analysis.

    :param lemmas: A dictionary where keys are lemmas and values are sets
        or lists of words corresponding to that lemma.
    :type lemmas: dict(str): list(str)
    :return: Set of words that exist as values in the dictionary
    :rtype: set(str)
    """
    words = set()
    for lemma in lemmas:
        words.update(set(lemmas[lemma]))
    return words


def _truncate(words, cutlength):
    """Group words by stems defined by truncating them at given length.

    :param words: Set of words used for analysis
    :param cutlength: Words are stemmed by cutting at this length.
    :type words: set(str) or list(str)
    :type cutlength: int
    :return: Dictionary where keys are stems and values are sets of words
        corresponding to that stem.
    :rtype: dict(str): set(str)
    """
    stems = {}
    for word in words:
        stem = word[:cutlength]
        try:
            stems[stem].update([word])
        except KeyError:
            stems[stem] = {word}
    return stems
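

# A small usage sketch added for illustration (not part of the NLTK source):
# truncating at length 3 merges "ring" and "rings" under the stem "rin".
def _truncate_example():
    stems = _truncate({"ring", "rings", "rang"}, 3)
    assert stems == {"rin": {"ring", "rings"}, "ran": {"rang"}}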


# Reference: https://en.wikipedia.org/wiki/Line-line_intersection
def _count_intersection(l1, l2):
    """Count intersection between two line segments defined by coordinate pairs.

    :param l1: Tuple of two coordinate pairs defining the first line segment
    :param l2: Tuple of two coordinate pairs defining the second line segment
    :type l1: tuple(tuple(float, float), tuple(float, float))
    :type l2: tuple(tuple(float, float), tuple(float, float))
    :return: Coordinates of the intersection
    :rtype: tuple(float, float)
    """
    x1, y1 = l1[0]
    x2, y2 = l1[1]
    x3, y3 = l2[0]
    x4, y4 = l2[1]

    denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)

    if denominator == 0.0:  # lines are parallel
        if x1 == x2 == x3 == x4 == 0.0:
            # When lines are parallel, they must be on the y-axis.
            # We can ignore the x-axis because we stop following the
            # truncation line when we reach it.
            # There are no other options, as UI (x-axis) grows and
            # OI (y-axis) diminishes as we go along the truncation line.
            return (0.0, y4)

    x = (
        (x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)
    ) / denominator
    y = (
        (x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)
    ) / denominator
    return (x, y)
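

# A worked sketch added for illustration (not part of the NLTK source): the
# segment from (0, 0) to (2, 2) crosses the segment from (0, 2) to (2, 0) at
# (1, 1), by the standard line-line intersection formula used above.
def _count_intersection_example():
    assert _count_intersection(((0, 0), (2, 2)), ((0, 2), (2, 0))) == (1.0, 1.0)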


def _get_derivative(coordinates):
    """Get derivative of the line from (0,0) to given coordinates.

    :param coordinates: A coordinate pair
    :type coordinates: tuple(float, float)
    :return: Derivative; inf if x is zero
    :rtype: float
    """
    try:
        return coordinates[1] / coordinates[0]
    except ZeroDivisionError:
        return float("inf")


def _calculate_cut(lemmawords, stems):
    """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.

    :param lemmawords: Set or list of words corresponding to certain lemma.
    :param stems: A dictionary where keys are stems and values are sets
        or lists of words corresponding to that stem.
    :type lemmawords: set(str) or list(str)
    :type stems: dict(str): set(str)
    :return: Amount of understemmed and overstemmed pairs contributed by words
        existing in both lemmawords and stems.
    :rtype: tuple(float, float)
    """
    umt, wmt = 0.0, 0.0
    for stem in stems:
        cut = set(lemmawords) & set(stems[stem])
        if cut:
            cutcount = len(cut)
            stemcount = len(stems[stem])
            # Unachieved merge total
            umt += cutcount * (len(lemmawords) - cutcount)
            # Wrongly merged total
            wmt += cutcount * (stemcount - cutcount)
    return (umt, wmt)
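

# A worked sketch added for illustration (not part of the NLTK source): for
# the lemma group {ring, rang, rung} and stems "rin" -> {ring, rings},
# "ran" -> {rang}, the lemma's words are scattered across stem groups
# (understemming pairs) and "rings" is wrongly merged with "ring"
# (one overstemming pair).
def _calculate_cut_example():
    stems = {"rin": {"ring", "rings"}, "ran": {"rang"}}
    umt, wmt = _calculate_cut({"ring", "rang", "rung"}, stems)
    assert (umt, wmt) == (4.0, 1.0)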


def _calculate(lemmas, stems):
    """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.

    :param lemmas: A dictionary where keys are lemmas and values are sets
        or lists of words corresponding to that lemma.
    :param stems: A dictionary where keys are stems and values are sets
        or lists of words corresponding to that stem.
    :type lemmas: dict(str): list(str)
    :type stems: dict(str): set(str)
    :return: Global unachieved merge total (gumt),
        global desired merge total (gdmt),
        global wrongly merged total (gwmt) and
        global desired non-merge total (gdnt).
    :rtype: tuple(float, float, float, float)
    """

    n = sum(len(lemmas[word]) for word in lemmas)

    gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0)

    for lemma in lemmas:
        lemmacount = len(lemmas[lemma])

        # Desired merge total
        gdmt += lemmacount * (lemmacount - 1)

        # Desired non-merge total
        gdnt += lemmacount * (n - lemmacount)

        # For each (lemma, stem) pair with common words, count how many
        # pairs are understemmed and overstemmed.
        umt, wmt = _calculate_cut(lemmas[lemma], stems)

        # Add to the global unachieved and wrongly-merged totals
        gumt += umt
        gwmt += wmt

    # Each pair is counted twice, so divide by two
    return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2)


def _indexes(gumt, gdmt, gwmt, gdnt):
    """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).

    :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
        global desired merge total (gdmt),
        global wrongly merged total (gwmt) and
        global desired non-merge total (gdnt).
    :type gumt, gdmt, gwmt, gdnt: float
    :return: Understemming Index (UI),
        Overstemming Index (OI) and
        Stemming Weight (SW).
    :rtype: tuple(float, float, float)
    """
    # Calculate Understemming Index (UI),
    # Overstemming Index (OI) and Stemming Weight (SW)
    try:
        ui = gumt / gdmt
    except ZeroDivisionError:
        # If GDMT (max merge total) is 0, define UI as 0
        ui = 0.0
    try:
        oi = gwmt / gdnt
    except ZeroDivisionError:
        # If GDNT (max non-merge total) is 0, define OI as 0
        oi = 0.0
    try:
        sw = oi / ui
    except ZeroDivisionError:
        if oi == 0.0:
            # OI and UI are 0, define SW as 'not a number'
            sw = float("nan")
        else:
            # UI is 0, define SW as infinity
            sw = float("inf")
    return (ui, oi, sw)


class Paice:
    """Class for storing lemmas, stems and evaluation metrics."""

    def __init__(self, lemmas, stems):
        """
        :param lemmas: A dictionary where keys are lemmas and values are sets
            or lists of words corresponding to that lemma.
        :param stems: A dictionary where keys are stems and values are sets
            or lists of words corresponding to that stem.
        :type lemmas: dict(str): list(str)
        :type stems: dict(str): set(str)
        """
        self.lemmas = lemmas
        self.stems = stems
        self.coords = []
        self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None)
        self.ui, self.oi, self.sw = (None, None, None)
        self.errt = None
        self.update()

    def __str__(self):
        text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt]
        text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt)
        text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt)
        text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt)
        text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui)
        text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi)
        text.append("Stemming Weight (OI / UI): %s\n" % self.sw)
        text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt)
        coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
        text.append("Truncation line: %s" % coordinates)
        return "".join(text)

    def _get_truncation_indexes(self, words, cutlength):
        """Count (UI, OI) when stemming is done by truncating words at 'cutlength'.

        :param words: Words used for the analysis
        :param cutlength: Words are stemmed by cutting them at this length
        :type words: set(str) or list(str)
        :type cutlength: int
        :return: Understemming and overstemming indexes
        :rtype: tuple(float, float)
        """

        truncated = _truncate(words, cutlength)
        gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
        ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2]
        return (ui, oi)

    def _get_truncation_coordinates(self, cutlength=0):
        """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.

        :param cutlength: Optional parameter to start counting from (ui, oi)
            coordinates gotten by stemming at this length. Useful for speeding up
            the calculations when you know the approximate location of the
            intersection.
        :type cutlength: int
        :return: List of coordinate pairs that define the truncation line
        :rtype: list(tuple(float, float))
        """
        words = get_words_from_dictionary(self.lemmas)
        maxlength = max(len(word) for word in words)

        # Truncate words from different points until the (0, 0) - (ui, oi)
        # segment crosses the truncation line
        coords = []
        while cutlength <= maxlength:
            # Get the (UI, OI) pair of the current truncation point
            pair = self._get_truncation_indexes(words, cutlength)

            # Store only new coordinates so we'll have an actual
            # line segment when counting the intersection point
            if pair not in coords:
                coords.append(pair)
            if pair == (0.0, 0.0):
                # Stop counting if the truncation line goes through the origin;
                # the length from the origin to the truncation line is 0
                return coords
            if len(coords) >= 2 and pair[0] > 0.0:
                derivative1 = _get_derivative(coords[-2])
                derivative2 = _get_derivative(coords[-1])
                # Derivative of the truncation line is a decreasing value;
                # when it passes the Stemming Weight, we've found the segment
                # of the truncation line intersecting the (0, 0) - (ui, oi) segment
                if derivative1 >= self.sw >= derivative2:
                    return coords
            cutlength += 1
        return coords

    def _errt(self):
        """Count Error-Rate Relative to Truncation (ERRT).

        :return: ERRT, the length of the line from the origin to (UI, OI)
            divided by the length of the line from the origin to the point
            defined by the same line when extended until the truncation line.
        :rtype: float
        """
        # Count (UI, OI) pairs for truncation points until we find the segment
        # where (ui, oi) crosses the truncation line
        self.coords = self._get_truncation_coordinates()
        if (0.0, 0.0) in self.coords:
            # The truncation line goes through the origin, so ERRT cannot be counted
            if (self.ui, self.oi) != (0.0, 0.0):
                return float("inf")
            else:
                return float("nan")
        if (self.ui, self.oi) == (0.0, 0.0):
            # (ui, oi) is the origin; define ERRT as 0.0
            return 0.0
        # Count the intersection point.
        # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has
        # different coordinates, so we have actual line segments instead of a
        # line segment and a point
        intersection = _count_intersection(
            ((0, 0), (self.ui, self.oi)), self.coords[-2:]
        )
        # Count OP (the length of the line from the origin to (ui, oi))
        op = sqrt(self.ui**2 + self.oi**2)
        # Count OT (the length of the line from the origin to the truncation
        # line that goes through (ui, oi))
        ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2)
        # OP / OT tells how well the stemming algorithm works compared to
        # just truncating words
        return op / ot

    def update(self):
        """Update statistics after lemmas and stems have been set."""
        self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
        self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
        self.errt = self._errt()


def demo():
    """Demonstration of the module."""
    # Some words with their real lemmas
    lemmas = {
        "kneel": ["kneel", "knelt"],
        "range": ["range", "ranged"],
        "ring": ["ring", "rang", "rung"],
    }
    # Same words with stems from a stemming algorithm
    stems = {
        "kneel": ["kneel"],
        "knelt": ["knelt"],
        "rang": ["rang", "range", "ranged"],
        "ring": ["ring"],
        "rung": ["rung"],
    }
    print("Words grouped by their lemmas:")
    for lemma in sorted(lemmas):
        print("{} => {}".format(lemma, " ".join(lemmas[lemma])))
    print()
    print("Same words grouped by a stemming algorithm:")
    for stem in sorted(stems):
        print("{} => {}".format(stem, " ".join(stems[stem])))
    print()
    p = Paice(lemmas, stems)
    print(p)
    print()
    # Let's "change" results from a stemming algorithm
    stems = {
        "kneel": ["kneel"],
        "knelt": ["knelt"],
        "rang": ["rang"],
        "range": ["range", "ranged"],
        "ring": ["ring"],
        "rung": ["rung"],
    }
    print("Counting stats after changing stemming results:")
    for stem in sorted(stems):
        print("{} => {}".format(stem, " ".join(stems[stem])))
    print()
    p.stems = stems
    p.update()
    print(p)


if __name__ == "__main__":
    demo()
Backend/venv/lib/python3.12/site-packages/nltk/metrics/scores.py
@@ -0,0 +1,228 @@
# Natural Language Toolkit: Evaluation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import operator
from functools import reduce
from math import fabs
from random import shuffle

try:
    from scipy.stats.stats import betai
except ImportError:
    betai = None

from nltk.util import LazyConcatenation, LazyMap


def accuracy(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    values, return the fraction of corresponding values that are
    equal. In particular, return the fraction of indices
    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.

    :type reference: list
    :param reference: An ordered list of reference values.
    :type test: list
    :param test: A list of values to compare against the corresponding
        reference values.
    :raise ValueError: If ``reference`` and ``test`` do not have the
        same length.
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    return sum(x == y for x, y in zip(reference, test)) / len(test)


def precision(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of test values that appear in the reference set.
    In particular, return card(``reference`` intersection ``test``)/card(``test``).
    If ``test`` is empty, then return None.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    """
    if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
        raise TypeError("reference and test should be sets")

    if len(test) == 0:
        return None
    else:
        return len(reference.intersection(test)) / len(test)


def recall(reference, test):
    """
    Given a set of reference values and a set of test values, return
    the fraction of reference values that appear in the test set.
    In particular, return card(``reference`` intersection ``test``)/card(``reference``).
    If ``reference`` is empty, then return None.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    """
    if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
        raise TypeError("reference and test should be sets")

    if len(reference) == 0:
        return None
    else:
        return len(reference.intersection(test)) / len(reference)


def f_measure(reference, test, alpha=0.5):
    """
    Given a set of reference values and a set of test values, return
    the f-measure of the test values, when compared against the
    reference values. The f-measure is the harmonic mean of the
    ``precision`` and ``recall``, weighted by ``alpha``. In particular,
    given the precision *p* and recall *r* defined by:

    - *p* = card(``reference`` intersection ``test``)/card(``test``)
    - *r* = card(``reference`` intersection ``test``)/card(``reference``)

    The f-measure is:

    - *1/(alpha/p + (1-alpha)/r)*

    If either ``reference`` or ``test`` is empty, then ``f_measure``
    returns None.

    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None
    """
    p = precision(reference, test)
    r = recall(reference, test)
    if p is None or r is None:
        return None
    if p == 0 or r == 0:
        return 0
    return 1.0 / (alpha / p + (1 - alpha) / r)
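

# A minimal usage sketch added for illustration (not part of the NLTK
# source): with alpha=0.5 the weighted harmonic mean reduces to the usual
# balanced F1 score, 2*p*r / (p + r).
def _f_measure_example():
    reference, test = {"cat", "dog", "fish"}, {"cat", "dog", "bird"}
    # p = 2/3 and r = 2/3, so F1 = 2 * (2/3) * (2/3) / (4/3) = 2/3
    assert abs(f_measure(reference, test) - 2 / 3) < 1e-9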


def log_likelihood(reference, test):
    """
    Given a list of reference values and a corresponding list of test
    probability distributions, return the average log likelihood of
    the reference values, given the probability distributions.

    :param reference: A list of reference values
    :type reference: list
    :param test: A list of probability distributions over values to
        compare against the corresponding reference values.
    :type test: list(ProbDistI)
    """
    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")

    # Return the average value of dist.logprob(val).
    total_likelihood = sum(dist.logprob(val) for (val, dist) in zip(reference, test))
    return total_likelihood / len(reference)


def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the proportion of shuffles in which
    the statistic of the permuted lists differs from the actual statistic
    of the unpermuted argument lists by at least as much.

    :return: a tuple containing an approximate significance level, the count
        of the number of times the pseudo-statistic varied from the
        actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    shuffles = kwargs.get("shuffles", 999)
    # there's no point in trying to shuffle beyond all possible permutations
    shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
    stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
    verbose = kwargs.get("verbose", False)

    if verbose:
        print("shuffles: %d" % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print("actual statistic: %f" % actual_stat)
        print("-" * 60)

    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = list(range(len(a) + len(b)))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print("shuffle: %d" % i)

        shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[: len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a) :]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print("pseudo-statistic: %f" % pseudo_stat)
            print("significance: %f" % ((c + 1) / (i + 1)))
            print("-" * 60)

    significance = (c + 1) / (shuffles + 1)

    if verbose:
        print("significance: %f" % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}")

    return (significance, c, shuffles)
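

# A usage sketch added for illustration (not part of the NLTK source): with
# clearly separated samples, a random relabeling almost never matches the
# observed difference in means, so the estimated significance level is small
# (close to 1/(shuffles + 1)).
def _approxrand_example():
    a = [10.1, 10.4, 9.8, 10.2, 10.0]
    b = [1.2, 0.9, 1.1, 1.0, 0.8]
    significance, _, _ = approxrand(a, b, shuffles=99)
    return significance  # expected to be close to 0.01 for these samples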


def demo():
    print("-" * 75)
    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
    test = "DET VB VB DET NN NN NN IN DET NN".split()
    print("Reference =", reference)
    print("Test =", test)
    print("Accuracy:", accuracy(reference, test))

    print("-" * 75)
    reference_set = set(reference)
    test_set = set(test)
    print("Reference =", reference_set)
    print("Test = ", test_set)
    print("Precision:", precision(reference_set, test_set))
    print("   Recall:", recall(reference_set, test_set))
    print("F-Measure:", f_measure(reference_set, test_set))
    print("-" * 75)


if __name__ == "__main__":
    demo()
@@ -0,0 +1,222 @@
# Natural Language Toolkit: Text Segmentation Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         David Doukhan <david.doukhan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT


"""
Text Segmentation Metrics

1. Windowdiff

Pevzner, L., and Hearst, M., A Critique and Improvement of
an Evaluation Metric for Text Segmentation,
Computational Linguistics 28, 19-36


2. Generalized Hamming Distance

Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375

Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html

Study describing benefits of Generalized Hamming Distance versus
WindowDiff for evaluating text segmentation tasks:
Bestgen, Y. Quel indice pour mesurer l'efficacité en segmentation de textes ?
TALN 2009


3. Pk text segmentation metric

Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
"""

try:
    import numpy as np
except ImportError:
    pass


def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
    """
    Compute the windowdiff score for a pair of segmentations. A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    >>> s1 = "000100000010"
    >>> s2 = "000010000100"
    >>> s3 = "100000010000"
    >>> '%.2f' % windowdiff(s1, s1, 3)
    '0.00'
    >>> '%.2f' % windowdiff(s1, s2, 3)
    '0.30'
    >>> '%.2f' % windowdiff(s2, s3, 3)
    '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    """

    if len(seg1) != len(seg2):
        raise ValueError("Segmentations have unequal length")
    if k > len(seg1):
        raise ValueError(
            "Window width k should be less than or equal to the segmentation lengths"
        )
    wd = 0
    for i in range(len(seg1) - k + 1):
        ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
        if weighted:
            wd += ndiff
        else:
            wd += min(1, ndiff)
    return wd / (len(seg1) - k + 1.0)


# Generalized Hamming Distance


def _init_mat(nrows, ncols, ins_cost, del_cost):
    mat = np.empty((nrows, ncols))
    mat[0, :] = ins_cost * np.arange(ncols)
    mat[:, 0] = del_cost * np.arange(nrows)
    return mat


def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
    for i, rowi in enumerate(rowv):
        for j, colj in enumerate(colv):
            shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
            if rowi == colj:
                # boundaries are at the same location, no transformation required
                tcost = mat[i, j]
            elif rowi > colj:
                # boundary match through a deletion
                tcost = del_cost + mat[i, j + 1]
            else:
                # boundary match through an insertion
                tcost = ins_cost + mat[i + 1, j]
            mat[i + 1, j + 1] = min(tcost, shift_cost)


def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
    """
    Compute the Generalized Hamming Distance for a reference and a hypothetical
    segmentation, corresponding to the cost related to the transformation
    of the hypothetical segmentation into the reference segmentation
    through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    Recommended parameter values are a shift_cost_coeff of 2, with ins_cost
    and del_cost equal to the mean segment length in the reference
    segmentation.

    >>> # Same examples as Kulyukin C++ implementation
    >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
    0.5
    >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
    2.0
    >>> ghd('011', '110', 1.0, 1.0, 0.5)
    1.0
    >>> ghd('1', '0', 1.0, 1.0, 0.5)
    1.0
    >>> ghd('111', '000', 1.0, 1.0, 0.5)
    3.0
    >>> ghd('000', '111', 1.0, 2.0, 0.5)
    6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
        ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
        are the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """

    ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
    hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]

    nref_bound = len(ref_idx)
    nhyp_bound = len(hyp_idx)

    if nref_bound == 0 and nhyp_bound == 0:
        return 0.0
    elif nref_bound > 0 and nhyp_bound == 0:
        return nref_bound * ins_cost
    elif nref_bound == 0 and nhyp_bound > 0:
        return nhyp_bound * del_cost

    mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
    _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
    return float(mat[-1, -1])


# Beeferman's Pk text segmentation evaluation metric


def pk(ref, hyp, k=None, boundary="1"):
    """
    Compute the Pk metric for a pair of segmentations. A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type k: int or None
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """

    if k is None:
        k = int(round(len(ref) / (ref.count(boundary) * 2.0)))

    err = 0
    for i in range(len(ref) - k + 1):
        r = ref[i : i + k].count(boundary) > 0
        h = hyp[i : i + k].count(boundary) > 0
        if r != h:
            err += 1
    return err / (len(ref) - k + 1.0)
@@ -0,0 +1,68 @@
# Natural Language Toolkit: Spearman Rank Correlation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Tools for comparing ranked lists.
"""


def _rank_dists(ranks1, ranks2):
    """Finds the difference between the values in ranks1 and ranks2 for keys
    present in both dicts. If the arguments are not dicts, they are converted
    from (key, rank) sequences.
    """
    ranks1 = dict(ranks1)
    ranks2 = dict(ranks2)
    for k in ranks1:
        try:
            yield k, ranks1[k] - ranks2[k]
        except KeyError:
            pass


def spearman_correlation(ranks1, ranks2):
    """Returns the Spearman correlation coefficient for two rankings, which
    should be dicts or sequences of (key, rank). The coefficient ranges from
    -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
    calculated for keys in both rankings (for meaningful results, remove keys
    present in only one list before ranking)."""
    n = 0
    res = 0
    for k, d in _rank_dists(ranks1, ranks2):
        res += d * d
        n += 1
    try:
        return 1 - (6 * res / (n * (n * n - 1)))
    except ZeroDivisionError:
        # Result is undefined if only one item is ranked
        return 0.0


def ranks_from_sequence(seq):
    """Given a sequence, yields each element with an increasing rank, suitable
    for use as an argument to ``spearman_correlation``.
    """
    return ((k, i) for i, k in enumerate(seq))


def ranks_from_scores(scores, rank_gap=1e-15):
    """Given a sequence of (key, score) tuples, yields each key with an
    increasing rank, tying with previous key's rank if the difference between
    their scores is less than rank_gap. Suitable for use as an argument to
    ``spearman_correlation``.
    """
    prev_score = None
    rank = 0
    for i, (key, score) in enumerate(scores):
        try:
            if abs(score - prev_score) > rank_gap:
                rank = i
        except TypeError:
            pass

        yield key, rank
        prev_score = score
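

# A usage sketch added for illustration (not part of the NLTK source):
# identical orderings give a coefficient of 1.0, and reversing one of the
# two rankings gives -1.0.
def _spearman_example():
    a = ranks_from_sequence(["x", "y", "z"])
    b = ranks_from_sequence(["z", "y", "x"])
    assert spearman_correlation(a, b) == -1.0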