Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1,51 @@
# Natural Language Toolkit: Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
NLTK Metrics
Classes and methods for scoring processing modules.
"""
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.aline import align
from nltk.metrics.association import (
BigramAssocMeasures,
ContingencyMeasures,
NgramAssocMeasures,
QuadgramAssocMeasures,
TrigramAssocMeasures,
)
from nltk.metrics.confusionmatrix import ConfusionMatrix
from nltk.metrics.distance import (
binary_distance,
custom_distance,
edit_distance,
edit_distance_align,
fractional_presence,
interval_distance,
jaccard_distance,
masi_distance,
presence,
)
from nltk.metrics.paice import Paice
from nltk.metrics.scores import (
accuracy,
approxrand,
f_measure,
log_likelihood,
precision,
recall,
)
from nltk.metrics.segmentation import ghd, pk, windowdiff
from nltk.metrics.spearman import (
ranks_from_scores,
ranks_from_sequence,
spearman_correlation,
)
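# A minimal usage sketch (the example values are made up; it just exercises the
# helpers re-exported above):
#
#     >>> from nltk.metrics import edit_distance, jaccard_distance
#     >>> edit_distance("rain", "shine")
#     3
#     >>> jaccard_distance(set("abc"), set("bcd"))
#     0.5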


@@ -0,0 +1,467 @@
# Natural Language Toolkit: Agreement Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Tom Lippincott <tom@cs.columbia.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Implementations of inter-annotator agreement coefficients surveyed by Artstein
and Poesio (2007), Inter-Coder Agreement for Computational Linguistics.
An agreement coefficient calculates the amount that annotators agreed on label
assignments beyond what is expected by chance.
In defining the AnnotationTask class, we use naming conventions similar to the
paper's terminology. There are three types of objects in an annotation task:
the coders (variables "c" and "C")
the items to be annotated (variables "i" and "I")
the potential categories to be assigned (variables "k" and "K")
Additionally, it is often the case that we don't want to treat two different
labels as complete disagreement, and so the AnnotationTask constructor can also
take a distance metric as a final argument. Distance metrics are simply
functions that take two arguments, and return a value between 0.0 and 1.0
indicating the distance between them. If not supplied, the default is binary
comparison between the arguments.
The simplest way to initialize an AnnotationTask is with a list of triples,
each containing a coder's assignment for one object in the task:
task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...])
Note that the data list needs to contain the same number of triples for each
individual coder, containing category values for the same set of items.
Alpha (Krippendorff 1980)
Kappa (Cohen 1960)
S (Bennett, Albert and Goldstein 1954)
Pi (Scott 1955)
TODO: Describe handling of multiple coders and missing data
Expected results from the Artstein and Poesio survey paper:
>>> from nltk.metrics.agreement import AnnotationTask
>>> import os.path
>>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
>>> t.avg_Ao()
0.88
>>> round(t.pi(), 5)
0.79953
>>> round(t.S(), 2)
0.82
This would have returned a wrong value (0.0) in @785fb79 as coders are in
the wrong order. Subsequently, all values for pi(), S(), and kappa() would
have been wrong as they are computed with avg_Ao().
>>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')])
>>> t2.avg_Ao()
1.0
The following, of course, also works.
>>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')])
>>> t3.avg_Ao()
1.0
"""
import logging
from itertools import groupby
from operator import itemgetter
from nltk.internals import deprecated
from nltk.metrics.distance import binary_distance
from nltk.probability import ConditionalFreqDist, FreqDist
log = logging.getLogger(__name__)
class AnnotationTask:
"""Represents an annotation task, i.e. people assign labels to items.
Notation tries to match notation in Artstein and Poesio (2007).
In general, coders and items can be represented as any hashable object.
Integers, for example, are fine, though strings are more readable.
Labels must support the distance functions applied to them, so e.g.
a string-edit-distance makes no sense if your labels are integers,
whereas interval distance needs numeric values. A notable case of this
is the MASI metric, which requires Python sets.
"""
def __init__(self, data=None, distance=binary_distance):
"""Initialize an annotation task.
The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
each representing a coder's labeling of an item:
``(coder,item,label)``
The distance argument is a function taking two arguments (labels) and producing a numerical distance.
The distance from a label to itself should be zero:
``distance(l,l) = 0``
"""
self.distance = distance
self.I = set()
self.K = set()
self.C = set()
self.data = []
if data is not None:
self.load_array(data)
def __str__(self):
return "\r\n".join(
map(
lambda x: "%s\t%s\t%s"
% (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
self.data,
)
)
def load_array(self, array):
"""Load an sequence of annotation results, appending to any data already loaded.
The argument is a sequence of 3-tuples, each representing a coder's labeling of an item:
(coder,item,label)
"""
for coder, item, labels in array:
self.C.add(coder)
self.K.add(labels)
self.I.add(item)
self.data.append({"coder": coder, "labels": labels, "item": item})
def agr(self, cA, cB, i, data=None):
"""Agreement between two coders on a given item"""
data = data or self.data
# cfedermann: we don't know what combination of coder/item will come
# first in x; to avoid StopIteration problems due to assuming an order
# cA,cB, we allow either for k1 and then look up the missing as k2.
k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i)
if k1["coder"] == cA:
k2 = next(x for x in data if x["coder"] == cB and x["item"] == i)
else:
k2 = next(x for x in data if x["coder"] == cA and x["item"] == i)
ret = 1.0 - float(self.distance(k1["labels"], k2["labels"]))
log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret)
log.debug(
'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret
)
return ret
def Nk(self, k):
return float(sum(1 for x in self.data if x["labels"] == k))
def Nik(self, i, k):
return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k))
def Nck(self, c, k):
return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k))
@deprecated("Use Nk, Nik or Nck instead")
def N(self, k=None, i=None, c=None):
"""Implements the "n-notation" used in Artstein and Poesio (2007)"""
if k is not None and i is None and c is None:
ret = self.Nk(k)
elif k is not None and i is not None and c is None:
ret = self.Nik(i, k)
elif k is not None and c is not None and i is None:
ret = self.Nck(c, k)
else:
raise ValueError(
f"You must pass either i or c, not both! (k={k!r},i={i!r},c={c!r})"
)
log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
return ret
def _grouped_data(self, field, data=None):
data = data or self.data
return groupby(sorted(data, key=itemgetter(field)), itemgetter(field))
def Ao(self, cA, cB):
"""Observed agreement between two coders on all items."""
data = self._grouped_data(
"item", (x for x in self.data if x["coder"] in (cA, cB))
)
ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(
self.I
)
log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
return ret
def _pairwise_average(self, function):
"""
Calculates the average of function results for each coder pair
"""
total = 0
n = 0
s = self.C.copy()
for cA in self.C:
s.remove(cA)
for cB in s:
total += function(cA, cB)
n += 1
ret = total / n
return ret
def avg_Ao(self):
"""Average observed agreement across all coders and items."""
ret = self._pairwise_average(self.Ao)
log.debug("Average observed agreement: %f", ret)
return ret
def Do_Kw_pairwise(self, cA, cB, max_distance=1.0):
"""The observed disagreement for the weighted kappa coefficient."""
total = 0.0
data = (x for x in self.data if x["coder"] in (cA, cB))
for i, itemdata in self._grouped_data("item", data):
# we should have two items; distance doesn't care which comes first
total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"])
ret = total / (len(self.I) * max_distance)
log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
return ret
def Do_Kw(self, max_distance=1.0):
"""Averaged over all labelers"""
ret = self._pairwise_average(
lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)
)
log.debug("Observed disagreement: %f", ret)
return ret
# Agreement Coefficients
def S(self):
"""Bennett, Albert and Goldstein 1954"""
Ae = 1.0 / len(self.K)
ret = (self.avg_Ao() - Ae) / (1.0 - Ae)
return ret
def pi(self):
"""Scott 1955; here, multi-pi.
Equivalent to K from Siegel and Castellan (1988).
"""
total = 0.0
label_freqs = FreqDist(x["labels"] for x in self.data)
for k, f in label_freqs.items():
total += f**2
Ae = total / ((len(self.I) * len(self.C)) ** 2)
return (self.avg_Ao() - Ae) / (1 - Ae)
def Ae_kappa(self, cA, cB):
Ae = 0.0
nitems = float(len(self.I))
label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data)
for k in label_freqs.conditions():
Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
return Ae
def kappa_pairwise(self, cA, cB):
""" """
Ae = self.Ae_kappa(cA, cB)
ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae)
log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae)
return ret
def kappa(self):
"""Cohen 1960
Averages naively over kappas for each coder pair.
"""
return self._pairwise_average(self.kappa_pairwise)
def multi_kappa(self):
"""Davies and Fleiss 1982
Averages over observed and expected agreements for each coder pair.
"""
Ae = self._pairwise_average(self.Ae_kappa)
return (self.avg_Ao() - Ae) / (1.0 - Ae)
def Disagreement(self, label_freqs):
total_labels = sum(label_freqs.values())
pairs = 0.0
for j, nj in label_freqs.items():
for l, nl in label_freqs.items():
pairs += float(nj * nl) * self.distance(l, j)
return 1.0 * pairs / (total_labels * (total_labels - 1))
def alpha(self):
"""Krippendorff 1980"""
# check for degenerate cases
if len(self.K) == 0:
raise ValueError("Cannot calculate alpha, no data present!")
if len(self.K) == 1:
log.debug("Only one annotation value, alpha returning 1.")
return 1
if len(self.C) == 1 and len(self.I) == 1:
raise ValueError("Cannot calculate alpha, only one coder and item present!")
total_disagreement = 0.0
total_ratings = 0
all_valid_labels_freq = FreqDist([])
total_do = 0.0 # Total observed disagreement for all items.
for i, itemdata in self._grouped_data("item"):
label_freqs = FreqDist(x["labels"] for x in itemdata)
labels_count = sum(label_freqs.values())
if labels_count < 2:
# Ignore the item.
continue
all_valid_labels_freq += label_freqs
total_do += self.Disagreement(label_freqs) * labels_count
if len(all_valid_labels_freq.keys()) == 1:
log.debug("Only one valid annotation value, alpha returning 1.")
return 1
do = total_do / sum(all_valid_labels_freq.values())
de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
k_alpha = 1.0 - do / de
return k_alpha
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
"""Cohen 1968"""
total = 0.0
label_freqs = ConditionalFreqDist(
(x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB)
)
for j in self.K:
for l in self.K:
total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
De = total / (max_distance * pow(len(self.I), 2))
log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
Do = self.Do_Kw_pairwise(cA, cB)
ret = 1.0 - (Do / De)
return ret
def weighted_kappa(self, max_distance=1.0):
"""Cohen 1968"""
return self._pairwise_average(
lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)
)
if __name__ == "__main__":
import optparse
import re
from nltk.metrics import distance
# process command-line arguments
parser = optparse.OptionParser()
parser.add_option(
"-d",
"--distance",
dest="distance",
default="binary_distance",
help="distance metric to use",
)
parser.add_option(
"-a",
"--agreement",
dest="agreement",
default="kappa",
help="agreement coefficient to calculate",
)
parser.add_option(
"-e",
"--exclude",
dest="exclude",
action="append",
default=[],
help="coder names to exclude (may be specified multiple times)",
)
parser.add_option(
"-i",
"--include",
dest="include",
action="append",
default=[],
help="coder names to include, same format as exclude",
)
parser.add_option(
"-f",
"--file",
dest="file",
help="file to read labelings from, each line with three columns: 'labeler item labels'",
)
parser.add_option(
"-v",
"--verbose",
dest="verbose",
default="0",
help="how much debugging to print on stderr (0-4)",
)
parser.add_option(
"-c",
"--columnsep",
dest="columnsep",
default="\t",
help="char/string that separates the three columns in the file, defaults to tab",
)
parser.add_option(
"-l",
"--labelsep",
dest="labelsep",
default=",",
help="char/string that separates labels (if labelers can assign more than one), defaults to comma",
)
parser.add_option(
"-p",
"--presence",
dest="presence",
default=None,
help="convert each labeling into 1 or 0, based on presence of LABEL",
)
parser.add_option(
"-T",
"--thorough",
dest="thorough",
default=False,
action="store_true",
help="calculate agreement for every subset of the annotators",
)
(options, remainder) = parser.parse_args()
if not options.file:
parser.print_help()
exit()
logging.basicConfig(level=50 - 10 * int(options.verbose))
# read in data from the specified file
data = []
with open(options.file) as infile:
for l in infile:
toks = l.split(options.columnsep)
coder, object_, labels = (
toks[0],
str(toks[1:-1]),
frozenset(toks[-1].strip().split(options.labelsep)),
)
if (
(options.include == options.exclude)
or (len(options.include) > 0 and coder in options.include)
or (len(options.exclude) > 0 and coder not in options.exclude)
):
data.append((coder, object_, labels))
if options.presence:
task = AnnotationTask(
data, getattr(distance, options.distance)(options.presence)
)
else:
task = AnnotationTask(data, getattr(distance, options.distance))
if options.thorough:
pass
else:
print(getattr(task, options.agreement)())
logging.shutdown()
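# A small worked example (coders, items and labels below are made up): two
# coders agree on one item and disagree on the other, so observed agreement is
# 0.5 while kappa and alpha both come out at 0.0 under the default
# binary_distance.
#
#     >>> t = AnnotationTask(data=[("c1", "i1", "a"), ("c2", "i1", "a"),
#     ...                          ("c1", "i2", "b"), ("c2", "i2", "a")])
#     >>> t.avg_Ao()
#     0.5
#     >>> t.kappa()
#     0.0
#     >>> t.alpha()
#     0.0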

File diff suppressed because it is too large


@@ -0,0 +1,476 @@
# Natural Language Toolkit: Ngram Association Measures
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
"""
import math as _math
from abc import ABCMeta, abstractmethod
from functools import reduce
_log2 = _math.log2
_ln = _math.log
_product = lambda s: reduce(lambda x, y: x * y, s)
_SMALL = 1e-20
try:
from scipy.stats import fisher_exact
except ImportError:
def fisher_exact(*_args, **_kwargs):
raise NotImplementedError
### Indices to marginals arguments:
NGRAM = 0
"""Marginals index for the ngram count"""
UNIGRAMS = -2
"""Marginals index for a tuple of each unigram count"""
TOTAL = -1
"""Marginals index for the number of words in the data"""
class NgramAssocMeasures(metaclass=ABCMeta):
"""
An abstract class defining a collection of generic association measures.
Each public method returns a score, taking the following arguments::
score_fn(count_of_ngram,
(count_of_n-1gram_1, ..., count_of_n-1gram_j),
(count_of_n-2gram_1, ..., count_of_n-2gram_k),
...,
(count_of_1gram_1, ..., count_of_1gram_n),
count_of_total_words)
See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
Inheriting classes should define a property _n, and a method _contingency
which calculates contingency values from marginals in order for all
association measures defined here to be usable.
"""
_n = 0
@staticmethod
@abstractmethod
def _contingency(*marginals):
"""Calculates values of a contingency table from marginal values."""
raise NotImplementedError(
"The contingency table is not available" "in the general ngram case"
)
@staticmethod
@abstractmethod
def _marginals(*contingency):
"""Calculates values of contingency table marginals from its values."""
raise NotImplementedError(
"The contingency table is not available" "in the general ngram case"
)
@classmethod
def _expected_values(cls, cont):
"""Calculates expected values for a contingency table."""
n_all = sum(cont)
bits = [1 << i for i in range(cls._n)]
# For each contingency table cell
for i in range(len(cont)):
# Yield the expected value
yield (
_product(
sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
for j in bits
)
/ (n_all ** (cls._n - 1))
)
@staticmethod
def raw_freq(*marginals):
"""Scores ngrams by their frequency"""
return marginals[NGRAM] / marginals[TOTAL]
@classmethod
def student_t(cls, *marginals):
"""Scores ngrams using Student's t test with independence hypothesis
for unigrams, as in Manning and Schutze 5.3.1.
"""
return (
marginals[NGRAM]
- _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
) / (marginals[NGRAM] + _SMALL) ** 0.5
@classmethod
def chi_sq(cls, *marginals):
"""Scores ngrams using Pearson's chi-square as in Manning and Schutze
5.3.3.
"""
cont = cls._contingency(*marginals)
exps = cls._expected_values(cont)
return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
@staticmethod
def mi_like(*marginals, **kwargs):
"""Scores ngrams using a variant of mutual information. The keyword
argument power sets an exponent (default 3) for the numerator. No
logarithm of the result is calculated.
"""
return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
marginals[UNIGRAMS]
)
@classmethod
def pmi(cls, *marginals):
"""Scores ngrams by pointwise mutual information, as in Manning and
Schutze 5.4.
"""
return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
_product(marginals[UNIGRAMS])
)
@classmethod
def likelihood_ratio(cls, *marginals):
"""Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
cont = cls._contingency(*marginals)
return 2 * sum(
obs * _ln(obs / (exp + _SMALL) + _SMALL)
for obs, exp in zip(cont, cls._expected_values(cont))
)
@classmethod
def poisson_stirling(cls, *marginals):
"""Scores ngrams using the Poisson-Stirling measure."""
exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
@classmethod
def jaccard(cls, *marginals):
"""Scores ngrams using the Jaccard index."""
cont = cls._contingency(*marginals)
return cont[0] / sum(cont[:-1])
class BigramAssocMeasures(NgramAssocMeasures):
"""
A collection of bigram association measures. Each association measure
is provided as a function with three arguments::
bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
The arguments constitute the marginals of a contingency table, counting
the occurrences of particular events in a corpus. The letter i in the
suffix refers to the appearance of the word in question, while x indicates
the appearance of any word. Thus, for example:
- n_ii counts ``(w1, w2)``, i.e. the bigram being scored
- n_ix counts ``(w1, *)``
- n_xi counts ``(*, w2)``
- n_xx counts ``(*, *)``, i.e. any bigram
This may be shown with respect to a contingency table::
w1 ~w1
------ ------
w2 | n_ii | n_oi | = n_xi
------ ------
~w2 | n_io | n_oo |
------ ------
= n_ix TOTAL = n_xx
"""
_n = 2
@staticmethod
def _contingency(n_ii, n_ix_xi_tuple, n_xx):
"""Calculates values of a bigram contingency table from marginal values."""
(n_ix, n_xi) = n_ix_xi_tuple
n_oi = n_xi - n_ii
n_io = n_ix - n_ii
return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
@staticmethod
def _marginals(n_ii, n_oi, n_io, n_oo):
"""Calculates values of contingency table marginals from its values."""
return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
@staticmethod
def _expected_values(cont):
"""Calculates expected values for a contingency table."""
n_xx = sum(cont)
# For each contingency table cell
for i in range(4):
yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
@classmethod
def phi_sq(cls, *marginals):
"""Scores bigrams using phi-square, the square of the Pearson correlation
coefficient.
"""
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
return (n_ii * n_oo - n_io * n_oi) ** 2 / (
(n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
)
@classmethod
def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
"""Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
of bigrams, as in Manning and Schutze 5.3.3.
"""
(n_ix, n_xi) = n_ix_xi_tuple
return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
@classmethod
def fisher(cls, *marginals):
"""Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
sensitive to small counts than PMI or Chi Sq, but also more expensive
to compute. Requires scipy.
"""
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
(odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
return pvalue
@staticmethod
def dice(n_ii, n_ix_xi_tuple, n_xx):
"""Scores bigrams using Dice's coefficient."""
(n_ix, n_xi) = n_ix_xi_tuple
return 2 * n_ii / (n_ix + n_xi)
class TrigramAssocMeasures(NgramAssocMeasures):
"""
A collection of trigram association measures. Each association measure
is provided as a function with four arguments::
trigram_score_fn(n_iii,
(n_iix, n_ixi, n_xii),
(n_ixx, n_xix, n_xxi),
n_xxx)
The arguments constitute the marginals of a contingency table, counting
the occurrences of particular events in a corpus. The letter i in the
suffix refers to the appearance of the word in question, while x indicates
the appearance of any word. Thus, for example:
- n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
- n_ixx counts ``(w1, *, *)``
- n_xxx counts ``(*, *, *)``, i.e. any trigram
"""
_n = 3
@staticmethod
def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
"""Calculates values of a trigram contingency table (or cube) from
marginal values.
>>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
(1, 0, 0, 0, 0, 72, 0, 1927)
"""
(n_iix, n_ixi, n_xii) = n_iix_tuple
(n_ixx, n_xix, n_xxi) = n_ixx_tuple
n_oii = n_xii - n_iii
n_ioi = n_ixi - n_iii
n_iio = n_iix - n_iii
n_ooi = n_xxi - n_iii - n_oii - n_ioi
n_oio = n_xix - n_iii - n_oii - n_iio
n_ioo = n_ixx - n_iii - n_ioi - n_iio
n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
@staticmethod
def _marginals(*contingency):
"""Calculates values of contingency table marginals from its values.
>>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
(1, (1, 1, 1), (1, 73, 1), 2000)
"""
n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
return (
n_iii,
(n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
(
n_iii + n_ioi + n_iio + n_ioo,
n_iii + n_oii + n_iio + n_oio,
n_iii + n_oii + n_ioi + n_ooi,
),
sum(contingency),
)
class QuadgramAssocMeasures(NgramAssocMeasures):
"""
A collection of quadgram association measures. Each association measure
is provided as a function with five arguments::
trigram_score_fn(n_iiii,
(n_iiix, n_iixi, n_ixii, n_xiii),
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
(n_ixxx, n_xixx, n_xxix, n_xxxi),
n_all)
The arguments constitute the marginals of a contingency table, counting
the occurrences of particular events in a corpus. The letter i in the
suffix refers to the appearance of the word in question, while x indicates
the appearance of any word. Thus, for example:
- n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
- n_ixxi counts ``(w1, *, *, w4)``
- n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
"""
_n = 4
@staticmethod
def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
"""Calculates values of a quadgram contingency table from
marginal values.
"""
(n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
(n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
n_oiii = n_xiii - n_iiii
n_ioii = n_ixii - n_iiii
n_iioi = n_iixi - n_iiii
n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
n_iiio = n_iiix - n_iiii
n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
n_oooo = (
n_xxxx
- n_iiii
- n_oiii
- n_ioii
- n_iioi
- n_ooii
- n_oioi
- n_iooi
- n_oooi
- n_iiio
- n_oiio
- n_ioio
- n_ooio
- n_iioo
- n_oioo
- n_iooo
)
return (
n_iiii,
n_oiii,
n_ioii,
n_ooii,
n_iioi,
n_oioi,
n_iooi,
n_oooi,
n_iiio,
n_oiio,
n_ioio,
n_ooio,
n_iioo,
n_oioo,
n_iooo,
n_oooo,
)
@staticmethod
def _marginals(*contingency):
"""Calculates values of contingency table marginals from its values.
>>> QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
(1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
"""
(
n_iiii,
n_oiii,
n_ioii,
n_ooii,
n_iioi,
n_oioi,
n_iooi,
n_oooi,
n_iiio,
n_oiio,
n_ioio,
n_ooio,
n_iioo,
n_oioo,
n_iooo,
n_oooo,
) = contingency
n_iiix = n_iiii + n_iiio
n_iixi = n_iiii + n_iioi
n_ixii = n_iiii + n_ioii
n_xiii = n_iiii + n_oiii
n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
n_all = sum(contingency)
return (
n_iiii,
(n_iiix, n_iixi, n_ixii, n_xiii),
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
(n_ixxx, n_xixx, n_xxix, n_xxxi),
n_all,
)
class ContingencyMeasures:
"""Wraps NgramAssocMeasures classes such that the arguments of association
measures are contingency table values rather than marginals.
"""
def __init__(self, measures):
"""Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
self.__class__.__name__ = "Contingency" + measures.__class__.__name__
for k in dir(measures):
if k.startswith("__"):
continue
v = getattr(measures, k)
if not k.startswith("_"):
v = self._make_contingency_fn(measures, v)
setattr(self, k, v)
@staticmethod
def _make_contingency_fn(measures, old_fn):
"""From an association measure function, produces a new function which
accepts contingency table values as its arguments.
"""
def res(*contingency):
return old_fn(*measures._marginals(*contingency))
res.__doc__ = old_fn.__doc__
res.__name__ = old_fn.__name__
return res
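# An illustrative scoring sketch (the marginal counts are invented): a bigram
# seen 2 times, whose words occur 4 and 5 times in a 100-word corpus, scored
# with the class-level functions defined above.
#
#     >>> BigramAssocMeasures.raw_freq(2, (4, 5), 100)
#     0.02
#     >>> round(BigramAssocMeasures.pmi(2, (4, 5), 100), 3)
#     3.322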


@@ -0,0 +1,351 @@
# Natural Language Toolkit: Confusion Matrices
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Tom Aarsen <>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from nltk.probability import FreqDist
class ConfusionMatrix:
"""
The confusion matrix between a list of reference values and a
corresponding list of test values. Entry *[r,t]* of this
matrix is a count of the number of times that the reference value
*r* corresponds to the test value *t*. E.g.:
>>> from nltk.metrics import ConfusionMatrix
>>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split()
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
>>> cm = ConfusionMatrix(ref, test)
>>> print(cm['NN', 'NN'])
3
Note that the diagonal entries *Ri=Tj* of this matrix
correspond to correct values, while the off-diagonal entries
correspond to incorrect values.
"""
def __init__(self, reference, test, sort_by_count=False):
"""
Construct a new confusion matrix from a list of reference
values and a corresponding list of test values.
:type reference: list
:param reference: An ordered list of reference values.
:type test: list
:param test: A list of values to compare against the
corresponding reference values.
:raise ValueError: If ``reference`` and ``test`` do not have
the same length.
"""
if len(reference) != len(test):
raise ValueError("Lists must have the same length.")
# Get a list of all values.
if sort_by_count:
ref_fdist = FreqDist(reference)
test_fdist = FreqDist(test)
def key(v):
return -(ref_fdist[v] + test_fdist[v])
values = sorted(set(reference + test), key=key)
else:
values = sorted(set(reference + test))
# Construct a value->index dictionary
indices = {val: i for (i, val) in enumerate(values)}
# Make a confusion matrix table.
confusion = [[0 for _ in values] for _ in values]
max_conf = 0 # Maximum confusion
for w, g in zip(reference, test):
confusion[indices[w]][indices[g]] += 1
max_conf = max(max_conf, confusion[indices[w]][indices[g]])
#: A list of all values in ``reference`` or ``test``.
self._values = values
#: A dictionary mapping values in ``self._values`` to their indices.
self._indices = indices
#: The confusion matrix itself (as a list of lists of counts).
self._confusion = confusion
#: The greatest count in ``self._confusion`` (used for printing).
self._max_conf = max_conf
#: The total number of values in the confusion matrix.
self._total = len(reference)
#: The number of correct (on-diagonal) values in the matrix.
self._correct = sum(confusion[i][i] for i in range(len(values)))
def __getitem__(self, li_lj_tuple):
"""
:return: The number of times that value ``li`` was expected and
value ``lj`` was given.
:rtype: int
"""
(li, lj) = li_lj_tuple
i = self._indices[li]
j = self._indices[lj]
return self._confusion[i][j]
def __repr__(self):
return f"<ConfusionMatrix: {self._correct}/{self._total} correct>"
def __str__(self):
return self.pretty_format()
def pretty_format(
self,
show_percents=False,
values_in_chart=True,
truncate=None,
sort_by_count=False,
):
"""
:return: A multi-line string representation of this confusion matrix.
:type truncate: int
:param truncate: If specified, then only show the specified
number of values. Any sorting (e.g., sort_by_count)
will be performed before truncation.
:param sort_by_count: If true, then sort by the count of each
label in the reference data. I.e., labels that occur more
frequently in the reference label will be towards the left
edge of the matrix, and labels that occur less frequently
will be towards the right edge.
@todo: add marginals?
"""
confusion = self._confusion
values = self._values
if sort_by_count:
values = sorted(
values, key=lambda v: -sum(self._confusion[self._indices[v]])
)
if truncate:
values = values[:truncate]
if values_in_chart:
value_strings = ["%s" % val for val in values]
else:
value_strings = [str(n + 1) for n in range(len(values))]
# Construct a format string for row values
valuelen = max(len(val) for val in value_strings)
value_format = "%" + repr(valuelen) + "s | "
# Construct a format string for matrix entries
if show_percents:
entrylen = 6
entry_format = "%5.1f%%"
zerostr = " ."
else:
entrylen = len(repr(self._max_conf))
entry_format = "%" + repr(entrylen) + "d"
zerostr = " " * (entrylen - 1) + "."
# Write the column values.
s = ""
for i in range(valuelen):
s += (" " * valuelen) + " |"
for val in value_strings:
if i >= valuelen - len(val):
s += val[i - valuelen + len(val)].rjust(entrylen + 1)
else:
s += " " * (entrylen + 1)
s += " |\n"
# Write a dividing line
s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
# Write the entries.
for val, li in zip(value_strings, values):
i = self._indices[li]
s += value_format % val
for lj in values:
j = self._indices[lj]
if confusion[i][j] == 0:
s += zerostr
elif show_percents:
s += entry_format % (100.0 * confusion[i][j] / self._total)
else:
s += entry_format % confusion[i][j]
if i == j:
prevspace = s.rfind(" ")
s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
else:
s += " "
s += "|\n"
# Write a dividing line
s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
# Write a key
s += "(row = reference; col = test)\n"
if not values_in_chart:
s += "Value key:\n"
for i, value in enumerate(values):
s += "%6d: %s\n" % (i + 1, value)
return s
def key(self):
values = self._values
str = "Value key:\n"
indexlen = len(repr(len(values) - 1))
key_format = " %" + repr(indexlen) + "d: %s\n"
str += "".join([key_format % (i, values[i]) for i in range(len(values))])
return str
def recall(self, value):
"""Given a value in the confusion matrix, return the recall
that corresponds to this value. The recall is defined as:
- *r* = true positive / (true positive + false negative)
and can loosely be considered the ratio of how often ``value``
was predicted correctly relative to how often ``value`` was
the true result.
:param value: value used in the ConfusionMatrix
:return: the recall corresponding to ``value``.
:rtype: float
"""
# Number of times `value` was correct, and also predicted
TP = self[value, value]
# Number of times `value` was correct
TP_FN = sum(self[value, pred_value] for pred_value in self._values)
if TP_FN == 0:
return 0.0
return TP / TP_FN
def precision(self, value):
"""Given a value in the confusion matrix, return the precision
that corresponds to this value. The precision is defined as:
- *p* = true positive / (true positive + false positive)
and can loosely be considered the ratio of how often ``value``
was predicted correctly relative to the number of predictions
for ``value``.
:param value: value used in the ConfusionMatrix
:return: the precision corresponding to ``value``.
:rtype: float
"""
# Number of times `value` was correct, and also predicted
TP = self[value, value]
# Number of times `value` was predicted
TP_FP = sum(self[real_value, value] for real_value in self._values)
if TP_FP == 0:
return 0.0
return TP / TP_FP
def f_measure(self, value, alpha=0.5):
"""
Given a value used in the confusion matrix, return the f-measure
that corresponds to this value. The f-measure is the harmonic mean
of the ``precision`` and ``recall``, weighted by ``alpha``.
In particular, given the precision *p* and recall *r* defined by:
- *p* = true positive / (true positive + false positive)
- *r* = true positive / (true positive + false negative)
The f-measure is:
- *1/(alpha/p + (1-alpha)/r)*
With ``alpha = 0.5``, this reduces to:
- *2pr / (p + r)*
:param value: value used in the ConfusionMatrix
:param alpha: Ratio of the cost of false negatives compared to false
positives. Defaults to 0.5, where the costs are equal.
:type alpha: float
:return: the F-measure corresponding to ``value``.
:rtype: float
"""
p = self.precision(value)
r = self.recall(value)
if p == 0.0 or r == 0.0:
return 0.0
return 1.0 / (alpha / p + (1 - alpha) / r)
def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False):
"""
Tabulate the **recall**, **precision** and **f-measure**
for each value in this confusion matrix.
>>> reference = "DET NN VB DET JJ NN NN IN DET NN".split()
>>> test = "DET VB VB DET NN NN NN IN DET NN".split()
>>> cm = ConfusionMatrix(reference, test)
>>> print(cm.evaluate())
Tag | Prec. | Recall | F-measure
----+--------+--------+-----------
DET | 1.0000 | 1.0000 | 1.0000
IN | 1.0000 | 1.0000 | 1.0000
JJ | 0.0000 | 0.0000 | 0.0000
NN | 0.7500 | 0.7500 | 0.7500
VB | 0.5000 | 1.0000 | 0.6667
<BLANKLINE>
:param alpha: Ratio of the cost of false negatives compared to false
positives, as used in the f-measure computation. Defaults to 0.5,
where the costs are equal.
:type alpha: float
:param truncate: If specified, then only show the specified
number of values. Any sorting (e.g., sort_by_count)
will be performed before truncation. Defaults to None
:type truncate: int, optional
:param sort_by_count: Whether to sort the outputs on frequency
in the reference label. Defaults to False.
:type sort_by_count: bool, optional
:return: A tabulated recall, precision and f-measure string
:rtype: str
"""
tags = self._values
# Apply keyword parameters
if sort_by_count:
tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]]))
if truncate:
tags = tags[:truncate]
tag_column_len = max(max(len(tag) for tag in tags), 3)
# Construct the header
s = (
f"{' ' * (tag_column_len - 3)}Tag | Prec. | Recall | F-measure\n"
f"{'-' * tag_column_len}-+--------+--------+-----------\n"
)
# Construct the body
for tag in tags:
s += (
f"{tag:>{tag_column_len}} | "
f"{self.precision(tag):<6.4f} | "
f"{self.recall(tag):<6.4f} | "
f"{self.f_measure(tag, alpha=alpha):.4f}\n"
)
return s
def demo():
reference = "DET NN VB DET JJ NN NN IN DET NN".split()
test = "DET VB VB DET NN NN NN IN DET NN".split()
print("Reference =", reference)
print("Test =", test)
print("Confusion matrix:")
print(ConfusionMatrix(reference, test))
print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))
print(ConfusionMatrix(reference, test).recall("VB"))
if __name__ == "__main__":
demo()
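# A follow-up example to demo(), using the same reference/test tagging data;
# the per-label figures agree with the evaluate() table in the class docstring
# above.
#
#     >>> cm = ConfusionMatrix("DET NN VB DET JJ NN NN IN DET NN".split(),
#     ...                      "DET VB VB DET NN NN NN IN DET NN".split())
#     >>> cm.precision("NN"), cm.recall("NN")
#     (0.75, 0.75)
#     >>> cm.recall("VB")
#     1.0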


@@ -0,0 +1,508 @@
# Natural Language Toolkit: Distance Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Tom Lippincott <tom@cs.columbia.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Distance Metrics.
Compute the distance between two items (usually strings).
As metrics, they must satisfy the following three requirements:
1. d(a, a) = 0
2. d(a, b) >= 0
3. d(a, c) <= d(a, b) + d(b, c)
"""
import operator
import warnings
def _edit_dist_init(len1, len2):
lev = []
for i in range(len1):
lev.append([0] * len2) # initialize 2D array to zero
for i in range(len1):
lev[i][0] = i # column 0: 0,1,2,3,4,...
for j in range(len2):
lev[0][j] = j # row 0: 0,1,2,3,4,...
return lev
def _last_left_t_init(sigma):
return {c: 0 for c in sigma}
def _edit_dist_step(
lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False
):
c1 = s1[i - 1]
c2 = s2[j - 1]
# skipping a character in s1
a = lev[i - 1][j] + 1
# skipping a character in s2
b = lev[i][j - 1] + 1
# substitution
c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0)
# transposition
d = c + 1 # never picked by default
if transpositions and last_left > 0 and last_right > 0:
d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1
# pick the cheapest
lev[i][j] = min(a, b, c, d)
def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
"""
Calculate the Levenshtein edit-distance between two strings.
The edit distance is the number of characters that need to be
substituted, inserted, or deleted, to transform s1 into s2. For
example, transforming "rain" to "shine" requires three steps,
consisting of two substitutions and one insertion:
"rain" -> "sain" -> "shin" -> "shine". These operations could have
been done in other orders, but at least three steps are needed.
Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
because sometimes it makes sense to assign greater penalties to
substitutions.
This also optionally allows transposition edits (e.g., "ab" -> "ba"),
though this is disabled by default.
:param s1, s2: The strings to be analysed
:param transpositions: Whether to allow transposition edits
:type s1: str
:type s2: str
:type substitution_cost: int
:type transpositions: bool
:rtype: int
"""
# set up a 2-D array
len1 = len(s1)
len2 = len(s2)
lev = _edit_dist_init(len1 + 1, len2 + 1)
# retrieve alphabet
sigma = set()
sigma.update(s1)
sigma.update(s2)
# set up table to remember positions of last seen occurrence in s1
last_left_t = _last_left_t_init(sigma)
# iterate over the array
# i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code
# see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
for i in range(1, len1 + 1):
last_right_buf = 0
for j in range(1, len2 + 1):
last_left = last_left_t[s2[j - 1]]
last_right = last_right_buf
if s1[i - 1] == s2[j - 1]:
last_right_buf = j
_edit_dist_step(
lev,
i,
j,
s1,
s2,
last_left,
last_right,
substitution_cost=substitution_cost,
transpositions=transpositions,
)
last_left_t[s1[i - 1]] = i
return lev[len1][len2]
def _edit_dist_backtrace(lev):
i, j = len(lev) - 1, len(lev[0]) - 1
alignment = [(i, j)]
while (i, j) != (0, 0):
directions = [
(i - 1, j - 1), # substitution
(i - 1, j), # skip s1
(i, j - 1), # skip s2
]
direction_costs = (
(lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
for i, j in directions
)
_, (i, j) = min(direction_costs, key=operator.itemgetter(0))
alignment.append((i, j))
return list(reversed(alignment))
def edit_distance_align(s1, s2, substitution_cost=1):
"""
Calculate the minimum Levenshtein edit-distance based alignment
mapping between two strings. The alignment finds the mapping
from string s1 to s2 that minimizes the edit distance cost.
For example, mapping "rain" to "shine" would involve 2
substitutions, 2 matches and an insertion resulting in
the following mapping:
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
NB: (0, 0) is the start state without any letters associated
See more: https://web.stanford.edu/class/cs124/lec/med.pdf
In case of multiple valid minimum-distance alignments, the
backtrace has the following operation precedence:
1. Substitute s1 and s2 characters
2. Skip s1 character
3. Skip s2 character
The backtrace is carried out in reverse string order.
This function does not support transposition.
:param s1, s2: The strings to be aligned
:type s1: str
:type s2: str
:type substitution_cost: int
:rtype: List[Tuple(int, int)]
"""
# set up a 2-D array
len1 = len(s1)
len2 = len(s2)
lev = _edit_dist_init(len1 + 1, len2 + 1)
# iterate over the array
for i in range(len1):
for j in range(len2):
_edit_dist_step(
lev,
i + 1,
j + 1,
s1,
s2,
0,
0,
substitution_cost=substitution_cost,
transpositions=False,
)
# backtrace to find alignment
alignment = _edit_dist_backtrace(lev)
return alignment
def binary_distance(label1, label2):
"""Simple equality test.
0.0 if the labels are identical, 1.0 if they are different.
>>> from nltk.metrics import binary_distance
>>> binary_distance(1,1)
0.0
>>> binary_distance(1,3)
1.0
"""
return 0.0 if label1 == label2 else 1.0
def jaccard_distance(label1, label2):
"""Distance metric comparing set-similarity."""
return (len(label1.union(label2)) - len(label1.intersection(label2))) / len(
label1.union(label2)
)
def masi_distance(label1, label2):
"""Distance metric that takes into account partial agreement when multiple
labels are assigned.
>>> from nltk.metrics import masi_distance
>>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
0.665
Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
for Semantic and Pragmatic Annotation.
"""
len_intersection = len(label1.intersection(label2))
len_union = len(label1.union(label2))
len_label1 = len(label1)
len_label2 = len(label2)
if len_label1 == len_label2 and len_label1 == len_intersection:
m = 1
elif len_intersection == min(len_label1, len_label2):
m = 0.67
elif len_intersection > 0:
m = 0.33
else:
m = 0
return 1 - len_intersection / len_union * m
def interval_distance(label1, label2):
"""Krippendorff's interval distance metric
>>> from nltk.metrics import interval_distance
>>> interval_distance(1,10)
81
Krippendorff 1980, Content Analysis: An Introduction to its Methodology
"""
try:
return pow(label1 - label2, 2)
# return pow(list(label1)[0]-list(label2)[0],2)
except TypeError as e:
raise ValueError("non-numeric labels not supported with interval distance") from e
def presence(label):
"""Higher-order function to test presence of a given label"""
return lambda x, y: 1.0 * ((label in x) == (label in y))
def fractional_presence(label):
return (
lambda x, y: abs((1.0 / len(x)) - (1.0 / len(y))) * (label in x and label in y)
or 0.0 * (label not in x and label not in y)
or abs(1.0 / len(x)) * (label in x and label not in y)
or (1.0 / len(y)) * (label not in x and label in y)
)
def custom_distance(file):
data = {}
with open(file) as infile:
for l in infile:
labelA, labelB, dist = l.strip().split("\t")
labelA = frozenset([labelA])
labelB = frozenset([labelB])
data[frozenset([labelA, labelB])] = float(dist)
return lambda x, y: data[frozenset([x, y])]
def jaro_similarity(s1, s2):
"""
Computes the Jaro similarity between 2 sequences from:
Matthew A. Jaro (1989). Advances in record linkage methodology
as applied to the 1985 census of Tampa Florida. Journal of the
American Statistical Association. 84 (406): 414-20.
The Jaro distance between two strings is the minimum number of single-character
transpositions required to change one word into the other. The Jaro similarity
formula from
https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :
``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/|s_2| + (m-t)/m)``
where
- `|s_i|` is the length of string `s_i`
- `m` is the no. of matching characters
- `t` is half the number of transpositions.
"""
# First, store the length of the strings
# because they will be re-used several times.
len_s1, len_s2 = len(s1), len(s2)
# The upper bound of the distance for being a matched character.
match_bound = max(len_s1, len_s2) // 2 - 1
# Initialize the counts for matches and transpositions.
matches = 0  # no. of matched characters in s1 and s2
transpositions = 0 # no. of transpositions between s1 and s2
flagged_1 = [] # positions in s1 which are matches to some character in s2
flagged_2 = [] # positions in s2 which are matches to some character in s1
# Iterate through sequences, check for matches and compute transpositions.
for i in range(len_s1): # Iterate through each character.
upperbound = min(i + match_bound, len_s2 - 1)
lowerbound = max(0, i - match_bound)
for j in range(lowerbound, upperbound + 1):
if s1[i] == s2[j] and j not in flagged_2:
matches += 1
flagged_1.append(i)
flagged_2.append(j)
break
flagged_2.sort()
for i, j in zip(flagged_1, flagged_2):
if s1[i] != s2[j]:
transpositions += 1
if matches == 0:
return 0
else:
return (
1
/ 3
* (
matches / len_s1
+ matches / len_s2
+ (matches - transpositions // 2) / matches
)
)
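# A quick check of the formula above, using one of the Winkler test pairs
# quoted in jaro_winkler_similarity() below: "billy" vs "bill" has four
# matching characters and no transpositions, giving (4/5 + 4/4 + 4/4) / 3.
#
#     >>> round(jaro_similarity("billy", "bill"), 3)
#     0.933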
def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
"""
The Jaro Winkler distance is an extension of the Jaro similarity in:
William E. Winkler. 1990. String Comparator Metrics and Enhanced
Decision Rules in the Fellegi-Sunter Model of Record Linkage.
Proceedings of the Section on Survey Research Methods.
American Statistical Association: 354-359.
such that:
jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )
where,
- jaro_sim is the output from the Jaro Similarity,
see jaro_similarity()
- l is the length of common prefix at the start of the string
- this implementation provides an upper bound (``max_l``) on the value of l
that is taken into account; a common value for this upper bound is 4.
- p is the constant scaling factor to overweigh common prefixes.
The Jaro-Winkler similarity will fall within the [0, 1] bound,
given that max(p) <= 0.25; the default is p=0.1, as in Winkler (1990)
Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
from "Table 5 Comparison of String Comparators Rescaled between 0 and 1"
>>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
... ("dixon", "dickson"), ("billy", "susan")]
>>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
>>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]
One way to match the values on the Winkler's paper is to provide a different
p scaling factor for different pairs of strings, e.g.
>>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]
>>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
... assert round(jaro_similarity(s1, s2), 3) == jscore
... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
"Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names"
>>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]
>>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]
>>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]
One way to match the values on the Winkler's paper is to provide a different
p scaling factor for different pairs of strings, e.g.
>>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
>>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
... if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
... continue # Skip bad examples from the paper.
... assert round(jaro_similarity(s1, s2), 3) == jscore
... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
This test case shows that the output of the Jaro-Winkler similarity depends on
the product l * p and not on the product max_l * p: here max_l * p > 1,
but l * p <= 1.
>>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
0.88
"""
# To ensure that the output of the Jaro-Winkler similarity
# falls within [0, 1], the product l * p also needs to
# fall within [0, 1].
if not 0 <= max_l * p <= 1:
warnings.warn(
"The product `max_l * p` might not fall between [0,1]. "
"Jaro-Winkler similarity might not be between 0 and 1."
)
# Compute the Jaro similarity
jaro_sim = jaro_similarity(s1, s2)
# Compute the length of the common prefix, capped at max_l
# (zip() stops at the end of the shorter string anyway).
l = 0
# zip() will automatically loop until the end of shorter string.
for s1_i, s2_i in zip(s1, s2):
if s1_i == s2_i:
l += 1
else:
break
if l == max_l:
break
# Return the similarity value as described in docstring.
return jaro_sim + (l * p * (1 - jaro_sim))
def demo():
string_distance_examples = [
("rain", "shine"),
("abcdef", "acbdef"),
("language", "lnaguaeg"),
("language", "lnaugage"),
("language", "lngauage"),
]
for s1, s2 in string_distance_examples:
print(f"Edit distance btwn '{s1}' and '{s2}':", edit_distance(s1, s2))
print(
f"Edit dist with transpositions btwn '{s1}' and '{s2}':",
edit_distance(s1, s2, transpositions=True),
)
print(f"Jaro similarity btwn '{s1}' and '{s2}':", jaro_similarity(s1, s2))
print(
f"Jaro-Winkler similarity btwn '{s1}' and '{s2}':",
jaro_winkler_similarity(s1, s2),
)
print(
f"Jaro-Winkler distance btwn '{s1}' and '{s2}':",
1 - jaro_winkler_similarity(s1, s2),
)
s1 = {1, 2, 3, 4}
s2 = {3, 4, 5}
print("s1:", s1)
print("s2:", s2)
print("Binary distance:", binary_distance(s1, s2))
print("Jaccard distance:", jaccard_distance(s1, s2))
print("MASI distance:", masi_distance(s1, s2))
if __name__ == "__main__":
demo()
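# A further sketch of the transposition option, using one of the pairs from
# string_distance_examples above: "abcdef" vs "acbdef" differs by a single
# adjacent swap, so the plain edit distance is 2 but only 1 when transpositions
# are allowed.
#
#     >>> edit_distance("abcdef", "acbdef")
#     2
#     >>> edit_distance("abcdef", "acbdef", transpositions=True)
#     1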


@@ -0,0 +1,389 @@
# Natural Language Toolkit: Agreement Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Lauri Hallila <laurihallila@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""Counts Paice's performance statistics for evaluating stemming algorithms.
What is required:
- A dictionary of words grouped by their real lemmas
- A dictionary of words grouped by stems from a stemming algorithm
When these are given, Understemming Index (UI), Overstemming Index (OI),
Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted.
References:
Chris D. Paice (1994). An evaluation method for stemming algorithms.
In Proceedings of SIGIR, 42--50.
"""
from math import sqrt
def get_words_from_dictionary(lemmas):
"""
Get original set of words used for analysis.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:type lemmas: dict(str): list(str)
:return: Set of words that exist as values in the dictionary
:rtype: set(str)
"""
words = set()
for lemma in lemmas:
words.update(set(lemmas[lemma]))
return words
def _truncate(words, cutlength):
"""Group words by stems defined by truncating them at given length.
:param words: Set of words used for analysis
:param cutlength: Words are stemmed by cutting at this length.
:type words: set(str) or list(str)
:type cutlength: int
:return: Dictionary where keys are stems and values are sets of words
corresponding to that stem.
:rtype: dict(str): set(str)
"""
stems = {}
for word in words:
stem = word[:cutlength]
try:
stems[stem].update([word])
except KeyError:
stems[stem] = {word}
return stems
# Reference: https://en.wikipedia.org/wiki/Line-line_intersection
def _count_intersection(l1, l2):
"""Count intersection between two line segments defined by coordinate pairs.
:param l1: Tuple of two coordinate pairs defining the first line segment
:param l2: Tuple of two coordinate pairs defining the second line segment
:type l1: tuple(tuple(float, float), tuple(float, float))
:type l2: tuple(tuple(float, float), tuple(float, float))
:return: Coordinates of the intersection
:rtype: tuple(float, float)
"""
x1, y1 = l1[0]
x2, y2 = l1[1]
x3, y3 = l2[0]
x4, y4 = l2[1]
denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
if denominator == 0.0: # lines are parallel
if x1 == x2 == x3 == x4 == 0.0:
# When lines are parallel, they must be on the y-axis.
# We can ignore x-axis because we stop counting the
# truncation line when we get there.
# There are no other options as UI (x-axis) grows and
# OI (y-axis) diminishes when we go along the truncation line.
return (0.0, y4)
x = (
(x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)
) / denominator
y = (
(x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)
) / denominator
return (x, y)
def _get_derivative(coordinates):
"""Get derivative of the line from (0,0) to given coordinates.
:param coordinates: A coordinate pair
:type coordinates: tuple(float, float)
:return: Derivative; inf if x is zero
:rtype: float
"""
try:
return coordinates[1] / coordinates[0]
except ZeroDivisionError:
return float("inf")
def _calculate_cut(lemmawords, stems):
"""Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.
:param lemmawords: Set or list of words corresponding to certain lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmawords: set(str) or list(str)
:type stems: dict(str): set(str)
:return: Amount of understemmed and overstemmed pairs contributed by words
existing in both lemmawords and stems.
:rtype: tuple(float, float)
"""
umt, wmt = 0.0, 0.0
for stem in stems:
cut = set(lemmawords) & set(stems[stem])
if cut:
cutcount = len(cut)
stemcount = len(stems[stem])
# Unachieved merge total
umt += cutcount * (len(lemmawords) - cutcount)
# Wrongly merged total
wmt += cutcount * (stemcount - cutcount)
return (umt, wmt)
def _calculate(lemmas, stems):
"""Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict(str): list(str)
:type stems: dict(str): set(str)
:return: Global unachieved merge total (gumt),
global desired merge total (gdmt),
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:rtype: tuple(float, float, float, float)
"""
n = sum(len(lemmas[word]) for word in lemmas)
gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0)
for lemma in lemmas:
lemmacount = len(lemmas[lemma])
# Desired merge total
gdmt += lemmacount * (lemmacount - 1)
# Desired non-merge total
gdnt += lemmacount * (n - lemmacount)
# For each (lemma, stem) pair with common words, count how many
# pairs are understemmed and overstemmed.
umt, wmt = _calculate_cut(lemmas[lemma], stems)
        # Add to global unachieved and wrongly-merged totals
gumt += umt
gwmt += wmt
    # Each word pair is counted twice, so divide by two
return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2)
def _indexes(gumt, gdmt, gwmt, gdnt):
"""Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
:param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
global desired merge total (gdmt),
global wrongly merged total (gwmt) and
global desired non-merge total (gdnt).
:type gumt, gdmt, gwmt, gdnt: float
:return: Understemming Index (UI),
Overstemming Index (OI) and
Stemming Weight (SW).
:rtype: tuple(float, float, float)
"""
# Calculate Understemming Index (UI),
# Overstemming Index (OI) and Stemming Weight (SW)
try:
ui = gumt / gdmt
except ZeroDivisionError:
# If GDMT (max merge total) is 0, define UI as 0
ui = 0.0
try:
oi = gwmt / gdnt
except ZeroDivisionError:
# IF GDNT (max non-merge total) is 0, define OI as 0
oi = 0.0
try:
sw = oi / ui
except ZeroDivisionError:
if oi == 0.0:
# OI and UI are 0, define SW as 'not a number'
sw = float("nan")
else:
# UI is 0, define SW as infinity
sw = float("inf")
return (ui, oi, sw)
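# A minimal worked example (illustrative addition, not part of the original
# module), using the same toy data as demo() below to show the intermediate
# totals behind UI, OI and SW:
#
#   >>> lemmas = {"kneel": ["kneel", "knelt"],
#   ...           "range": ["range", "ranged"],
#   ...           "ring": ["ring", "rang", "rung"]}
#   >>> stems = {"kneel": ["kneel"], "knelt": ["knelt"],
#   ...          "rang": ["rang", "range", "ranged"],
#   ...          "ring": ["ring"], "rung": ["rung"]}
#   >>> _calculate(lemmas, stems)
#   (4.0, 5.0, 2.0, 16.0)
#   >>> ui, oi, sw = _indexes(4.0, 5.0, 2.0, 16.0)
#   >>> (ui, oi, round(sw, 5))
#   (0.8, 0.125, 0.15625)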
class Paice:
"""Class for storing lemmas, stems and evaluation metrics."""
def __init__(self, lemmas, stems):
"""
:param lemmas: A dictionary where keys are lemmas and values are sets
or lists of words corresponding to that lemma.
:param stems: A dictionary where keys are stems and values are sets
or lists of words corresponding to that stem.
:type lemmas: dict(str): list(str)
:type stems: dict(str): set(str)
"""
self.lemmas = lemmas
self.stems = stems
self.coords = []
self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None)
self.ui, self.oi, self.sw = (None, None, None)
self.errt = None
self.update()
def __str__(self):
text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt]
text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt)
text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt)
text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt)
text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui)
text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi)
text.append("Stemming Weight (OI / UI): %s\n" % self.sw)
text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt)
coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
text.append("Truncation line: %s" % coordinates)
return "".join(text)
def _get_truncation_indexes(self, words, cutlength):
"""Count (UI, OI) when stemming is done by truncating words at \'cutlength\'.
:param words: Words used for the analysis
:param cutlength: Words are stemmed by cutting them at this length
:type words: set(str) or list(str)
:type cutlength: int
:return: Understemming and overstemming indexes
        :rtype: tuple(float, float)
"""
truncated = _truncate(words, cutlength)
gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2]
return (ui, oi)
def _get_truncation_coordinates(self, cutlength=0):
"""Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
:param cutlength: Optional parameter to start counting from (ui, oi)
coordinates gotten by stemming at this length. Useful for speeding up
the calculations when you know the approximate location of the
intersection.
:type cutlength: int
:return: List of coordinate pairs that define the truncation line
:rtype: list(tuple(float, float))
"""
words = get_words_from_dictionary(self.lemmas)
maxlength = max(len(word) for word in words)
# Truncate words from different points until (0, 0) - (ui, oi) segment crosses the truncation line
coords = []
while cutlength <= maxlength:
# Get (UI, OI) pair of current truncation point
pair = self._get_truncation_indexes(words, cutlength)
# Store only new coordinates so we'll have an actual
# line segment when counting the intersection point
if pair not in coords:
coords.append(pair)
if pair == (0.0, 0.0):
                # Stop counting if the truncation line goes through the origin;
                # the length from the origin to the truncation line is 0
return coords
if len(coords) >= 2 and pair[0] > 0.0:
derivative1 = _get_derivative(coords[-2])
derivative2 = _get_derivative(coords[-1])
# Derivative of the truncation line is a decreasing value;
# when it passes Stemming Weight, we've found the segment
# of truncation line intersecting with (0, 0) - (ui, oi) segment
if derivative1 >= self.sw >= derivative2:
return coords
cutlength += 1
return coords
def _errt(self):
"""Count Error-Rate Relative to Truncation (ERRT).
:return: ERRT, length of the line from origo to (UI, OI) divided by
the length of the line from origo to the point defined by the same
line when extended until the truncation line.
:rtype: float
"""
        # Compute (UI, OI) pairs for truncation points until we find the truncation-line segment crossed by the line from the origin through (ui, oi)
self.coords = self._get_truncation_coordinates()
if (0.0, 0.0) in self.coords:
            # The truncation line goes through the origin, so ERRT cannot be computed
if (self.ui, self.oi) != (0.0, 0.0):
return float("inf")
else:
return float("nan")
if (self.ui, self.oi) == (0.0, 0.0):
            # (ui, oi) is the origin; define ERRT as 0.0
return 0.0
        # Compute the intersection point
        # Note that (self.ui, self.oi) cannot be (0.0, 0.0) here and self.coords
        # contains at least two distinct points, so we intersect two actual line segments
intersection = _count_intersection(
((0, 0), (self.ui, self.oi)), self.coords[-2:]
)
        # Compute OP (length of the line from the origin to (ui, oi))
op = sqrt(self.ui**2 + self.oi**2)
        # Compute OT (length of the line from the origin to the truncation line, along the line through (ui, oi))
ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2)
# OP / OT tells how well the stemming algorithm works compared to just truncating words
return op / ot
def update(self):
"""Update statistics after lemmas and stems have been set."""
self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
self.errt = self._errt()
def demo():
"""Demonstration of the module."""
# Some words with their real lemmas
lemmas = {
"kneel": ["kneel", "knelt"],
"range": ["range", "ranged"],
"ring": ["ring", "rang", "rung"],
}
# Same words with stems from a stemming algorithm
stems = {
"kneel": ["kneel"],
"knelt": ["knelt"],
"rang": ["rang", "range", "ranged"],
"ring": ["ring"],
"rung": ["rung"],
}
print("Words grouped by their lemmas:")
for lemma in sorted(lemmas):
print("{} => {}".format(lemma, " ".join(lemmas[lemma])))
print()
print("Same words grouped by a stemming algorithm:")
for stem in sorted(stems):
print("{} => {}".format(stem, " ".join(stems[stem])))
print()
p = Paice(lemmas, stems)
print(p)
print()
# Let's "change" results from a stemming algorithm
stems = {
"kneel": ["kneel"],
"knelt": ["knelt"],
"rang": ["rang"],
"range": ["range", "ranged"],
"ring": ["ring"],
"rung": ["rung"],
}
print("Counting stats after changing stemming results:")
for stem in sorted(stems):
print("{} => {}".format(stem, " ".join(stems[stem])))
print()
p.stems = stems
p.update()
print(p)
if __name__ == "__main__":
demo()


@@ -0,0 +1,228 @@
# Natural Language Toolkit: Evaluation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import operator
from functools import reduce
from math import fabs
from random import shuffle
try:
from scipy.stats.stats import betai
except ImportError:
betai = None
from nltk.util import LazyConcatenation, LazyMap
def accuracy(reference, test):
"""
Given a list of reference values and a corresponding list of test
values, return the fraction of corresponding values that are
equal. In particular, return the fraction of indices
    ``0<=i<len(test)`` such that ``test[i] == reference[i]``.
:type reference: list
:param reference: An ordered list of reference values.
:type test: list
:param test: A list of values to compare against the corresponding
reference values.
    :raise ValueError: If ``reference`` and ``test`` do not have the
same length.
"""
if len(reference) != len(test):
raise ValueError("Lists must have the same length.")
return sum(x == y for x, y in zip(reference, test)) / len(test)
def precision(reference, test):
"""
Given a set of reference values and a set of test values, return
the fraction of test values that appear in the reference set.
In particular, return card(``reference`` intersection ``test``)/card(``test``).
If ``test`` is empty, then return None.
:type reference: set
:param reference: A set of reference values.
:type test: set
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
raise TypeError("reference and test should be sets")
if len(test) == 0:
return None
else:
return len(reference.intersection(test)) / len(test)
def recall(reference, test):
"""
Given a set of reference values and a set of test values, return
the fraction of reference values that appear in the test set.
In particular, return card(``reference`` intersection ``test``)/card(``reference``).
If ``reference`` is empty, then return None.
:type reference: set
:param reference: A set of reference values.
:type test: set
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
if not hasattr(reference, "intersection") or not hasattr(test, "intersection"):
raise TypeError("reference and test should be sets")
if len(reference) == 0:
return None
else:
return len(reference.intersection(test)) / len(reference)
def f_measure(reference, test, alpha=0.5):
"""
Given a set of reference values and a set of test values, return
the f-measure of the test values, when compared against the
reference values. The f-measure is the harmonic mean of the
``precision`` and ``recall``, weighted by ``alpha``. In particular,
given the precision *p* and recall *r* defined by:
- *p* = card(``reference`` intersection ``test``)/card(``test``)
- *r* = card(``reference`` intersection ``test``)/card(``reference``)
The f-measure is:
- *1/(alpha/p + (1-alpha)/r)*
If either ``reference`` or ``test`` is empty, then ``f_measure``
returns None.
:type reference: set
:param reference: A set of reference values.
:type test: set
:param test: A set of values to compare against the reference set.
:rtype: float or None
"""
p = precision(reference, test)
r = recall(reference, test)
if p is None or r is None:
return None
if p == 0 or r == 0:
return 0
return 1.0 / (alpha / p + (1 - alpha) / r)
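# A minimal worked example (illustrative addition, not part of the original
# module): with reference={1, 2, 3, 4} and test={3, 4, 5} we get p = 2/3 and
# r = 1/2, so the balanced f-measure is 1 / (0.5/p + 0.5/r) = 1/1.75; with
# alpha=0.2 precision is weighted less and the score moves toward recall.
#
#   >>> round(f_measure({1, 2, 3, 4}, {3, 4, 5}), 4)
#   0.5714
#   >>> round(f_measure({1, 2, 3, 4}, {3, 4, 5}, alpha=0.2), 4)
#   0.5263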
def log_likelihood(reference, test):
"""
Given a list of reference values and a corresponding list of test
probability distributions, return the average log likelihood of
the reference values, given the probability distributions.
:param reference: A list of reference values
:type reference: list
:param test: A list of probability distributions over values to
compare against the corresponding reference values.
:type test: list(ProbDistI)
"""
if len(reference) != len(test):
raise ValueError("Lists must have the same length.")
# Return the average value of dist.logprob(val).
total_likelihood = sum(dist.logprob(val) for (val, dist) in zip(reference, test))
return total_likelihood / len(reference)
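# A minimal usage sketch (illustrative addition, not part of the original
# module), assuming nltk.probability's FreqDist and MLEProbDist, whose
# logprob() is a base-2 log: with p(a)=2/3 and p(b)=1/3 the average
# log-likelihood of ["a", "b"] is (log2(2/3) + log2(1/3)) / 2.
#
#   >>> from nltk.probability import FreqDist, MLEProbDist
#   >>> dist = MLEProbDist(FreqDist("aab"))
#   >>> round(log_likelihood(["a", "b"], [dist, dist]), 4)
#   -1.085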
def approxrand(a, b, **kwargs):
"""
Returns an approximate significance level between two lists of
independently generated test values.
    Approximate randomization estimates significance by randomly drawing
    a sample of the possible permutations. In the limit of sampling all
    possible permutations, the significance level is exact. The approximate
    significance level reported here is the fraction of shuffles whose
    pseudo-statistic is at least as large as the actual statistic of the
    unpermuted argument lists.
    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic was at least as large
             as the actual statistic, and the number of shuffles
:rtype: tuple
:param a: a list of test values
:type a: list
:param b: another list of independently generated test values
:type b: list
"""
shuffles = kwargs.get("shuffles", 999)
# there's no point in trying to shuffle beyond all possible permutations
shuffles = min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
verbose = kwargs.get("verbose", False)
if verbose:
print("shuffles: %d" % shuffles)
actual_stat = fabs(stat(a) - stat(b))
if verbose:
print("actual statistic: %f" % actual_stat)
print("-" * 60)
c = 1e-100
lst = LazyConcatenation([a, b])
indices = list(range(len(a) + len(b)))
for i in range(shuffles):
if verbose and i % 10 == 0:
print("shuffle: %d" % i)
shuffle(indices)
pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[: len(a)]))
pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a) :]))
pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)
if pseudo_stat >= actual_stat:
c += 1
if verbose and i % 10 == 0:
print("pseudo-statistic: %f" % pseudo_stat)
print("significance: %f" % ((c + 1) / (i + 1)))
print("-" * 60)
significance = (c + 1) / (shuffles + 1)
if verbose:
print("significance: %f" % significance)
if betai:
for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}")
return (significance, c, shuffles)
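# A minimal usage sketch (illustrative addition, not part of the original
# module): the shuffle count is capped at the number of possible orderings,
# so two two-element lists allow at most 4! = 24 shuffles; the significance
# itself varies between runs because permutations are drawn at random.
#
#   >>> significance, count, n_shuffles = approxrand([1, 1], [2, 2])
#   >>> n_shuffles
#   24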
def demo():
print("-" * 75)
reference = "DET NN VB DET JJ NN NN IN DET NN".split()
test = "DET VB VB DET NN NN NN IN DET NN".split()
print("Reference =", reference)
print("Test =", test)
print("Accuracy:", accuracy(reference, test))
print("-" * 75)
reference_set = set(reference)
test_set = set(test)
print("Reference =", reference_set)
print("Test = ", test_set)
print("Precision:", precision(reference_set, test_set))
print(" Recall:", recall(reference_set, test_set))
print("F-Measure:", f_measure(reference_set, test_set))
print("-" * 75)
if __name__ == "__main__":
demo()


@@ -0,0 +1,222 @@
# Natural Language Toolkit: Text Segmentation Metrics
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# David Doukhan <david.doukhan@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Text Segmentation Metrics
1. Windowdiff
Pevzner, L., and Hearst, M., A Critique and Improvement of
an Evaluation Metric for Text Segmentation,
Computational Linguistics 28, 19-36
2. Generalized Hamming Distance
Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375
Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
Study describing the benefits of Generalized Hamming Distance versus
WindowDiff for evaluating text segmentation tasks:
Bestgen, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009
3. Pk text segmentation metric
Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
"""
try:
import numpy as np
except ImportError:
pass
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
"""
Compute the windowdiff score for a pair of segmentations. A
segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.
>>> s1 = "000100000010"
>>> s2 = "000010000100"
>>> s3 = "100000010000"
>>> '%.2f' % windowdiff(s1, s1, 3)
'0.00'
>>> '%.2f' % windowdiff(s1, s2, 3)
'0.30'
>>> '%.2f' % windowdiff(s2, s3, 3)
'0.80'
:param seg1: a segmentation
:type seg1: str or list
:param seg2: a segmentation
:type seg2: str or list
:param k: window width
:type k: int
:param boundary: boundary value
:type boundary: str or int or bool
:param weighted: use the weighted variant of windowdiff
:type weighted: boolean
:rtype: float
"""
if len(seg1) != len(seg2):
raise ValueError("Segmentations have unequal length")
if k > len(seg1):
raise ValueError(
"Window width k should be smaller or equal than segmentation lengths"
)
wd = 0
for i in range(len(seg1) - k + 1):
ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
if weighted:
wd += ndiff
else:
wd += min(1, ndiff)
return wd / (len(seg1) - k + 1.0)
# Generalized Hamming Distance
def _init_mat(nrows, ncols, ins_cost, del_cost):
mat = np.empty((nrows, ncols))
mat[0, :] = ins_cost * np.arange(ncols)
mat[:, 0] = del_cost * np.arange(nrows)
return mat
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
for i, rowi in enumerate(rowv):
for j, colj in enumerate(colv):
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
if rowi == colj:
# boundaries are at the same location, no transformation required
tcost = mat[i, j]
elif rowi > colj:
# boundary match through a deletion
tcost = del_cost + mat[i, j + 1]
else:
# boundary match through an insertion
tcost = ins_cost + mat[i + 1, j]
mat[i + 1, j + 1] = min(tcost, shift_cost)
def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
"""
Compute the Generalized Hamming Distance for a reference and a hypothetical
segmentation, corresponding to the cost related to the transformation
of the hypothetical segmentation into the reference segmentation
through boundary insertion, deletion and shift operations.
A segmentation is any sequence over a vocabulary of two items
(e.g. "0", "1"), where the specified boundary value is used to
mark the edge of a segmentation.
    Recommended parameter values are a shift_cost_coeff of 2, with
    ins_cost and del_cost both equal to the mean segment length in the
    reference segmentation.
>>> # Same examples as Kulyukin C++ implementation
>>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
0.5
>>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
2.0
>>> ghd('011', '110', 1.0, 1.0, 0.5)
1.0
>>> ghd('1', '0', 1.0, 1.0, 0.5)
1.0
>>> ghd('111', '000', 1.0, 1.0, 0.5)
3.0
>>> ghd('000', '111', 1.0, 2.0, 0.5)
6.0
:param ref: the reference segmentation
:type ref: str or list
:param hyp: the hypothetical segmentation
:type hyp: str or list
:param ins_cost: insertion cost
:type ins_cost: float
:param del_cost: deletion cost
:type del_cost: float
:param shift_cost_coeff: constant used to compute the cost of a shift.
``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
are the positions indicating the shift
:type shift_cost_coeff: float
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
"""
ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
nref_bound = len(ref_idx)
nhyp_bound = len(hyp_idx)
if nref_bound == 0 and nhyp_bound == 0:
return 0.0
elif nref_bound > 0 and nhyp_bound == 0:
return nref_bound * ins_cost
elif nref_bound == 0 and nhyp_bound > 0:
return nhyp_bound * del_cost
mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
_ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
return float(mat[-1, -1])
# Beeferman's Pk text segmentation evaluation metric
def pk(ref, hyp, k=None, boundary="1"):
"""
    Compute the Pk metric for a pair of segmentations. A segmentation
is any sequence over a vocabulary of two items (e.g. "0", "1"),
where the specified boundary value is used to mark the edge of a
segmentation.
>>> '%.2f' % pk('0100'*100, '1'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0'*400, 2)
'0.50'
>>> '%.2f' % pk('0100'*100, '0100'*100, 2)
'0.00'
:param ref: the reference segmentation
:type ref: str or list
:param hyp: the segmentation to evaluate
:type hyp: str or list
    :param k: window size; if None, set to half of the average reference segment length
    :type k: int
:param boundary: boundary value
:type boundary: str or int or bool
:rtype: float
"""
if k is None:
k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
err = 0
for i in range(len(ref) - k + 1):
r = ref[i : i + k].count(boundary) > 0
h = hyp[i : i + k].count(boundary) > 0
if r != h:
err += 1
return err / (len(ref) - k + 1.0)
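# A minimal usage sketch (illustrative addition, not part of the original
# module): when k is omitted it defaults to half the average reference
# segment length; for '0100' * 100 that is round(400 / (100 * 2)) = 2, so
# the call below matches the k=2 doctest above.
#
#   >>> '%.2f' % pk('0100' * 100, '1' * 400)
#   '0.50'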


@@ -0,0 +1,68 @@
# Natural Language Toolkit: Spearman Rank Correlation
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Tools for comparing ranked lists.
"""
def _rank_dists(ranks1, ranks2):
"""Finds the difference between the values in ranks1 and ranks2 for keys
present in both dicts. If the arguments are not dicts, they are converted
from (key, rank) sequences.
"""
ranks1 = dict(ranks1)
ranks2 = dict(ranks2)
for k in ranks1:
try:
yield k, ranks1[k] - ranks2[k]
except KeyError:
pass
def spearman_correlation(ranks1, ranks2):
"""Returns the Spearman correlation coefficient for two rankings, which
should be dicts or sequences of (key, rank). The coefficient ranges from
-1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
calculated for keys in both rankings (for meaningful results, remove keys
present in only one list before ranking)."""
n = 0
res = 0
for k, d in _rank_dists(ranks1, ranks2):
res += d * d
n += 1
try:
return 1 - (6 * res / (n * (n * n - 1)))
except ZeroDivisionError:
# Result is undefined if only one item is ranked
return 0.0
def ranks_from_sequence(seq):
"""Given a sequence, yields each element with an increasing rank, suitable
for use as an argument to ``spearman_correlation``.
"""
return ((k, i) for i, k in enumerate(seq))
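# A minimal usage sketch (illustrative addition, not part of the original
# module): identical orderings give a coefficient of 1.0 and a fully
# reversed ordering gives -1.0.
#
#   >>> spearman_correlation(ranks_from_sequence("abcde"),
#   ...                      ranks_from_sequence("abcde"))
#   1.0
#   >>> spearman_correlation(ranks_from_sequence("abcde"),
#   ...                      ranks_from_sequence(reversed("abcde")))
#   -1.0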
def ranks_from_scores(scores, rank_gap=1e-15):
"""Given a sequence of (key, score) tuples, yields each key with an
increasing rank, tying with previous key's rank if the difference between
their scores is less than rank_gap. Suitable for use as an argument to
``spearman_correlation``.
"""
prev_score = None
rank = 0
for i, (key, score) in enumerate(scores):
try:
if abs(score - prev_score) > rank_gap:
rank = i
except TypeError:
pass
yield key, rank
prev_score = score
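# A minimal usage sketch (illustrative addition, not part of the original
# module): scores closer together than rank_gap share a rank, so the two
# middle items below tie at rank 1.
#
#   >>> list(ranks_from_scores([("a", 0.9), ("b", 0.5), ("c", 0.5), ("d", 0.1)]))
#   [('a', 0), ('b', 1), ('c', 1), ('d', 3)]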