updates
This commit is contained in:
@@ -0,0 +1,222 @@
|
||||
# Natural Language Toolkit: Text Segmentation Metrics
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# David Doukhan <david.doukhan@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
"""
|
||||
Text Segmentation Metrics
|
||||
|
||||
1. Windowdiff
|
||||
|
||||
Pevzner, L., and Hearst, M., A Critique and Improvement of
|
||||
an Evaluation Metric for Text Segmentation,
|
||||
Computational Linguistics 28, 19-36
|
||||
|
||||
|
||||
2. Generalized Hamming Distance
|
||||
|
||||
Bookstein A., Kulyukin V.A., Raita T.
|
||||
Generalized Hamming Distance
|
||||
Information Retrieval 5, 2002, pp 353-375
|
||||
|
||||
Baseline implementation in C++
|
||||
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
|
||||
|
||||
Study describing benefits of Generalized Hamming Distance Versus
|
||||
WindowDiff for evaluating text segmentation tasks
|
||||
Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
|
||||
TALN 2009
|
||||
|
||||
|
||||
3. Pk text segmentation metric
|
||||
|
||||
Beeferman D., Berger A., Lafferty J. (1999)
|
||||
Statistical Models for Text Segmentation
|
||||
Machine Learning, 34, 177-210
|
||||
"""
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
|
||||
"""
|
||||
Compute the windowdiff score for a pair of segmentations. A
|
||||
segmentation is any sequence over a vocabulary of two items
|
||||
(e.g. "0", "1"), where the specified boundary value is used to
|
||||
mark the edge of a segmentation.
|
||||
|
||||
>>> s1 = "000100000010"
|
||||
>>> s2 = "000010000100"
|
||||
>>> s3 = "100000010000"
|
||||
>>> '%.2f' % windowdiff(s1, s1, 3)
|
||||
'0.00'
|
||||
>>> '%.2f' % windowdiff(s1, s2, 3)
|
||||
'0.30'
|
||||
>>> '%.2f' % windowdiff(s2, s3, 3)
|
||||
'0.80'
|
||||
|
||||
:param seg1: a segmentation
|
||||
:type seg1: str or list
|
||||
:param seg2: a segmentation
|
||||
:type seg2: str or list
|
||||
:param k: window width
|
||||
:type k: int
|
||||
:param boundary: boundary value
|
||||
:type boundary: str or int or bool
|
||||
:param weighted: use the weighted variant of windowdiff
|
||||
:type weighted: boolean
|
||||
:rtype: float
|
||||
"""
|
||||
|
||||
if len(seg1) != len(seg2):
|
||||
raise ValueError("Segmentations have unequal length")
|
||||
if k > len(seg1):
|
||||
raise ValueError(
|
||||
"Window width k should be smaller or equal than segmentation lengths"
|
||||
)
|
||||
wd = 0
|
||||
for i in range(len(seg1) - k + 1):
|
||||
ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
|
||||
if weighted:
|
||||
wd += ndiff
|
||||
else:
|
||||
wd += min(1, ndiff)
|
||||
return wd / (len(seg1) - k + 1.0)
|
||||
|
||||
|
||||
# Generalized Hamming Distance
|
||||
|
||||
|
||||
def _init_mat(nrows, ncols, ins_cost, del_cost):
|
||||
mat = np.empty((nrows, ncols))
|
||||
mat[0, :] = ins_cost * np.arange(ncols)
|
||||
mat[:, 0] = del_cost * np.arange(nrows)
|
||||
return mat
|
||||
|
||||
|
||||
def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
|
||||
for i, rowi in enumerate(rowv):
|
||||
for j, colj in enumerate(colv):
|
||||
shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
|
||||
if rowi == colj:
|
||||
# boundaries are at the same location, no transformation required
|
||||
tcost = mat[i, j]
|
||||
elif rowi > colj:
|
||||
# boundary match through a deletion
|
||||
tcost = del_cost + mat[i, j + 1]
|
||||
else:
|
||||
# boundary match through an insertion
|
||||
tcost = ins_cost + mat[i + 1, j]
|
||||
mat[i + 1, j + 1] = min(tcost, shift_cost)
|
||||
|
||||
|
||||
def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"):
|
||||
"""
|
||||
Compute the Generalized Hamming Distance for a reference and a hypothetical
|
||||
segmentation, corresponding to the cost related to the transformation
|
||||
of the hypothetical segmentation into the reference segmentation
|
||||
through boundary insertion, deletion and shift operations.
|
||||
|
||||
A segmentation is any sequence over a vocabulary of two items
|
||||
(e.g. "0", "1"), where the specified boundary value is used to
|
||||
mark the edge of a segmentation.
|
||||
|
||||
Recommended parameter values are a shift_cost_coeff of 2.
|
||||
Associated with a ins_cost, and del_cost equal to the mean segment
|
||||
length in the reference segmentation.
|
||||
|
||||
>>> # Same examples as Kulyukin C++ implementation
|
||||
>>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
|
||||
0.5
|
||||
>>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
|
||||
2.0
|
||||
>>> ghd('011', '110', 1.0, 1.0, 0.5)
|
||||
1.0
|
||||
>>> ghd('1', '0', 1.0, 1.0, 0.5)
|
||||
1.0
|
||||
>>> ghd('111', '000', 1.0, 1.0, 0.5)
|
||||
3.0
|
||||
>>> ghd('000', '111', 1.0, 2.0, 0.5)
|
||||
6.0
|
||||
|
||||
:param ref: the reference segmentation
|
||||
:type ref: str or list
|
||||
:param hyp: the hypothetical segmentation
|
||||
:type hyp: str or list
|
||||
:param ins_cost: insertion cost
|
||||
:type ins_cost: float
|
||||
:param del_cost: deletion cost
|
||||
:type del_cost: float
|
||||
:param shift_cost_coeff: constant used to compute the cost of a shift.
|
||||
``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
|
||||
are the positions indicating the shift
|
||||
:type shift_cost_coeff: float
|
||||
:param boundary: boundary value
|
||||
:type boundary: str or int or bool
|
||||
:rtype: float
|
||||
"""
|
||||
|
||||
ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
|
||||
hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
|
||||
|
||||
nref_bound = len(ref_idx)
|
||||
nhyp_bound = len(hyp_idx)
|
||||
|
||||
if nref_bound == 0 and nhyp_bound == 0:
|
||||
return 0.0
|
||||
elif nref_bound > 0 and nhyp_bound == 0:
|
||||
return nref_bound * ins_cost
|
||||
elif nref_bound == 0 and nhyp_bound > 0:
|
||||
return nhyp_bound * del_cost
|
||||
|
||||
mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
|
||||
_ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
|
||||
return float(mat[-1, -1])
|
||||
|
||||
|
||||
# Beeferman's Pk text segmentation evaluation metric
|
||||
|
||||
|
||||
def pk(ref, hyp, k=None, boundary="1"):
|
||||
"""
|
||||
Compute the Pk metric for a pair of segmentations A segmentation
|
||||
is any sequence over a vocabulary of two items (e.g. "0", "1"),
|
||||
where the specified boundary value is used to mark the edge of a
|
||||
segmentation.
|
||||
|
||||
>>> '%.2f' % pk('0100'*100, '1'*400, 2)
|
||||
'0.50'
|
||||
>>> '%.2f' % pk('0100'*100, '0'*400, 2)
|
||||
'0.50'
|
||||
>>> '%.2f' % pk('0100'*100, '0100'*100, 2)
|
||||
'0.00'
|
||||
|
||||
:param ref: the reference segmentation
|
||||
:type ref: str or list
|
||||
:param hyp: the segmentation to evaluate
|
||||
:type hyp: str or list
|
||||
:param k: window size, if None, set to half of the average reference segment length
|
||||
:type boundary: str or int or bool
|
||||
:param boundary: boundary value
|
||||
:type boundary: str or int or bool
|
||||
:rtype: float
|
||||
"""
|
||||
|
||||
if k is None:
|
||||
k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
|
||||
|
||||
err = 0
|
||||
for i in range(len(ref) - k + 1):
|
||||
r = ref[i : i + k].count(boundary) > 0
|
||||
h = hyp[i : i + k].count(boundary) > 0
|
||||
if r != h:
|
||||
err += 1
|
||||
return err / (len(ref) - k + 1.0)
|
||||
Reference in New Issue
Block a user