updates
@@ -0,0 +1,92 @@
# Natural Language Toolkit: Clusterers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
This module contains a number of basic clustering algorithms. Clustering
describes the task of discovering groups of similar items within a large
collection. It is also described as unsupervised machine learning, as the data
from which it learns is unannotated with class information, unlike the data
used for supervised learning. Annotated data is difficult and expensive to
obtain in the quantities required for the majority of supervised learning
algorithms. This problem, the knowledge acquisition bottleneck, is common to
most natural language processing tasks, thus fueling the need for quality
unsupervised approaches.

This module contains a k-means clusterer, an E-M clusterer and a group average
agglomerative clusterer (GAAC). All these clusterers involve finding good
cluster groupings for a set of vectors in multi-dimensional space.

The K-means clusterer starts with k arbitrarily chosen means then allocates
each vector to the cluster with the closest mean. It then recalculates the
means of each cluster as the centroid of the vectors in the cluster. This
process repeats until the cluster memberships stabilise. This is a
hill-climbing algorithm which may converge to a local maximum. Hence the
clustering is often repeated with random initial means and the most commonly
occurring output means are chosen.
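
A minimal sketch of such a repeated run (mirroring the ``demo()`` function in
``nltk.cluster.kmeans``; the sample vectors are illustrative only)::

    from nltk.cluster import KMeansClusterer, euclidean_distance
    from numpy import array

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print(clusterer.means())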

The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
It then iteratively merges pairs of clusters which have the closest centroids.
This continues until there is only one cluster. The order of merges gives rise
to a dendrogram - a tree with the earlier merges lower than later merges. The
membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
cutting the dendrogram at depth *c*.
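
For illustration (a sketch based on ``nltk.cluster.gaac.demo()``; the data is
arbitrary), the dendrogram built during clustering can be retrieved and
displayed afterwards::

    from nltk.cluster import GAAClusterer
    from numpy import array

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)
    clusterer.dendrogram().show()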

The Gaussian EM clusterer models the vectors as being produced by a mixture
of k Gaussian sources. The parameters of these sources (prior probability,
mean and covariance matrix) are then found to maximise the likelihood of the
given data. This is done with the expectation maximisation algorithm. It
starts with k arbitrarily chosen means, priors and covariance matrices. It
then calculates the membership probabilities for each vector in each of the
clusters - this is the 'E' step. The cluster parameters are then updated in
the 'M' step using the maximum likelihood estimate from the cluster membership
probabilities. This process continues until the likelihood of the data does
not significantly increase.
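
A corresponding sketch for the EM clusterer (following
``nltk.cluster.em.demo()``; the initial means below are arbitrary starting
values, not recommendations)::

    from nltk import cluster
    from numpy import array

    vectors = [array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    clusterer = cluster.EMClusterer([[4, 2], [4, 2.01]], bias=0.1)
    clusters = clusterer.cluster(vectors, True)
    print(clusterer.classify(array([2, 2])))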

They all extend the ClusterI interface which defines common operations
available with each clusterer. These operations include:

- cluster: clusters a sequence of vectors
- classify: assign a vector to a cluster
- classification_probdist: give the probability distribution over cluster memberships

The existing clusterers also extend cluster.VectorSpaceClusterer, an
abstract class which allows for singular value decomposition (SVD) and vector
normalisation. SVD is used to reduce the dimensionality of the vector space in
such a manner as to preserve as much of the variation as possible, by
reparameterising the axes in order of variability and discarding all bar the
first d dimensions. Normalisation ensures that vectors lie on the unit
hypersphere.
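
For example (a sketch only; both keyword arguments are accepted by the
vector-space clusterers in this package), normalisation and SVD reduction are
requested when constructing a clusterer::

    from nltk.cluster import KMeansClusterer, euclidean_distance

    clusterer = KMeansClusterer(
        2, euclidean_distance, normalise=True, svd_dimensions=2
    )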

Usage example (see also demo())::

    from nltk import cluster
    from nltk.cluster import euclidean_distance
    from numpy import array

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]

    # initialise the clusterer (will also assign the vectors to clusters)
    clusterer = cluster.KMeansClusterer(2, euclidean_distance)
    clusterer.cluster(vectors, True)

    # classify a new vector
    print(clusterer.classify(array([3, 3])))

Note that the vectors must use numpy array-like
objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
efficiency when required.
"""

from nltk.cluster.em import EMClusterer
from nltk.cluster.gaac import GAAClusterer
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import (
    Dendrogram,
    VectorSpaceClusterer,
    cosine_distance,
    euclidean_distance,
)
@@ -0,0 +1,74 @@
# Natural Language Toolkit: Clusterer Interfaces
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from abc import ABCMeta, abstractmethod

from nltk.probability import DictionaryProbDist


class ClusterI(metaclass=ABCMeta):
    """
    Interface covering basic clustering functionality.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.
        """
        if self.classify(vector) == label:
            return 1.0
        else:
            return 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.
        """
        likelihoods = {}
        sum = 0.0
        for cluster in self.cluster_names():
            likelihoods[cluster] = self.likelihood(vector, cluster)
            sum += likelihoods[cluster]
        for cluster in self.cluster_names():
            likelihoods[cluster] /= sum
        return DictionaryProbDist(likelihoods)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the name of the cluster at index.
        """
        return index
Backend/venv/lib/python3.12/site-packages/nltk/cluster/em.py (new file, 219 lines)
@@ -0,0 +1,219 @@
# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.util import VectorSpaceClusterer


class EMClusterer(VectorSpaceClusterer):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        :param initial_means: the means of the gaussian cluster centers
        :type initial_means: [seq of] numpy array or seq of SparseArray
        :param priors: the prior probability for each cluster
        :type priors: numpy array or seq of float
        :param covariance_matrices: the covariance matrix for each cluster
        :type covariance_matrices: [seq of] numpy array
        :param conv_threshold: maximum change in likelihood before deemed
            convergent
        :type conv_threshold: int or float
        :param bias: variance bias used to ensure non-singular covariance
            matrices
        :type bias: float
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if not priors:
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if not covariances:
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration; loglikelihood", lastl)
            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                covariance_before = covariances[j]
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cluster] * self._gaussian(
            self._means[cluster], self._covariance_matrices[cluster], vector
        )

    def _gaussian(self, mean, cvm, x):
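        # (Added comment) Multivariate normal density evaluated at x:
        #     N(x; mean, cvm) = (2*pi)**(-m/2) * det(cvm)**(-1/2)
        #                       * exp(-0.5 * (x - mean)' * inv(cvm) * (x - mean))
        # computed below from the determinant and inverse of the covariance
        # matrix ``cvm``.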
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)


def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print("Clustered:", vectors)
    print("As: ", clusters)
    print()

    for c in range(2):
        print("Cluster:", c)
        print("Prior: ", clusterer._priors[c])
        print("Mean: ", clusterer._means[c])
        print("Covar: ", clusterer._covariance_matrices[c])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print("classification_probdist(%s):" % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")


if __name__ == "__main__":
    demo()
Backend/venv/lib/python3.12/site-packages/nltk/cluster/gaac.py (new file, 170 lines)
@@ -0,0 +1,170 @@
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance


class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative Clusterer starts with each of the N vectors
    as singleton clusters. It then iteratively merges pairs of clusters which
    have the closest centroids. This continues until there is only one cluster.
    The order of merges gives rise to a dendrogram: a tree with the earlier
    merges lower than later merges. The membership of a given number of
    clusters c, 1 <= c <= N, can be found by cutting the dendrogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendrogram = None
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors]
        )
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N
        cluster_count = N
        index_map = numpy.arange(N)

        # construct the similarity matrix
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)

    def _merge_similarities(self, dist, cluster_len, i, j):
        # the new cluster i merged from i and j adopts the average of
        # i and j's similarity to each other cluster, weighted by the
        # number of points in the clusters i and j
        i_weight = cluster_len[i]
        j_weight = cluster_len[j]
        weight_sum = i_weight + j_weight

        # update for x<i
        dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
        dist[:i, i] /= weight_sum
        # update for i<x<j
        dist[i, i + 1 : j] = (
            dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
        )
        # update for i<j<x
        dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
        dist[i, i + 1 :] /= weight_sum

    def update_clusters(self, num_clusters):
        clusters = self._dendrogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            if self._should_normalise:
                centroid = self._normalise(cluster[0])
            else:
                centroid = numpy.array(cluster[0])
            for vector in cluster[1:]:
                if self._should_normalise:
                    centroid += self._normalise(vector)
                else:
                    centroid += vector
            centroid /= len(cluster)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def classify_vectorspace(self, vector):
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if not best or dist < best[0]:
                best = (dist, i)
        return best[1]

    def dendrogram(self):
        """
        :return: The dendrogram representing the current clustering
        :rtype: Dendrogram
        """
        return self._dendrogram

    def num_clusters(self):
        return self._num_clusters

    def __repr__(self):
        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters


def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk.cluster import GAAClusterer

    # use a set of tokens with 2D indices
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test the GAAC clusterer with 4 clusters
    clusterer = GAAClusterer(4)
    clusters = clusterer.cluster(vectors, True)

    print("Clusterer:", clusterer)
    print("Clustered:", vectors)
    print("As:", clusters)
    print()

    # show the dendrogram
    clusterer.dendrogram().show()

    # classify a new vector
    vector = numpy.array([3, 3])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))
    print()


if __name__ == "__main__":
    demo()
Backend/venv/lib/python3.12/site-packages/nltk/cluster/kmeans.py (new file, 230 lines)
@@ -0,0 +1,230 @@
# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import copy
import random
import sys

try:
    import numpy
except ImportError:
    pass


from nltk.cluster.util import VectorSpaceClusterer


class KMeansClusterer(VectorSpaceClusterer):
    """
    The K-means clusterer starts with k arbitrarily chosen means then allocates
    each vector to the cluster with the closest mean. It then recalculates the
    means of each cluster as the centroid of the vectors in the cluster. This
    process repeats until the cluster memberships stabilise. This is a
    hill-climbing algorithm which may converge to a local maximum. Hence the
    clustering is often repeated with random initial means and the most
    commonly occurring output means are chosen.
    """

    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):
        """
        :param num_means: the number of means to use (may use fewer)
        :type num_means: int
        :param distance: measure of distance between two vectors
        :type distance: function taking two vectors and returning a float
        :param repeats: number of randomised clustering trials to use
        :type repeats: int
        :param conv_test: maximum variation in mean differences before
            deemed convergent
        :type conv_test: number
        :param initial_means: set of k initial means
        :type initial_means: sequence of vectors
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        :param rng: random number generator (or None)
        :type rng: Random
        :param avoid_empty_clusters: include current centroid in computation
            of next one; avoids undefined behavior
            when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        if self._means and self._repeats > 1:
            print("Warning: means will be discarded for subsequent trials")

        meanss = []
        for trial in range(self._repeats):
            if trace:
                print("k-means trial", trial)
            if not self._means or trial > 1:
                self._means = self._rng.sample(list(vectors), self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
            for means in meanss:
                means.sort(key=sum)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means

    def _cluster_vectorspace(self, vectors, trace=False):
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace:
                    print("iteration")
                # for i in range(self._num_means):
                #     print ' mean', i, 'allocated', len(clusters[i]), 'vectors'

                # recalculate cluster means by computing the centroid of each cluster
                new_means = list(map(self._centroid, clusters, self._means))

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means

    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster, mean):
        if self._avoid_empty_clusters:
            centroid = copy.copy(mean)
            for vector in cluster:
                centroid += vector
            return centroid / (1 + len(cluster))
        else:
            if not len(cluster):
                sys.stderr.write("Error: no centroid defined for empty cluster.\n")
                sys.stderr.write(
                    "Try setting argument 'avoid_empty_clusters' to True\n"
                )
                assert False
            centroid = copy.copy(cluster[0])
            for vector in cluster[1:]:
                centroid += vector
            return centroid / len(cluster)

    def __repr__(self):
        return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)


#################################################################################


def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print("Clustered:", vectors)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print("Clustered:", vectors)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))
    print()


if __name__ == "__main__":
    demo()
Backend/venv/lib/python3.12/site-packages/nltk/cluster/util.py (new file, 300 lines)
@@ -0,0 +1,300 @@
# Natural Language Toolkit: Clusterer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
from abc import abstractmethod
from math import sqrt
from sys import stdout

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.api import ClusterI


class VectorSpaceClusterer(ClusterI):
    """
    Abstract clusterer which takes tokens and maps them into a vector space.
    Optionally performs singular value decomposition to reduce the
    dimensionality.
    """

    def __init__(self, normalise=False, svd_dimensions=None):
        """
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        self._Tt = None
        self._should_normalise = normalise
        self._svd_dimensions = svd_dimensions

    def cluster(self, vectors, assign_clusters=False, trace=False):
        assert len(vectors) > 0

        # normalise the vectors
        if self._should_normalise:
            vectors = list(map(self._normalise, vectors))

        # use SVD to reduce the dimensionality
        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
            S = d[: self._svd_dimensions] * numpy.identity(
                self._svd_dimensions, numpy.float64
            )
            T = u[:, : self._svd_dimensions]
            Dt = vt[: self._svd_dimensions, :]
            vectors = numpy.transpose(numpy.dot(S, Dt))
            self._Tt = numpy.transpose(T)
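            # (Added comment) With X the matrix whose columns are the input
            # vectors, the truncated SVD gives X ~= T * S * Dt.  The rows of
            # transpose(S * Dt) are the reduced-dimensionality vectors used
            # for clustering, and self._Tt projects any new vector into the
            # same reduced space at classification time (see classify()).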

        # call abstract method to cluster the vectors
        self.cluster_vectorspace(vectors, trace)

        # assign the vectors to clusters
        if assign_clusters:
            return [self.classify(vector) for vector in vectors]

    @abstractmethod
    def cluster_vectorspace(self, vectors, trace):
        """
        Finds the clusters using the given set of vectors.
        """

    def classify(self, vector):
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        cluster = self.classify_vectorspace(vector)
        return self.cluster_name(cluster)

    @abstractmethod
    def classify_vectorspace(self, vector):
        """
        Returns the index of the appropriate cluster for the vector.
        """

    def likelihood(self, vector, label):
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return self.likelihood_vectorspace(vector, label)

    def likelihood_vectorspace(self, vector, cluster):
        """
        Returns the likelihood of the vector belonging to the cluster.
        """
        predicted = self.classify_vectorspace(vector)
        return 1.0 if cluster == predicted else 0.0

    def vector(self, vector):
        """
        Returns the vector after normalisation and dimensionality reduction
        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return vector

    def _normalise(self, vector):
        """
        Normalises the vector to unit length.
        """
        return vector / sqrt(numpy.dot(vector, vector))


def euclidean_distance(u, v):
    """
    Returns the euclidean distance between vectors u and v. This is equivalent
    to the length of the vector (u - v).
    """
    diff = u - v
    return sqrt(numpy.dot(diff, diff))


def cosine_distance(u, v):
    """
    Returns 1 minus the cosine of the angle between vectors v and u. This is
    equal to ``1 - (u.v / |u||v|)``.
    """
    return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
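
# (Added note) Worked example of the two metrics above: for u = array([1.0, 0.0])
# and v = array([0.0, 1.0]), euclidean_distance(u, v) == sqrt(2) and
# cosine_distance(u, v) == 1.0, since orthogonal vectors have zero cosine
# similarity.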


class _DendrogramNode:
    """Tree node of a dendrogram."""

    def __init__(self, value, *children):
        self._value = value
        self._children = children

    def leaves(self, values=True):
        if self._children:
            leaves = []
            for child in self._children:
                leaves.extend(child.leaves(values))
            return leaves
        elif values:
            return [self._value]
        else:
            return [self]

    def groups(self, n):
        queue = [(self._value, self)]

        while len(queue) < n:
            priority, node = queue.pop()
            if not node._children:
                # a leaf cannot be split further; put it back and stop
                queue.append((priority, node))
                break
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
                else:
                    queue.append((0, child))
            # makes the earliest merges at the start, latest at the end
            queue.sort()

        groups = []
        for priority, node in queue:
            groups.append(node.leaves())
        return groups

    def __lt__(self, comparator):
        return cosine_distance(self._value, comparator._value) < 0


class Dendrogram:
    """
    Represents a dendrogram, a tree with a specified branching order. This
    must be initialised with the leaf items, then iteratively call merge for
    each branch. This class constructs a tree representing the order of calls
    to the merge function.
    """

    def __init__(self, items=[]):
        """
        :param items: the items at the leaves of the dendrogram
        :type items: sequence of (any)
        """
        self._items = [_DendrogramNode(item) for item in items]
        self._original_items = copy.copy(self._items)
        self._merge = 1

    def merge(self, *indices):
        """
        Merges nodes at given indices in the dendrogram. The nodes will be
        combined which then replaces the first node specified. All other nodes
        involved in the merge will be removed.

        :param indices: indices of the items to merge (at least two)
        :type indices: seq of int
        """
        assert len(indices) >= 2
        node = _DendrogramNode(self._merge, *(self._items[i] for i in indices))
        self._merge += 1
        self._items[indices[0]] = node
        for i in indices[1:]:
            del self._items[i]

    def groups(self, n):
        """
        Finds the n-groups of items (leaves) reachable from a cut at depth n.
        :param n: number of groups
        :type n: int
        """
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        return root.groups(n)

    def show(self, leaf_labels=[]):
        """
        Print the dendrogram in ASCII art to standard out.

        :param leaf_labels: an optional list of strings to use for labeling the
            leaves
        :type leaf_labels: list
        """

        # ASCII rendering characters
        JOIN, HLINK, VLINK = "+", "-", "|"

        # find the root (or create one)
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = self._original_items

        if leaf_labels:
            last_row = leaf_labels
        else:
            last_row = ["%s" % leaf._value for leaf in leaves]

        # find the bottom row and the best cell width
        width = max(map(len, last_row)) + 1
        lhalf = width // 2
        rhalf = int(width - lhalf - 1)

        # display functions
        def format(centre, left=" ", right=" "):
            return f"{lhalf * left}{centre}{right * rhalf}"

        def display(str):
            stdout.write(str)

        # for each merge, top down
        queue = [(root._value, root)]
        verticals = [format(" ") for leaf in leaves]
        while queue:
            priority, node = queue.pop()
            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
            indices = list(map(leaves.index, child_left_leaf))
            if child_left_leaf:
                min_idx = min(indices)
                max_idx = max(indices)
                for i in range(len(leaves)):
                    if leaves[i] in child_left_leaf:
                        if i == min_idx:
                            display(format(JOIN, " ", HLINK))
                        elif i == max_idx:
                            display(format(JOIN, HLINK, " "))
                        else:
                            display(format(JOIN, HLINK, HLINK))
                        verticals[i] = format(VLINK)
                    elif min_idx <= i <= max_idx:
                        display(format(HLINK, HLINK, HLINK))
                    else:
                        display(verticals[i])
                display("\n")
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
            queue.sort()

            for vertical in verticals:
                display(vertical)
            display("\n")

        # finally, display the last line
        display("".join(item.center(width) for item in last_row))
        display("\n")

    def __repr__(self):
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = root.leaves(False)
        return "<Dendrogram with %d leaves>" % len(leaves)