Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1,92 @@
# Natural Language Toolkit: Clusterers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This module contains a number of basic clustering algorithms. Clustering
describes the task of discovering groups of similar items within a large
collection. It is also described as unsupervised machine learning, as the data
from which it learns is unannotated with class information, unlike the data
used for supervised learning. Annotated data is difficult and expensive to
obtain in the quantities required for the majority of supervised learning
algorithms. This problem, the knowledge acquisition bottleneck, is common to
most natural language processing tasks, thus fueling the need for quality
unsupervised approaches.
This module contains a k-means clusterer, E-M clusterer and a group average
agglomerative clusterer (GAAC). All these clusterers involve finding good
cluster groupings for a set of vectors in multi-dimensional space.
The K-means clusterer starts with k arbitrarily chosen means, then allocates each
vector to the cluster with the closest mean. It then recalculates the means of
each cluster as the centroid of the vectors in the cluster. This process
repeats until the cluster memberships stabilise. This is a hill-climbing
algorithm which may converge to a local maximum. Hence the clustering is
often repeated with random initial means and the most commonly occurring
output means are chosen.
The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
It then iteratively merges pairs of clusters which have the closest centroids.
This continues until there is only one cluster. The order of merges gives rise
to a dendrogram - a tree with the earlier merges lower than later merges. The
membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
cutting the dendrogram at depth *c*.
The Gaussian EM clusterer models the vectors as being produced by a mixture
of k Gaussian sources. The parameters of these sources (prior probability,
mean and covariance matrix) are then found to maximise the likelihood of the
given data. This is done with the expectation maximisation algorithm. It
starts with k arbitrarily chosen means, priors and covariance matrices. It
then calculates the membership probabilities for each vector in each of the
clusters - this is the 'E' step. The cluster parameters are then updated in
the 'M' step using the maximum likelihood estimate from the cluster membership
probabilities. This process continues until the likelihood of the data does
not significantly increase.
They all extend the ClusterI interface which defines common operations
available with each clusterer. These operations include:
- cluster: clusters a sequence of vectors
- classify: assign a vector to a cluster
- classification_probdist: give the probability distribution over cluster memberships
The existing clusterers also extend VectorSpaceClusterer, an abstract class
which allows for singular value decomposition (SVD) and vector
normalisation. SVD is used to reduce the dimensionality of the vector space in
such a manner as to preserve as much of the variation as possible, by
reparameterising the axes in order of variability and discarding all bar the
first d dimensions. Normalisation ensures that vectors fall in the unit
hypersphere.
Usage example (see also demo())::
from nltk import cluster
from nltk.cluster import euclidean_distance
from numpy import array
vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
# initialise the clusterer (will also assign the vectors to clusters)
clusterer = cluster.KMeansClusterer(2, euclidean_distance)
clusterer.cluster(vectors, True)
# classify a new vector
print(clusterer.classify(array([3, 3])))
Note that the vectors must be numpy array-like
objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
efficiency when required.
"""
from nltk.cluster.em import EMClusterer
from nltk.cluster.gaac import GAAClusterer
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import (
Dendrogram,
VectorSpaceClusterer,
cosine_distance,
euclidean_distance,
)
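
# Illustrative usage sketch (mirrors the docstring example above; assumes numpy
# is installed and only runs when this module is executed directly).
if __name__ == "__main__":
    from numpy import array

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
    # initialise the clusterer and assign each vector to a cluster
    clusterer = KMeansClusterer(2, euclidean_distance)
    print("assignments:", clusterer.cluster(vectors, True))
    # classify a previously unseen vector
    print("classify([3, 3]):", clusterer.classify(array([3, 3])))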


@@ -0,0 +1,74 @@
# Natural Language Toolkit: Clusterer Interfaces
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from nltk.probability import DictionaryProbDist
class ClusterI(metaclass=ABCMeta):
"""
Interface covering basic clustering functionality.
"""
@abstractmethod
def cluster(self, vectors, assign_clusters=False):
"""
Assigns the vectors to clusters, learning the clustering parameters
from the data. Returns a cluster identifier for each vector.
"""
@abstractmethod
def classify(self, token):
"""
        Classifies the token into a cluster, returning that cluster's
        identifier.
"""
def likelihood(self, vector, label):
"""
        Returns the likelihood (a float) of the token belonging to the
        given cluster.
"""
if self.classify(vector) == label:
return 1.0
else:
return 0.0
def classification_probdist(self, vector):
"""
Classifies the token into a cluster, returning
a probability distribution over the cluster identifiers.
"""
        likelihoods = {}
        total = 0.0
        for cluster in self.cluster_names():
            likelihoods[cluster] = self.likelihood(vector, cluster)
            total += likelihoods[cluster]
        for cluster in self.cluster_names():
            likelihoods[cluster] /= total
return DictionaryProbDist(likelihoods)
@abstractmethod
def num_clusters(self):
"""
Returns the number of clusters.
"""
def cluster_names(self):
"""
Returns the names of the clusters.
:rtype: list
"""
return list(range(self.num_clusters()))
def cluster_name(self, index):
"""
        Returns the name of the cluster at the given index.
"""
return index
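
# A minimal sketch of a concrete ClusterI implementation (hypothetical and for
# illustration only, not part of the NLTK API): it keeps one exemplar per
# cluster and classifies a vector to the cluster whose exemplar is nearest in
# squared distance.
class _ExemplarClusterer(ClusterI):
    def cluster(self, vectors, assign_clusters=False):
        # treat each input vector as the exemplar of its own cluster
        self._exemplars = list(vectors)
        if assign_clusters:
            return [self.classify(v) for v in vectors]

    def classify(self, token):
        distances = [
            sum((t - e) ** 2 for t, e in zip(token, exemplar))
            for exemplar in self._exemplars
        ]
        return distances.index(min(distances))

    def num_clusters(self):
        return len(self._exemplars)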


@@ -0,0 +1,219 @@
# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import VectorSpaceClusterer
class EMClusterer(VectorSpaceClusterer):
"""
The Gaussian EM clusterer models the vectors as being produced by
a mixture of k Gaussian sources. The parameters of these sources
(prior probability, mean and covariance matrix) are then found to
maximise the likelihood of the given data. This is done with the
expectation maximisation algorithm. It starts with k arbitrarily
chosen means, priors and covariance matrices. It then calculates
the membership probabilities for each vector in each of the
clusters; this is the 'E' step. The cluster parameters are then
updated in the 'M' step using the maximum likelihood estimate from
the cluster membership probabilities. This process continues until
the likelihood of the data does not significantly increase.
"""
def __init__(
self,
initial_means,
priors=None,
covariance_matrices=None,
conv_threshold=1e-6,
bias=0.1,
normalise=False,
svd_dimensions=None,
):
"""
Creates an EM clusterer with the given starting parameters,
convergence threshold and vector mangling parameters.
:param initial_means: the means of the gaussian cluster centers
:type initial_means: [seq of] numpy array or seq of SparseArray
:param priors: the prior probability for each cluster
:type priors: numpy array or seq of float
:param covariance_matrices: the covariance matrix for each cluster
:type covariance_matrices: [seq of] numpy array
:param conv_threshold: maximum change in likelihood before deemed
convergent
:type conv_threshold: int or float
:param bias: variance bias used to ensure non-singular covariance
matrices
:type bias: float
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
:param svd_dimensions: number of dimensions to use in reducing vector
               dimensionality with SVD
:type svd_dimensions: int
"""
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._means = numpy.array(initial_means, numpy.float64)
self._num_clusters = len(initial_means)
self._conv_threshold = conv_threshold
self._covariance_matrices = covariance_matrices
self._priors = priors
self._bias = bias
def num_clusters(self):
return self._num_clusters
def cluster_vectorspace(self, vectors, trace=False):
assert len(vectors) > 0
# set the parameters to initial values
dimensions = len(vectors[0])
means = self._means
priors = self._priors
if not priors:
priors = self._priors = (
numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
)
covariances = self._covariance_matrices
if not covariances:
covariances = self._covariance_matrices = [
numpy.identity(dimensions, numpy.float64)
for i in range(self._num_clusters)
]
# do the E and M steps until the likelihood plateaus
lastl = self._loglikelihood(vectors, priors, means, covariances)
converged = False
while not converged:
if trace:
print("iteration; loglikelihood", lastl)
# E-step, calculate hidden variables, h[i,j]
h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
for i in range(len(vectors)):
for j in range(self._num_clusters):
h[i, j] = priors[j] * self._gaussian(
means[j], covariances[j], vectors[i]
)
h[i, :] /= sum(h[i, :])
# M-step, update parameters - cvm, p, mean
for j in range(self._num_clusters):
covariance_before = covariances[j]
new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
new_mean = numpy.zeros(dimensions, numpy.float64)
sum_hj = 0.0
for i in range(len(vectors)):
delta = vectors[i] - means[j]
new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
sum_hj += h[i, j]
new_mean += h[i, j] * vectors[i]
covariances[j] = new_covariance / sum_hj
means[j] = new_mean / sum_hj
priors[j] = sum_hj / len(vectors)
# bias term to stop covariance matrix being singular
covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)
# calculate likelihood - FIXME: may be broken
l = self._loglikelihood(vectors, priors, means, covariances)
# check for convergence
if abs(lastl - l) < self._conv_threshold:
converged = True
lastl = l
def classify_vectorspace(self, vector):
best = None
for j in range(self._num_clusters):
p = self._priors[j] * self._gaussian(
self._means[j], self._covariance_matrices[j], vector
)
if not best or p > best[0]:
best = (p, j)
return best[1]
    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )
def _gaussian(self, mean, cvm, x):
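        """
        Evaluates the multivariate Gaussian density at ``x`` for the given
        ``mean`` and covariance matrix ``cvm``::

            p(x) = (2*pi)**(-m/2) * det(cvm)**(-1/2)
                   * exp(-0.5 * (x - mean)' * inv(cvm) * (x - mean))
        """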
m = len(mean)
assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
try:
det = numpy.linalg.det(cvm)
inv = numpy.linalg.inv(cvm)
a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
dx = x - mean
b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
return a * numpy.exp(b)
except OverflowError:
# happens when the exponent is negative infinity - i.e. b = 0
# i.e. the inverse of cvm is huge (cvm is almost zero)
return 0
def _loglikelihood(self, vectors, priors, means, covariances):
llh = 0.0
for vector in vectors:
p = 0
for j in range(len(priors)):
p += priors[j] * self._gaussian(means[j], covariances[j], vector)
llh += numpy.log(p)
return llh
def __repr__(self):
return "<EMClusterer means=%s>" % list(self._means)
def demo():
"""
Non-interactive demonstration of the clusterers with simple 2-D data.
"""
from nltk import cluster
# example from figure 14.10, page 519, Manning and Schutze
vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
means = [[4, 2], [4, 2.01]]
clusterer = cluster.EMClusterer(means, bias=0.1)
clusters = clusterer.cluster(vectors, True, trace=True)
print("Clustered:", vectors)
print("As: ", clusters)
print()
for c in range(2):
print("Cluster:", c)
print("Prior: ", clusterer._priors[c])
print("Mean: ", clusterer._means[c])
print("Covar: ", clusterer._covariance_matrices[c])
print()
# classify a new vector
vector = numpy.array([2, 2])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
# show the classification probabilities
vector = numpy.array([2, 2])
print("classification_probdist(%s):" % vector)
pdist = clusterer.classification_probdist(vector)
for sample in pdist.samples():
print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")
if __name__ == "__main__":
demo()


@@ -0,0 +1,170 @@
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance
class GAAClusterer(VectorSpaceClusterer):
"""
    The Group Average Agglomerative clusterer (GAAC) starts with each of the
    N vectors as singleton
clusters. It then iteratively merges pairs of clusters which have the
closest centroids. This continues until there is only one cluster. The
order of merges gives rise to a dendrogram: a tree with the earlier merges
lower than later merges. The membership of a given number of clusters c, 1
<= c <= N, can be found by cutting the dendrogram at depth c.
This clusterer uses the cosine similarity metric only, which allows for
efficient speed-up in the clustering process.
"""
def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._num_clusters = num_clusters
self._dendrogram = None
self._groups_values = None
def cluster(self, vectors, assign_clusters=False, trace=False):
# stores the merge order
self._dendrogram = Dendrogram(
[numpy.array(vector, numpy.float64) for vector in vectors]
)
return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)
def cluster_vectorspace(self, vectors, trace=False):
# variables describing the initial situation
N = len(vectors)
cluster_len = [1] * N
cluster_count = N
index_map = numpy.arange(N)
# construct the similarity matrix
dims = (N, N)
dist = numpy.ones(dims, dtype=float) * numpy.inf
for i in range(N):
for j in range(i + 1, N):
dist[i, j] = cosine_distance(vectors[i], vectors[j])
while cluster_count > max(self._num_clusters, 1):
i, j = numpy.unravel_index(dist.argmin(), dims)
if trace:
print("merging %d and %d" % (i, j))
# update similarities for merging i and j
self._merge_similarities(dist, cluster_len, i, j)
# remove j
dist[:, j] = numpy.inf
dist[j, :] = numpy.inf
# merge the clusters
cluster_len[i] = cluster_len[i] + cluster_len[j]
self._dendrogram.merge(index_map[i], index_map[j])
cluster_count -= 1
# update the index map to reflect the indexes if we
# had removed j
index_map[j + 1 :] -= 1
index_map[j] = N
self.update_clusters(self._num_clusters)
def _merge_similarities(self, dist, cluster_len, i, j):
# the new cluster i merged from i and j adopts the average of
# i and j's similarity to each other cluster, weighted by the
# number of points in the clusters i and j
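        # i.e. for every other cluster x:
        #     d(i+j, x) = (|i| * d(i, x) + |j| * d(j, x)) / (|i| + |j|)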
i_weight = cluster_len[i]
j_weight = cluster_len[j]
weight_sum = i_weight + j_weight
# update for x<i
dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
dist[:i, i] /= weight_sum
# update for i<x<j
dist[i, i + 1 : j] = (
dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
)
# update for i<j<x
dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
dist[i, i + 1 :] /= weight_sum
def update_clusters(self, num_clusters):
clusters = self._dendrogram.groups(num_clusters)
self._centroids = []
for cluster in clusters:
assert len(cluster) > 0
if self._should_normalise:
centroid = self._normalise(cluster[0])
else:
centroid = numpy.array(cluster[0])
for vector in cluster[1:]:
if self._should_normalise:
centroid += self._normalise(vector)
else:
centroid += vector
centroid /= len(cluster)
self._centroids.append(centroid)
self._num_clusters = len(self._centroids)
def classify_vectorspace(self, vector):
best = None
for i in range(self._num_clusters):
centroid = self._centroids[i]
dist = cosine_distance(vector, centroid)
if not best or dist < best[0]:
best = (dist, i)
return best[1]
def dendrogram(self):
"""
:return: The dendrogram representing the current clustering
:rtype: Dendrogram
"""
return self._dendrogram
def num_clusters(self):
return self._num_clusters
def __repr__(self):
return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
def demo():
"""
Non-interactive demonstration of the clusterers with simple 2-D data.
"""
from nltk.cluster import GAAClusterer
# use a set of tokens with 2D indices
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
# test the GAAC clusterer with 4 clusters
clusterer = GAAClusterer(4)
clusters = clusterer.cluster(vectors, True)
print("Clusterer:", clusterer)
print("Clustered:", vectors)
print("As:", clusters)
print()
# show the dendrogram
clusterer.dendrogram().show()
# classify a new vector
vector = numpy.array([3, 3])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
print()
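
def demo_dendrogram_groups():
    """
    Illustrative sketch, not exercised by demo() above: cluster once, then
    read off different numbers of groups from the dendrogram without
    re-clustering.
    """
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    clusterer = GAAClusterer(1)
    clusterer.cluster(vectors)
    dendrogram = clusterer.dendrogram()
    for c in (2, 3, 4):
        print("%d groups:" % c, dendrogram.groups(c))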
if __name__ == "__main__":
demo()


@@ -0,0 +1,230 @@
# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
import random
import sys
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import VectorSpaceClusterer
class KMeansClusterer(VectorSpaceClusterer):
"""
    The K-means clusterer starts with k arbitrarily chosen means, then allocates
each vector to the cluster with the closest mean. It then recalculates the
means of each cluster as the centroid of the vectors in the cluster. This
process repeats until the cluster memberships stabilise. This is a
hill-climbing algorithm which may converge to a local maximum. Hence the
clustering is often repeated with random initial means and the most
commonly occurring output means are chosen.
"""
def __init__(
self,
num_means,
distance,
repeats=1,
conv_test=1e-6,
initial_means=None,
normalise=False,
svd_dimensions=None,
rng=None,
avoid_empty_clusters=False,
):
"""
:param num_means: the number of means to use (may use fewer)
:type num_means: int
:param distance: measure of distance between two vectors
:type distance: function taking two vectors and returning a float
:param repeats: number of randomised clustering trials to use
:type repeats: int
:param conv_test: maximum variation in mean differences before
deemed convergent
:type conv_test: number
:param initial_means: set of k initial means
:type initial_means: sequence of vectors
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
:param svd_dimensions: number of dimensions to use in reducing vector
               dimensionality with SVD
:type svd_dimensions: int
:param rng: random number generator (or None)
:type rng: Random
:param avoid_empty_clusters: include current centroid in computation
of next one; avoids undefined behavior
when clusters become empty
:type avoid_empty_clusters: boolean
"""
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._num_means = num_means
self._distance = distance
self._max_difference = conv_test
assert not initial_means or len(initial_means) == num_means
self._means = initial_means
assert repeats >= 1
assert not (initial_means and repeats > 1)
self._repeats = repeats
self._rng = rng if rng else random.Random()
self._avoid_empty_clusters = avoid_empty_clusters
def cluster_vectorspace(self, vectors, trace=False):
if self._means and self._repeats > 1:
print("Warning: means will be discarded for subsequent trials")
meanss = []
for trial in range(self._repeats):
if trace:
print("k-means trial", trial)
            if not self._means or trial > 0:
self._means = self._rng.sample(list(vectors), self._num_means)
self._cluster_vectorspace(vectors, trace)
meanss.append(self._means)
if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
for means in meanss:
means.sort(key=sum)
# find the set of means that's minimally different from the others
min_difference = min_means = None
for i in range(len(meanss)):
d = 0
for j in range(len(meanss)):
if i != j:
d += self._sum_distances(meanss[i], meanss[j])
if min_difference is None or d < min_difference:
min_difference, min_means = d, meanss[i]
# use the best means
self._means = min_means
def _cluster_vectorspace(self, vectors, trace=False):
if self._num_means < len(vectors):
# perform k-means clustering
converged = False
while not converged:
# assign the tokens to clusters based on minimum distance to
# the cluster means
clusters = [[] for m in range(self._num_means)]
for vector in vectors:
index = self.classify_vectorspace(vector)
clusters[index].append(vector)
if trace:
print("iteration")
# for i in range(self._num_means):
# print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
# recalculate cluster means by computing the centroid of each cluster
new_means = list(map(self._centroid, clusters, self._means))
# measure the degree of change from the previous step for convergence
difference = self._sum_distances(self._means, new_means)
if difference < self._max_difference:
converged = True
# remember the new means
self._means = new_means
def classify_vectorspace(self, vector):
# finds the closest cluster centroid
# returns that cluster's index
best_distance = best_index = None
for index in range(len(self._means)):
mean = self._means[index]
dist = self._distance(vector, mean)
if best_distance is None or dist < best_distance:
best_index, best_distance = index, dist
return best_index
def num_clusters(self):
if self._means:
return len(self._means)
else:
return self._num_means
def means(self):
"""
The means used for clustering.
"""
return self._means
def _sum_distances(self, vectors1, vectors2):
difference = 0.0
for u, v in zip(vectors1, vectors2):
difference += self._distance(u, v)
return difference
def _centroid(self, cluster, mean):
if self._avoid_empty_clusters:
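            # count the previous mean as one extra pseudo-point, so the
            # denominator below can never be zero even for an empty cluster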
centroid = copy.copy(mean)
for vector in cluster:
centroid += vector
return centroid / (1 + len(cluster))
else:
if not len(cluster):
sys.stderr.write("Error: no centroid defined for empty cluster.\n")
sys.stderr.write(
"Try setting argument 'avoid_empty_clusters' to True\n"
)
assert False
centroid = copy.copy(cluster[0])
for vector in cluster[1:]:
centroid += vector
return centroid / len(cluster)
def __repr__(self):
return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
#################################################################################
def demo():
# example from figure 14.9, page 517, Manning and Schutze
from nltk.cluster import KMeansClusterer, euclidean_distance
vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]
clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)
print("Clustered:", vectors)
print("As:", clusters)
print("Means:", clusterer.means())
print()
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
# test k-means using the euclidean distance metric, 2 means and repeat
# clustering 10 times with random seeds
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print("Clustered:", vectors)
print("As:", clusters)
print("Means:", clusterer.means())
print()
# classify a new vector
vector = numpy.array([3, 3])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
print()
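
def demo_seeded_repeats():
    """
    Illustrative sketch, not exercised by demo() above: pass a seeded random
    number generator so that repeated clustering trials are reproducible from
    run to run.
    """
    from nltk.cluster import euclidean_distance

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    clusterer = KMeansClusterer(
        2, euclidean_distance, repeats=10, rng=random.Random(42)
    )
    print("As:", clusterer.cluster(vectors, True))
    print("Means:", clusterer.means())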
if __name__ == "__main__":
demo()


@@ -0,0 +1,300 @@
# Natural Language Toolkit: Clusterer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
from abc import abstractmethod
from math import sqrt
from sys import stdout
try:
import numpy
except ImportError:
pass
from nltk.cluster.api import ClusterI
class VectorSpaceClusterer(ClusterI):
"""
Abstract clusterer which takes tokens and maps them into a vector space.
Optionally performs singular value decomposition to reduce the
dimensionality.
"""
def __init__(self, normalise=False, svd_dimensions=None):
"""
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
:param svd_dimensions: number of dimensions to use in reducing vector
               dimensionality with SVD
:type svd_dimensions: int
"""
self._Tt = None
self._should_normalise = normalise
self._svd_dimensions = svd_dimensions
def cluster(self, vectors, assign_clusters=False, trace=False):
assert len(vectors) > 0
# normalise the vectors
if self._should_normalise:
vectors = list(map(self._normalise, vectors))
# use SVD to reduce the dimensionality
if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
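            # truncated SVD: A ~= T * S * Dt, where A holds the vectors as
            # columns; the rows of transpose(S * Dt) become the reduced
            # vectors, and Tt later projects new vectors into the same space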
[u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
S = d[: self._svd_dimensions] * numpy.identity(
self._svd_dimensions, numpy.float64
)
T = u[:, : self._svd_dimensions]
Dt = vt[: self._svd_dimensions, :]
vectors = numpy.transpose(numpy.dot(S, Dt))
self._Tt = numpy.transpose(T)
# call abstract method to cluster the vectors
self.cluster_vectorspace(vectors, trace)
# assign the vectors to clusters
if assign_clusters:
return [self.classify(vector) for vector in vectors]
@abstractmethod
def cluster_vectorspace(self, vectors, trace):
"""
Finds the clusters using the given set of vectors.
"""
def classify(self, vector):
if self._should_normalise:
vector = self._normalise(vector)
if self._Tt is not None:
vector = numpy.dot(self._Tt, vector)
cluster = self.classify_vectorspace(vector)
return self.cluster_name(cluster)
@abstractmethod
def classify_vectorspace(self, vector):
"""
Returns the index of the appropriate cluster for the vector.
"""
def likelihood(self, vector, label):
if self._should_normalise:
vector = self._normalise(vector)
if self._Tt is not None:
vector = numpy.dot(self._Tt, vector)
return self.likelihood_vectorspace(vector, label)
def likelihood_vectorspace(self, vector, cluster):
"""
Returns the likelihood of the vector belonging to the cluster.
"""
predicted = self.classify_vectorspace(vector)
return 1.0 if cluster == predicted else 0.0
def vector(self, vector):
"""
Returns the vector after normalisation and dimensionality reduction
"""
if self._should_normalise:
vector = self._normalise(vector)
if self._Tt is not None:
vector = numpy.dot(self._Tt, vector)
return vector
def _normalise(self, vector):
"""
Normalises the vector to unit length.
"""
return vector / sqrt(numpy.dot(vector, vector))
def euclidean_distance(u, v):
"""
Returns the euclidean distance between vectors u and v. This is equivalent
to the length of the vector (u - v).
"""
diff = u - v
return sqrt(numpy.dot(diff, diff))
def cosine_distance(u, v):
"""
Returns 1 minus the cosine of the angle between vectors v and u. This is
equal to ``1 - (u.v / |u||v|)``.
"""
return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
class _DendrogramNode:
"""Tree node of a dendrogram."""
def __init__(self, value, *children):
self._value = value
self._children = children
def leaves(self, values=True):
if self._children:
leaves = []
for child in self._children:
leaves.extend(child.leaves(values))
return leaves
elif values:
return [self._value]
else:
return [self]
def groups(self, n):
queue = [(self._value, self)]
while len(queue) < n:
priority, node = queue.pop()
if not node._children:
                queue.append((priority, node))
break
for child in node._children:
if child._children:
queue.append((child._value, child))
else:
queue.append((0, child))
# makes the earliest merges at the start, latest at the end
queue.sort()
groups = []
for priority, node in queue:
groups.append(node.leaves())
return groups
def __lt__(self, comparator):
return cosine_distance(self._value, comparator._value) < 0
class Dendrogram:
"""
Represents a dendrogram, a tree with a specified branching order. This
must be initialised with the leaf items, then iteratively call merge for
each branch. This class constructs a tree representing the order of calls
to the merge function.
"""
def __init__(self, items=[]):
"""
:param items: the items at the leaves of the dendrogram
:type items: sequence of (any)
"""
self._items = [_DendrogramNode(item) for item in items]
self._original_items = copy.copy(self._items)
self._merge = 1
def merge(self, *indices):
"""
        Merges the nodes at the given indices in the dendrogram. The nodes
        are combined into a single node, which then replaces the first node
        specified. All other nodes involved in the merge are removed.
:param indices: indices of the items to merge (at least two)
:type indices: seq of int
"""
assert len(indices) >= 2
node = _DendrogramNode(self._merge, *(self._items[i] for i in indices))
self._merge += 1
self._items[indices[0]] = node
for i in indices[1:]:
del self._items[i]
def groups(self, n):
"""
Finds the n-groups of items (leaves) reachable from a cut at depth n.
:param n: number of groups
:type n: int
"""
if len(self._items) > 1:
root = _DendrogramNode(self._merge, *self._items)
else:
root = self._items[0]
return root.groups(n)
def show(self, leaf_labels=[]):
"""
Print the dendrogram in ASCII art to standard out.
:param leaf_labels: an optional list of strings to use for labeling the
leaves
:type leaf_labels: list
"""
# ASCII rendering characters
JOIN, HLINK, VLINK = "+", "-", "|"
# find the root (or create one)
if len(self._items) > 1:
root = _DendrogramNode(self._merge, *self._items)
else:
root = self._items[0]
leaves = self._original_items
if leaf_labels:
last_row = leaf_labels
else:
last_row = ["%s" % leaf._value for leaf in leaves]
# find the bottom row and the best cell width
width = max(map(len, last_row)) + 1
lhalf = width // 2
rhalf = int(width - lhalf - 1)
# display functions
def format(centre, left=" ", right=" "):
return f"{lhalf * left}{centre}{right * rhalf}"
def display(str):
stdout.write(str)
# for each merge, top down
queue = [(root._value, root)]
verticals = [format(" ") for leaf in leaves]
while queue:
priority, node = queue.pop()
child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
indices = list(map(leaves.index, child_left_leaf))
if child_left_leaf:
min_idx = min(indices)
max_idx = max(indices)
for i in range(len(leaves)):
if leaves[i] in child_left_leaf:
if i == min_idx:
display(format(JOIN, " ", HLINK))
elif i == max_idx:
display(format(JOIN, HLINK, " "))
else:
display(format(JOIN, HLINK, HLINK))
verticals[i] = format(VLINK)
elif min_idx <= i <= max_idx:
display(format(HLINK, HLINK, HLINK))
else:
display(verticals[i])
display("\n")
for child in node._children:
if child._children:
queue.append((child._value, child))
queue.sort()
for vertical in verticals:
display(vertical)
display("\n")
# finally, display the last line
display("".join(item.center(width) for item in last_row))
display("\n")
def __repr__(self):
if len(self._items) > 1:
root = _DendrogramNode(self._merge, *self._items)
else:
root = self._items[0]
leaves = root.leaves(False)
return "<Dendrogram with %d leaves>" % len(leaves)