Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions


@@ -0,0 +1,92 @@
# Natural Language Toolkit: Clusterers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
This module contains a number of basic clustering algorithms. Clustering
describes the task of discovering groups of similar items within a large
collection. It is also described as unsupervised machine learning, as the data
from which it learns is unannotated with class information, unlike the data
used for supervised learning. Annotated data is difficult and expensive to
obtain in the quantities required for the majority of supervised learning
algorithms. This problem, the knowledge acquisition bottleneck, is common to
most natural language processing tasks, thus fueling the need for quality
unsupervised approaches.
This module contains a k-means clusterer, E-M clusterer and a group average
agglomerative clusterer (GAAC). All these clusterers involve finding good
cluster groupings for a set of vectors in multi-dimensional space.
The K-means clusterer starts with k arbitrarily chosen means, then allocates each
vector to the cluster with the closest mean. It then recalculates the means of
each cluster as the centroid of the vectors in the cluster. This process
repeats until the cluster memberships stabilise. This is a hill-climbing
algorithm which may converge to a local maximum. Hence the clustering is
often repeated with random initial means and the most commonly occurring
output means are chosen.
The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
It then iteratively merges pairs of clusters which have the closest centroids.
This continues until there is only one cluster. The order of merges gives rise
to a dendrogram - a tree with the earlier merges lower than later merges. The
membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
cutting the dendrogram at depth *c*.
The Gaussian EM clusterer models the vectors as being produced by a mixture
of k Gaussian sources. The parameters of these sources (prior probability,
mean and covariance matrix) are then found to maximise the likelihood of the
given data. This is done with the expectation maximisation algorithm. It
starts with k arbitrarily chosen means, priors and covariance matrices. It
then calculates the membership probabilities for each vector in each of the
clusters - this is the 'E' step. The cluster parameters are then updated in
the 'M' step using the maximum likelihood estimate from the cluster membership
probabilities. This process continues until the likelihood of the data does
not significantly increase.
They all extend the ClusterI interface which defines common operations
available with each clusterer. These operations include:
- cluster: clusters a sequence of vectors
- classify: assign a vector to a cluster
- classification_probdist: give the probability distribution over cluster memberships
The existing clusterers also extend VectorSpaceClusterer, an abstract class
which allows for singular value decomposition (SVD) and vector
normalisation. SVD is used to reduce the dimensionality of the vector space in
such a manner as to preserve as much of the variation as possible, by
reparameterising the axes in order of variability and discarding all bar the
first d dimensions. Normalisation ensures that vectors fall in the unit
hypersphere.
Usage example (see also demo())::
from nltk import cluster
from nltk.cluster import euclidean_distance
from numpy import array
vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
# initialise the clusterer (will also assign the vectors to clusters)
clusterer = cluster.KMeansClusterer(2, euclidean_distance)
clusterer.cluster(vectors, True)
# classify a new vector
print(clusterer.classify(array([3, 3])))
Note that the vectors must be numpy array-like
objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
efficiency when required.
"""
from nltk.cluster.em import EMClusterer
from nltk.cluster.gaac import GAAClusterer
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import (
Dendrogram,
VectorSpaceClusterer,
cosine_distance,
euclidean_distance,
)
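
# Illustrative usage sketch (mirrors the docstring example above; assumes numpy
# is installed and only runs when this module is executed directly).
if __name__ == "__main__":
    from numpy import array

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
    # initialise the clusterer and assign each vector to a cluster
    clusterer = KMeansClusterer(2, euclidean_distance)
    print("assignments:", clusterer.cluster(vectors, True))
    # classify a previously unseen vector
    print("classify([3, 3]):", clusterer.classify(array([3, 3])))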


@@ -0,0 +1,74 @@
# Natural Language Toolkit: Clusterer Interfaces
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from abc import ABCMeta, abstractmethod
from nltk.probability import DictionaryProbDist
class ClusterI(metaclass=ABCMeta):
"""
Interface covering basic clustering functionality.
"""
@abstractmethod
def cluster(self, vectors, assign_clusters=False):
"""
Assigns the vectors to clusters, learning the clustering parameters
from the data. Returns a cluster identifier for each vector.
"""
@abstractmethod
def classify(self, token):
"""
        Classifies the token into a cluster, returning that cluster's
        identifier.
"""
def likelihood(self, vector, label):
"""
        Returns the likelihood (a float) of the token belonging to the
        given cluster.
"""
if self.classify(vector) == label:
return 1.0
else:
return 0.0
def classification_probdist(self, vector):
"""
Classifies the token into a cluster, returning
a probability distribution over the cluster identifiers.
"""
        likelihoods = {}
        total = 0.0
        for cluster in self.cluster_names():
            likelihoods[cluster] = self.likelihood(vector, cluster)
            total += likelihoods[cluster]
        for cluster in self.cluster_names():
            likelihoods[cluster] /= total
return DictionaryProbDist(likelihoods)
@abstractmethod
def num_clusters(self):
"""
Returns the number of clusters.
"""
def cluster_names(self):
"""
Returns the names of the clusters.
:rtype: list
"""
return list(range(self.num_clusters()))
def cluster_name(self, index):
"""
        Returns the name of the cluster at the given index.
"""
return index
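
# A minimal sketch of a concrete ClusterI implementation (hypothetical and for
# illustration only, not part of the NLTK API): it keeps one exemplar per
# cluster and classifies a vector to the cluster whose exemplar is nearest in
# squared distance.
class _ExemplarClusterer(ClusterI):
    def cluster(self, vectors, assign_clusters=False):
        # treat each input vector as the exemplar of its own cluster
        self._exemplars = list(vectors)
        if assign_clusters:
            return [self.classify(v) for v in vectors]

    def classify(self, token):
        distances = [
            sum((t - e) ** 2 for t, e in zip(token, exemplar))
            for exemplar in self._exemplars
        ]
        return distances.index(min(distances))

    def num_clusters(self):
        return len(self._exemplars)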


@@ -0,0 +1,219 @@
# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import VectorSpaceClusterer
class EMClusterer(VectorSpaceClusterer):
"""
The Gaussian EM clusterer models the vectors as being produced by
a mixture of k Gaussian sources. The parameters of these sources
(prior probability, mean and covariance matrix) are then found to
maximise the likelihood of the given data. This is done with the
expectation maximisation algorithm. It starts with k arbitrarily
chosen means, priors and covariance matrices. It then calculates
the membership probabilities for each vector in each of the
clusters; this is the 'E' step. The cluster parameters are then
updated in the 'M' step using the maximum likelihood estimate from
the cluster membership probabilities. This process continues until
the likelihood of the data does not significantly increase.
"""
def __init__(
self,
initial_means,
priors=None,
covariance_matrices=None,
conv_threshold=1e-6,
bias=0.1,
normalise=False,
svd_dimensions=None,
):
"""
Creates an EM clusterer with the given starting parameters,
convergence threshold and vector mangling parameters.
:param initial_means: the means of the gaussian cluster centers
:type initial_means: [seq of] numpy array or seq of SparseArray
:param priors: the prior probability for each cluster
:type priors: numpy array or seq of float
:param covariance_matrices: the covariance matrix for each cluster
:type covariance_matrices: [seq of] numpy array
:param conv_threshold: maximum change in likelihood before deemed
convergent
:type conv_threshold: int or float
:param bias: variance bias used to ensure non-singular covariance
matrices
:type bias: float
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
:param svd_dimensions: number of dimensions to use in reducing vector
               dimensionality with SVD
:type svd_dimensions: int
"""
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._means = numpy.array(initial_means, numpy.float64)
self._num_clusters = len(initial_means)
self._conv_threshold = conv_threshold
self._covariance_matrices = covariance_matrices
self._priors = priors
self._bias = bias
def num_clusters(self):
return self._num_clusters
def cluster_vectorspace(self, vectors, trace=False):
assert len(vectors) > 0
# set the parameters to initial values
dimensions = len(vectors[0])
means = self._means
priors = self._priors
if not priors:
priors = self._priors = (
numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
)
covariances = self._covariance_matrices
if not covariances:
covariances = self._covariance_matrices = [
numpy.identity(dimensions, numpy.float64)
for i in range(self._num_clusters)
]
# do the E and M steps until the likelihood plateaus
lastl = self._loglikelihood(vectors, priors, means, covariances)
converged = False
while not converged:
if trace:
print("iteration; loglikelihood", lastl)
# E-step, calculate hidden variables, h[i,j]
h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
for i in range(len(vectors)):
for j in range(self._num_clusters):
h[i, j] = priors[j] * self._gaussian(
means[j], covariances[j], vectors[i]
)
h[i, :] /= sum(h[i, :])
# M-step, update parameters - cvm, p, mean
for j in range(self._num_clusters):
covariance_before = covariances[j]
new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
new_mean = numpy.zeros(dimensions, numpy.float64)
sum_hj = 0.0
for i in range(len(vectors)):
delta = vectors[i] - means[j]
new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
sum_hj += h[i, j]
new_mean += h[i, j] * vectors[i]
covariances[j] = new_covariance / sum_hj
means[j] = new_mean / sum_hj
priors[j] = sum_hj / len(vectors)
# bias term to stop covariance matrix being singular
covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)
# calculate likelihood - FIXME: may be broken
l = self._loglikelihood(vectors, priors, means, covariances)
# check for convergence
if abs(lastl - l) < self._conv_threshold:
converged = True
lastl = l
def classify_vectorspace(self, vector):
best = None
for j in range(self._num_clusters):
p = self._priors[j] * self._gaussian(
self._means[j], self._covariance_matrices[j], vector
)
if not best or p > best[0]:
best = (p, j)
return best[1]
    def likelihood_vectorspace(self, vector, cluster):
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )
def _gaussian(self, mean, cvm, x):
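        """
        Evaluates the multivariate Gaussian density at ``x`` for the given
        ``mean`` and covariance matrix ``cvm``::

            p(x) = (2*pi)**(-m/2) * det(cvm)**(-1/2)
                   * exp(-0.5 * (x - mean)' * inv(cvm) * (x - mean))
        """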
m = len(mean)
assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
try:
det = numpy.linalg.det(cvm)
inv = numpy.linalg.inv(cvm)
a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
dx = x - mean
b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
return a * numpy.exp(b)
except OverflowError:
# happens when the exponent is negative infinity - i.e. b = 0
# i.e. the inverse of cvm is huge (cvm is almost zero)
return 0
def _loglikelihood(self, vectors, priors, means, covariances):
llh = 0.0
for vector in vectors:
p = 0
for j in range(len(priors)):
p += priors[j] * self._gaussian(means[j], covariances[j], vector)
llh += numpy.log(p)
return llh
def __repr__(self):
return "<EMClusterer means=%s>" % list(self._means)
def demo():
"""
Non-interactive demonstration of the clusterers with simple 2-D data.
"""
from nltk import cluster
# example from figure 14.10, page 519, Manning and Schutze
vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
means = [[4, 2], [4, 2.01]]
clusterer = cluster.EMClusterer(means, bias=0.1)
clusters = clusterer.cluster(vectors, True, trace=True)
print("Clustered:", vectors)
print("As: ", clusters)
print()
for c in range(2):
print("Cluster:", c)
print("Prior: ", clusterer._priors[c])
print("Mean: ", clusterer._means[c])
print("Covar: ", clusterer._covariance_matrices[c])
print()
# classify a new vector
vector = numpy.array([2, 2])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
# show the classification probabilities
vector = numpy.array([2, 2])
print("classification_probdist(%s):" % vector)
pdist = clusterer.classification_probdist(vector)
for sample in pdist.samples():
print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")
if __name__ == "__main__":
demo()


@@ -0,0 +1,170 @@
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance
class GAAClusterer(VectorSpaceClusterer):
"""
    The Group Average Agglomerative clusterer (GAAC) starts with each of the
    N vectors as singleton
clusters. It then iteratively merges pairs of clusters which have the
closest centroids. This continues until there is only one cluster. The
order of merges gives rise to a dendrogram: a tree with the earlier merges
lower than later merges. The membership of a given number of clusters c, 1
<= c <= N, can be found by cutting the dendrogram at depth c.
This clusterer uses the cosine similarity metric only, which allows for
efficient speed-up in the clustering process.
"""
def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._num_clusters = num_clusters
self._dendrogram = None
self._groups_values = None
def cluster(self, vectors, assign_clusters=False, trace=False):
# stores the merge order
self._dendrogram = Dendrogram(
[numpy.array(vector, numpy.float64) for vector in vectors]
)
return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)
def cluster_vectorspace(self, vectors, trace=False):
# variables describing the initial situation
N = len(vectors)
cluster_len = [1] * N
cluster_count = N
index_map = numpy.arange(N)
# construct the similarity matrix
dims = (N, N)
dist = numpy.ones(dims, dtype=float) * numpy.inf
for i in range(N):
for j in range(i + 1, N):
dist[i, j] = cosine_distance(vectors[i], vectors[j])
while cluster_count > max(self._num_clusters, 1):
i, j = numpy.unravel_index(dist.argmin(), dims)
if trace:
print("merging %d and %d" % (i, j))
# update similarities for merging i and j
self._merge_similarities(dist, cluster_len, i, j)
# remove j
dist[:, j] = numpy.inf
dist[j, :] = numpy.inf
# merge the clusters
cluster_len[i] = cluster_len[i] + cluster_len[j]
self._dendrogram.merge(index_map[i], index_map[j])
cluster_count -= 1
# update the index map to reflect the indexes if we
# had removed j
index_map[j + 1 :] -= 1
index_map[j] = N
self.update_clusters(self._num_clusters)
def _merge_similarities(self, dist, cluster_len, i, j):
# the new cluster i merged from i and j adopts the average of
# i and j's similarity to each other cluster, weighted by the
# number of points in the clusters i and j
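        # i.e. for every other cluster x:
        #     d(i+j, x) = (|i| * d(i, x) + |j| * d(j, x)) / (|i| + |j|)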
i_weight = cluster_len[i]
j_weight = cluster_len[j]
weight_sum = i_weight + j_weight
# update for x<i
dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
dist[:i, i] /= weight_sum
# update for i<x<j
dist[i, i + 1 : j] = (
dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
)
# update for i<j<x
dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
dist[i, i + 1 :] /= weight_sum
def update_clusters(self, num_clusters):
clusters = self._dendrogram.groups(num_clusters)
self._centroids = []
for cluster in clusters:
assert len(cluster) > 0
if self._should_normalise:
centroid = self._normalise(cluster[0])
else:
centroid = numpy.array(cluster[0])
for vector in cluster[1:]:
if self._should_normalise:
centroid += self._normalise(vector)
else:
centroid += vector
centroid /= len(cluster)
self._centroids.append(centroid)
self._num_clusters = len(self._centroids)
def classify_vectorspace(self, vector):
best = None
for i in range(self._num_clusters):
centroid = self._centroids[i]
dist = cosine_distance(vector, centroid)
if not best or dist < best[0]:
best = (dist, i)
return best[1]
def dendrogram(self):
"""
:return: The dendrogram representing the current clustering
:rtype: Dendrogram
"""
return self._dendrogram
def num_clusters(self):
return self._num_clusters
def __repr__(self):
return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
def demo():
"""
Non-interactive demonstration of the clusterers with simple 2-D data.
"""
from nltk.cluster import GAAClusterer
# use a set of tokens with 2D indices
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
# test the GAAC clusterer with 4 clusters
clusterer = GAAClusterer(4)
clusters = clusterer.cluster(vectors, True)
print("Clusterer:", clusterer)
print("Clustered:", vectors)
print("As:", clusters)
print()
# show the dendrogram
clusterer.dendrogram().show()
# classify a new vector
vector = numpy.array([3, 3])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
print()
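
def demo_dendrogram_groups():
    """
    Illustrative sketch, not exercised by demo() above: cluster once, then
    read off different numbers of groups from the dendrogram without
    re-clustering.
    """
    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    clusterer = GAAClusterer(1)
    clusterer.cluster(vectors)
    dendrogram = clusterer.dendrogram()
    for c in (2, 3, 4):
        print("%d groups:" % c, dendrogram.groups(c))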
if __name__ == "__main__":
demo()


@@ -0,0 +1,230 @@
# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
import random
import sys
try:
import numpy
except ImportError:
pass
from nltk.cluster.util import VectorSpaceClusterer
class KMeansClusterer(VectorSpaceClusterer):
"""
    The K-means clusterer starts with k arbitrarily chosen means, then allocates
each vector to the cluster with the closest mean. It then recalculates the
means of each cluster as the centroid of the vectors in the cluster. This
process repeats until the cluster memberships stabilise. This is a
hill-climbing algorithm which may converge to a local maximum. Hence the
clustering is often repeated with random initial means and the most
commonly occurring output means are chosen.
"""
def __init__(
self,
num_means,
distance,
repeats=1,
conv_test=1e-6,
initial_means=None,
normalise=False,
svd_dimensions=None,
rng=None,
avoid_empty_clusters=False,
):
"""
:param num_means: the number of means to use (may use fewer)
:type num_means: int
:param distance: measure of distance between two vectors
:type distance: function taking two vectors and returning a float
:param repeats: number of randomised clustering trials to use
:type repeats: int
:param conv_test: maximum variation in mean differences before
deemed convergent
:type conv_test: number
:param initial_means: set of k initial means
:type initial_means: sequence of vectors
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
:param svd_dimensions: number of dimensions to use in reducing vector
               dimensionality with SVD
:type svd_dimensions: int
:param rng: random number generator (or None)
:type rng: Random
:param avoid_empty_clusters: include current centroid in computation
of next one; avoids undefined behavior
when clusters become empty
:type avoid_empty_clusters: boolean
"""
VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
self._num_means = num_means
self._distance = distance
self._max_difference = conv_test
assert not initial_means or len(initial_means) == num_means
self._means = initial_means
assert repeats >= 1
assert not (initial_means and repeats > 1)
self._repeats = repeats
self._rng = rng if rng else random.Random()
self._avoid_empty_clusters = avoid_empty_clusters
def cluster_vectorspace(self, vectors, trace=False):
if self._means and self._repeats > 1:
print("Warning: means will be discarded for subsequent trials")
meanss = []
for trial in range(self._repeats):
if trace:
print("k-means trial", trial)
            if not self._means or trial > 0:
self._means = self._rng.sample(list(vectors), self._num_means)
self._cluster_vectorspace(vectors, trace)
meanss.append(self._means)
if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
for means in meanss:
means.sort(key=sum)
# find the set of means that's minimally different from the others
min_difference = min_means = None
for i in range(len(meanss)):
d = 0
for j in range(len(meanss)):
if i != j:
d += self._sum_distances(meanss[i], meanss[j])
if min_difference is None or d < min_difference:
min_difference, min_means = d, meanss[i]
# use the best means
self._means = min_means
def _cluster_vectorspace(self, vectors, trace=False):
if self._num_means < len(vectors):
# perform k-means clustering
converged = False
while not converged:
# assign the tokens to clusters based on minimum distance to
# the cluster means
clusters = [[] for m in range(self._num_means)]
for vector in vectors:
index = self.classify_vectorspace(vector)
clusters[index].append(vector)
if trace:
print("iteration")
# for i in range(self._num_means):
# print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
# recalculate cluster means by computing the centroid of each cluster
new_means = list(map(self._centroid, clusters, self._means))
# measure the degree of change from the previous step for convergence
difference = self._sum_distances(self._means, new_means)
if difference < self._max_difference:
converged = True
# remember the new means
self._means = new_means
def classify_vectorspace(self, vector):
# finds the closest cluster centroid
# returns that cluster's index
best_distance = best_index = None
for index in range(len(self._means)):
mean = self._means[index]
dist = self._distance(vector, mean)
if best_distance is None or dist < best_distance:
best_index, best_distance = index, dist
return best_index
def num_clusters(self):
if self._means:
return len(self._means)
else:
return self._num_means
def means(self):
"""
The means used for clustering.
"""
return self._means
def _sum_distances(self, vectors1, vectors2):
difference = 0.0
for u, v in zip(vectors1, vectors2):
difference += self._distance(u, v)
return difference
def _centroid(self, cluster, mean):
if self._avoid_empty_clusters:
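            # count the previous mean as one extra pseudo-point, so the
            # denominator below can never be zero even for an empty cluster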
centroid = copy.copy(mean)
for vector in cluster:
centroid += vector
return centroid / (1 + len(cluster))
else:
if not len(cluster):
sys.stderr.write("Error: no centroid defined for empty cluster.\n")
sys.stderr.write(
"Try setting argument 'avoid_empty_clusters' to True\n"
)
assert False
centroid = copy.copy(cluster[0])
for vector in cluster[1:]:
centroid += vector
return centroid / len(cluster)
def __repr__(self):
return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
#################################################################################
def demo():
# example from figure 14.9, page 517, Manning and Schutze
from nltk.cluster import KMeansClusterer, euclidean_distance
vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]
clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)
print("Clustered:", vectors)
print("As:", clusters)
print("Means:", clusterer.means())
print()
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
# test k-means using the euclidean distance metric, 2 means and repeat
# clustering 10 times with random seeds
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print("Clustered:", vectors)
print("As:", clusters)
print("Means:", clusterer.means())
print()
# classify a new vector
vector = numpy.array([3, 3])
print("classify(%s):" % vector, end=" ")
print(clusterer.classify(vector))
print()
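
def demo_seeded_repeats():
    """
    Illustrative sketch, not exercised by demo() above: pass a seeded random
    number generator so that repeated clustering trials are reproducible from
    run to run.
    """
    from nltk.cluster import euclidean_distance

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    clusterer = KMeansClusterer(
        2, euclidean_distance, repeats=10, rng=random.Random(42)
    )
    print("As:", clusterer.cluster(vectors, True))
    print("Means:", clusterer.means())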
if __name__ == "__main__":
demo()


@@ -0,0 +1,300 @@
# Natural Language Toolkit: Clusterer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import copy
from abc import abstractmethod
from math import sqrt
from sys import stdout
try:
import numpy
except ImportError:
pass
from nltk.cluster.api import ClusterI
class VectorSpaceClusterer(ClusterI):
"""
Abstract clusterer which takes tokens and maps them into a vector space.
Optionally performs singular value decomposition to reduce the
dimensionality.
"""
def __init__(self, normalise=False, svd_dimensions=None):
"""
:param normalise: should vectors be normalised to length 1
:type normalise: boolean
:param svd_dimensions: number of dimensions to use in reducing vector
               dimensionality with SVD
:type svd_dimensions: int
"""
self._Tt = None
self._should_normalise = normalise
self._svd_dimensions = svd_dimensions
def cluster(self, vectors, assign_clusters=False, trace=False):
assert len(vectors) > 0
# normalise the vectors
if self._should_normalise:
vectors = list(map(self._normalise, vectors))
# use SVD to reduce the dimensionality
if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
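            # truncated SVD: A ~= T * S * Dt, where A holds the vectors as
            # columns; the rows of transpose(S * Dt) become the reduced
            # vectors, and Tt later projects new vectors into the same space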
[u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
S = d[: self._svd_dimensions] * numpy.identity(
self._svd_dimensions, numpy.float64
)
T = u[:, : self._svd_dimensions]
Dt = vt[: self._svd_dimensions, :]
vectors = numpy.transpose(numpy.dot(S, Dt))
self._Tt = numpy.transpose(T)
# call abstract method to cluster the vectors
self.cluster_vectorspace(vectors, trace)
# assign the vectors to clusters
if assign_clusters:
return [self.classify(vector) for vector in vectors]
@abstractmethod
def cluster_vectorspace(self, vectors, trace):
"""
Finds the clusters using the given set of vectors.
"""
def classify(self, vector):
if self._should_normalise:
vector = self._normalise(vector)
if self._Tt is not None:
vector = numpy.dot(self._Tt, vector)
cluster = self.classify_vectorspace(vector)
return self.cluster_name(cluster)
@abstractmethod
def classify_vectorspace(self, vector):
"""
Returns the index of the appropriate cluster for the vector.
"""
def likelihood(self, vector, label):
if self._should_normalise:
vector = self._normalise(vector)
if self._Tt is not None:
vector = numpy.dot(self._Tt, vector)
return self.likelihood_vectorspace(vector, label)
def likelihood_vectorspace(self, vector, cluster):
"""
Returns the likelihood of the vector belonging to the cluster.
"""
predicted = self.classify_vectorspace(vector)
return 1.0 if cluster == predicted else 0.0
def vector(self, vector):
"""
Returns the vector after normalisation and dimensionality reduction
"""
if self._should_normalise:
vector = self._normalise(vector)
if self._Tt is not None:
vector = numpy.dot(self._Tt, vector)
return vector
def _normalise(self, vector):
"""
Normalises the vector to unit length.
"""
return vector / sqrt(numpy.dot(vector, vector))
def euclidean_distance(u, v):
"""
Returns the euclidean distance between vectors u and v. This is equivalent
to the length of the vector (u - v).
"""
diff = u - v
return sqrt(numpy.dot(diff, diff))
def cosine_distance(u, v):
"""
Returns 1 minus the cosine of the angle between vectors v and u. This is
equal to ``1 - (u.v / |u||v|)``.
"""
return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
class _DendrogramNode:
"""Tree node of a dendrogram."""
def __init__(self, value, *children):
self._value = value
self._children = children
def leaves(self, values=True):
if self._children:
leaves = []
for child in self._children:
leaves.extend(child.leaves(values))
return leaves
elif values:
return [self._value]
else:
return [self]
def groups(self, n):
queue = [(self._value, self)]
while len(queue) < n:
priority, node = queue.pop()
if not node._children:
                queue.append((priority, node))
break
for child in node._children:
if child._children:
queue.append((child._value, child))
else:
queue.append((0, child))
# makes the earliest merges at the start, latest at the end
queue.sort()
groups = []
for priority, node in queue:
groups.append(node.leaves())
return groups
def __lt__(self, comparator):
return cosine_distance(self._value, comparator._value) < 0
class Dendrogram:
"""
Represents a dendrogram, a tree with a specified branching order. This
must be initialised with the leaf items, then iteratively call merge for
each branch. This class constructs a tree representing the order of calls
to the merge function.
"""
def __init__(self, items=[]):
"""
:param items: the items at the leaves of the dendrogram
:type items: sequence of (any)
"""
self._items = [_DendrogramNode(item) for item in items]
self._original_items = copy.copy(self._items)
self._merge = 1
def merge(self, *indices):
"""
        Merges the nodes at the given indices in the dendrogram. The nodes
        are combined into a single node, which then replaces the first node
        specified. All other nodes involved in the merge are removed.
:param indices: indices of the items to merge (at least two)
:type indices: seq of int
"""
assert len(indices) >= 2
node = _DendrogramNode(self._merge, *(self._items[i] for i in indices))
self._merge += 1
self._items[indices[0]] = node
for i in indices[1:]:
del self._items[i]
def groups(self, n):
"""
Finds the n-groups of items (leaves) reachable from a cut at depth n.
:param n: number of groups
:type n: int
"""
if len(self._items) > 1:
root = _DendrogramNode(self._merge, *self._items)
else:
root = self._items[0]
return root.groups(n)
def show(self, leaf_labels=[]):
"""
Print the dendrogram in ASCII art to standard out.
:param leaf_labels: an optional list of strings to use for labeling the
leaves
:type leaf_labels: list
"""
# ASCII rendering characters
JOIN, HLINK, VLINK = "+", "-", "|"
# find the root (or create one)
if len(self._items) > 1:
root = _DendrogramNode(self._merge, *self._items)
else:
root = self._items[0]
leaves = self._original_items
if leaf_labels:
last_row = leaf_labels
else:
last_row = ["%s" % leaf._value for leaf in leaves]
# find the bottom row and the best cell width
width = max(map(len, last_row)) + 1
lhalf = width // 2
rhalf = int(width - lhalf - 1)
# display functions
def format(centre, left=" ", right=" "):
return f"{lhalf * left}{centre}{right * rhalf}"
def display(str):
stdout.write(str)
# for each merge, top down
queue = [(root._value, root)]
verticals = [format(" ") for leaf in leaves]
while queue:
priority, node = queue.pop()
child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
indices = list(map(leaves.index, child_left_leaf))
if child_left_leaf:
min_idx = min(indices)
max_idx = max(indices)
for i in range(len(leaves)):
if leaves[i] in child_left_leaf:
if i == min_idx:
display(format(JOIN, " ", HLINK))
elif i == max_idx:
display(format(JOIN, HLINK, " "))
else:
display(format(JOIN, HLINK, HLINK))
verticals[i] = format(VLINK)
elif min_idx <= i <= max_idx:
display(format(HLINK, HLINK, HLINK))
else:
display(verticals[i])
display("\n")
for child in node._children:
if child._children:
queue.append((child._value, child))
queue.sort()
for vertical in verticals:
display(vertical)
display("\n")
# finally, display the last line
display("".join(item.center(width) for item in last_row))
display("\n")
def __repr__(self):
if len(self._items) > 1:
root = _DendrogramNode(self._merge, *self._items)
else:
root = self._items[0]
leaves = root.leaves(False)
return "<Dendrogram with %d leaves>" % len(leaves)