updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/tokenize/texttiling.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/tokenize/texttiling.py
@@ -0,0 +1,474 @@
+# Natural Language Toolkit: TextTiling
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Author: George Boutsioukis
+#
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import math
+import re
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+from nltk.tokenize.api import TokenizerI
+
+# Identifiers accepted by ``TextTilingTokenizer(similarity_method=...)``.
+# ``tokenize`` only implements BLOCK_COMPARISON; VOCABULARY_INTRODUCTION
+# raises NotImplementedError there.
+BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1
+# Identifiers accepted by ``TextTilingTokenizer(cutoff_policy=...)``.
+# ``_identify_boundaries`` uses ``avg - stdev`` as the cutoff for LC and
+# ``avg - stdev / 2`` for any other value (HC is the constructor default).
+LC, HC = 0, 1
+# Identifier of the only smoothing method ``tokenize`` accepts; it is
+# compared by equality against ``smoothing_method``.
+DEFAULT_SMOOTHING = [0]
+
+
+class TextTilingTokenizer(TokenizerI):
+    """Tokenize a document into topical sections using the TextTiling algorithm.
+    This algorithm detects subtopic shifts based on the analysis of lexical
+    co-occurrence patterns.
+
+    The process starts by tokenizing the text into pseudosentences of
+    a fixed size w. Then, depending on the method used, similarity
+    scores are assigned at sentence gaps. The algorithm proceeds by
+    detecting the peak differences between these scores and marking
+    them as boundaries. The boundaries are normalized to the closest
+    paragraph break and the segmented text is returned.
+
+    :param w: Pseudosentence size
+    :type w: int
+    :param k: Size (in sentences) of the block used in the block comparison method
+    :type k: int
+    :param similarity_method: The method used for determining similarity scores:
+       `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
+    :type similarity_method: constant
+    :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
+    :type stopwords: list(str)
+    :param smoothing_method: The method used for smoothing the score plot:
+      `DEFAULT_SMOOTHING` (default)
+    :type smoothing_method: constant
+    :param smoothing_width: The width of the window used by the smoothing method
+    :type smoothing_width: int
+    :param smoothing_rounds: The number of smoothing passes
+    :type smoothing_rounds: int
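+    :param cutoff_policy: The policy used to determine the number of boundaries:
+      `HC` (default) or `LC`
+    :type cutoff_policy: constant
+    :param demo_mode: If true, `tokenize` returns the intermediate
+      `(gap_scores, smooth_scores, depth_scores, segment_boundaries)` tuple
+      instead of the segmented text (defaults to False)
+    :type demo_mode: bool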
+
+    >>> from nltk.corpus import brown
+    >>> tt = TextTilingTokenizer(demo_mode=True)
+    >>> text = brown.raw()[:4000]
+    >>> s, ss, d, b = tt.tokenize(text)
+    >>> b
+    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
+    """
+
+    def __init__(
+        self,
+        w=20,
+        k=10,
+        similarity_method=BLOCK_COMPARISON,
+        stopwords=None,
+        smoothing_method=DEFAULT_SMOOTHING,
+        smoothing_width=2,
+        smoothing_rounds=1,
+        cutoff_policy=HC,
+        demo_mode=False,
+    ):
+        """Store the segmentation parameters as instance attributes.
+
+        Every argument is kept under an attribute of the same name. When
+        *stopwords* is None, the English list of NLTK's stopwords corpus
+        is loaded and stored instead.
+        """
+        if stopwords is None:
+            # Imported lazily so the corpus reader is only touched when the
+            # caller did not supply a stopword list.
+            from nltk.corpus import stopwords as stopwords_corpus
+
+            stopwords = stopwords_corpus.words("english")
+
+        self.w = w
+        self.k = k
+        self.similarity_method = similarity_method
+        self.stopwords = stopwords
+        self.smoothing_method = smoothing_method
+        self.smoothing_width = smoothing_width
+        self.smoothing_rounds = smoothing_rounds
+        self.cutoff_policy = cutoff_policy
+        self.demo_mode = demo_mode
+
+    def tokenize(self, text):
+        """Return a tokenized copy of *text*, where each "token" represents
+        a separate topic.
+
+        The text is lowercased, stripped of every character other than
+        ``a-z``, ``-``, ``'``, space, newline and tab, split into
+        pseudosentences of ``self.w`` words and filtered against
+        ``self.stopwords``. Gap scores are then computed, smoothed and
+        turned into depth scores, from which boundaries are picked and
+        snapped to the paragraph breaks of the original text.
+
+        :param text: the document to segment
+        :type text: str
+        :return: a list of consecutive slices of *text*; when
+            ``self.demo_mode`` is true, the tuple ``(gap_scores,
+            smooth_scores, depth_scores, segment_boundaries)`` instead
+        :raises NotImplementedError: if ``self.similarity_method`` is
+            ``VOCABULARY_INTRODUCTION``
+        :raises ValueError: if the similarity or smoothing method is not
+            recognized (helper methods may raise it too, e.g. when the
+            text contains no usable paragraph break)
+        """
+
+        lowercase_text = text.lower()
+        paragraph_breaks = self._mark_paragraph_breaks(text)
+
+        # Tokenization step starts here
+
+        # Remove punctuation in a single regex pass: drop every character
+        # that is not in the allowed set.
+        nopunct_text = re.sub(r"[^a-z\-' \n\t]", "", lowercase_text)
+        nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
+
+        tokseqs = self._divide_to_tokensequences(nopunct_text)
+
+        # The morphological stemming step mentioned in the TextTile
+        # paper is not implemented.  A comment in the original C
+        # implementation states that it offers no benefit to the
+        # process. It might be interesting to test the existing
+        # stemmers though.
+        # words = _stem_words(words)
+
+        # Filter stopwords; the set is built once so that each membership
+        # test is O(1) instead of a scan over the stopword list.
+        stopwords = set(self.stopwords)
+        for ts in tokseqs:
+            ts.wrdindex_list = [
+                wi for wi in ts.wrdindex_list if wi[0] not in stopwords
+            ]
+
+        token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
+        # End of the Tokenization step
+
+        # Lexical score determination
+        if self.similarity_method == BLOCK_COMPARISON:
+            gap_scores = self._block_comparison(tokseqs, token_table)
+        elif self.similarity_method == VOCABULARY_INTRODUCTION:
+            raise NotImplementedError("Vocabulary introduction not implemented")
+        else:
+            raise ValueError(
+                f"Similarity method {self.similarity_method} not recognized"
+            )
+
+        if self.smoothing_method == DEFAULT_SMOOTHING:
+            smooth_scores = self._smooth_scores(gap_scores)
+        else:
+            raise ValueError(f"Smoothing method {self.smoothing_method} not recognized")
+        # End of Lexical score Determination
+
+        # Boundary identification
+        depth_scores = self._depth_scores(smooth_scores)
+        segment_boundaries = self._identify_boundaries(depth_scores)
+
+        if self.demo_mode:
+            # Demo mode only exposes the intermediate scores, so the
+            # segmented text does not need to be built.
+            return gap_scores, smooth_scores, depth_scores, segment_boundaries
+
+        normalized_boundaries = self._normalize_boundaries(
+            text, segment_boundaries, paragraph_breaks
+        )
+        # End of Boundary Identification
+        segmented_text = []
+        prevb = 0
+
+        for b in normalized_boundaries:
+            if b == 0:
+                # A boundary at offset 0 would only yield an empty segment.
+                continue
+            segmented_text.append(text[prevb:b])
+            prevb = b
+
+        # Boundaries are offsets into the original text, so compare against
+        # its length rather than the lowercased copy's.
+        if prevb < len(text):  # append any text that may be remaining
+            segmented_text.append(text[prevb:])
+
+        if not segmented_text:
+            segmented_text = [text]
+
+        return segmented_text
+
+    def _block_comparison(self, tokseqs, token_table):
+        """Implements the block comparison method.
+
+        For every gap between two adjacent token sequences, the up to
+        ``self.k`` sequences before the gap and the same number after it
+        are compared; the window shrinks near both ends of the text. The
+        score is the sum over all tokens of the product of their
+        frequencies in the two blocks, divided by the square root of the
+        product of the two sums of squared frequencies.
+
+        :param tokseqs: token sequences; only their ``index`` is read
+        :param token_table: mapping of token to an entry whose
+            ``ts_occurences`` is a list of ``[sequence index, count]``
+        :return: one score per gap (``len(tokseqs) - 1`` values); 0.0 when
+            the divisor is zero
+        :rtype: list(float)
+        """
+
+        def blk_frq(tok, block):
+            # Occurrences of ``tok`` within the token sequences of ``block``.
+            return sum(
+                count
+                for seq_index, count in token_table[tok].ts_occurences
+                if seq_index in block
+            )
+
+        gap_scores = []
+        numgaps = len(tokseqs) - 1
+
+        for curr_gap in range(numgaps):
+            # adjust window size for boundary conditions
+            if curr_gap < self.k - 1:
+                window_size = curr_gap + 1
+            elif curr_gap > numgaps - self.k:
+                window_size = numgaps - curr_gap
+            else:
+                window_size = self.k
+
+            # Sets make the per-occurrence membership test constant time.
+            b1 = {ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]}
+            b2 = {ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]}
+
+            score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
+            for t in token_table:
+                # Each block frequency is computed once and reused.
+                freq_b1 = blk_frq(t, b1)
+                freq_b2 = blk_frq(t, b2)
+                score_dividend += freq_b1 * freq_b2
+                score_divisor_b1 += freq_b1**2
+                score_divisor_b2 += freq_b2**2
+
+            divisor = math.sqrt(score_divisor_b1 * score_divisor_b2)
+            # A block without any counted token gives a zero divisor; such
+            # a gap scores 0.0.
+            gap_scores.append(score_dividend / divisor if divisor else 0.0)
+
+        return gap_scores
+
+    def _smooth_scores(self, gap_scores):
+        """Smooth the gap scores with the module-level ``smooth`` function.
+
+        The window length handed to ``smooth`` is ``self.smoothing_width + 1``.
+        NOTE(review): ``self.smoothing_rounds`` is not consulted here, so a
+        single pass is always applied -- confirm whether that is intended.
+
+        :param gap_scores: the raw gap scores
+        :return: the smoothed scores as a list
+        """
+        window_len = self.smoothing_width + 1
+        signal = numpy.array(gap_scores[:])
+        return list(smooth(signal, window_len=window_len))
+
+    def _mark_paragraph_breaks(self, text):
+        """Return the offsets in *text* at which paragraphs are taken to start.
+
+        A candidate break is a run of two newlines, each optionally
+        surrounded by horizontal whitespace. A candidate is kept only when
+        it starts at least ``MIN_PARAGRAPH`` characters after the previously
+        kept break. Offset 0 is always the first entry.
+
+        :param text: the text to scan
+        :return: ascending list of character offsets, starting with 0
+        """
+        MIN_PARAGRAPH = 100
+        blank_line = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
+
+        pbreaks = [0]
+        for start in (found.start() for found in blank_line.finditer(text)):
+            # pbreaks[-1] is always the most recently accepted break.
+            if start - pbreaks[-1] >= MIN_PARAGRAPH:
+                pbreaks.append(start)
+
+        return pbreaks
+
+    def _divide_to_tokensequences(self, text):
+        """Divides the text into pseudosentences of fixed size.
+
+        Every ``\\w+`` match in *text* is recorded as a ``(word, start
+        offset)`` pair and the pairs are grouped into consecutive chunks of
+        ``self.w`` words; the last chunk may be shorter.
+
+        :param text: the text to split
+        :return: one ``TokenSequence`` per chunk, indexed 0, 1, 2, ...
+        """
+        w = self.w
+        wrdindex_list = [
+            (match.group(), match.start()) for match in re.finditer(r"\w+", text)
+        ]
+        return [
+            # Floor division keeps the sequence index an int; ``i / w``
+            # would yield a float under Python 3 true division.
+            TokenSequence(i // w, wrdindex_list[i : i + w])
+            for i in range(0, len(wrdindex_list), w)
+        ]
+
+    def _create_token_table(self, token_sequences, par_breaks):
+        """Build the table that maps each word to its ``TokenTableField``.
+
+        For every word the entry records the offset of its first
+        occurrence, its total number of occurrences, a count that grows
+        each time the word is seen in a different paragraph than the last
+        time, and a list of ``[token sequence number, count]`` pairs.
+
+        :param token_sequences: sequences whose ``wrdindex_list`` holds
+            ``(word, offset)`` pairs
+        :param par_breaks: ascending paragraph break offsets
+        :return: dict mapping word to ``TokenTableField``
+        :raises ValueError: if the only paragraph break is the one at 0
+        """
+        token_table = {}
+        current_par = 0
+        breaks = iter(par_breaks)
+        current_par_break = next(breaks)
+        if current_par_break == 0:
+            try:
+                current_par_break = next(breaks)  # skip break at 0
+            except StopIteration as e:
+                raise ValueError(
+                    "No paragraph breaks were found(text too short perhaps?)"
+                ) from e
+
+        for current_tok_seq, ts in enumerate(token_sequences):
+            for word, index in ts.wrdindex_list:
+                # Move on to the paragraph that contains this word; once the
+                # breaks are exhausted the paragraph number stops changing.
+                try:
+                    while index > current_par_break:
+                        current_par_break = next(breaks)
+                        current_par += 1
+                except StopIteration:
+                    # hit bottom
+                    pass
+
+                entry = token_table.get(word)
+                if entry is None:  # new word
+                    token_table[word] = TokenTableField(
+                        first_pos=index,
+                        ts_occurences=[[current_tok_seq, 1]],
+                        total_count=1,
+                        par_count=1,
+                        last_par=current_par,
+                        last_tok_seq=current_tok_seq,
+                    )
+                    continue
+
+                entry.total_count += 1
+
+                if entry.last_par != current_par:
+                    entry.last_par = current_par
+                    entry.par_count += 1
+
+                if entry.last_tok_seq != current_tok_seq:
+                    # First sighting in this token sequence: open a counter.
+                    entry.last_tok_seq = current_tok_seq
+                    entry.ts_occurences.append([current_tok_seq, 1])
+                else:
+                    entry.ts_occurences[-1][1] += 1
+
+        return token_table
+
+    def _identify_boundaries(self, depth_scores):
+        """Mark the gaps whose depth score stands out as boundaries.
+
+        Gaps scoring above the cutoff (``avg - stdev`` for the ``LC``
+        policy, ``avg - stdev / 2`` otherwise) are visited from the highest
+        score to the lowest. A gap becomes a boundary unless another
+        candidate fewer than 4 positions away is already marked.
+
+        :param depth_scores: one depth score per gap (must not be empty)
+        :return: list of 0/1 flags, parallel to *depth_scores*
+        """
+
+        boundaries = [0] * len(depth_scores)
+
+        avg = sum(depth_scores) / len(depth_scores)
+        stdev = numpy.std(depth_scores)
+        cutoff = avg - stdev if self.cutoff_policy == LC else avg - stdev / 2.0
+
+        ranked = sorted((score, gap) for gap, score in enumerate(depth_scores))
+        # Highest depth first; equal scores are ordered by descending gap.
+        candidates = [gap for score, gap in reversed(ranked) if score > cutoff]
+
+        for gap in candidates:
+            crowded = any(
+                other != gap and abs(other - gap) < 4 and boundaries[other] == 1
+                for other in candidates
+            )
+            # undo if there is a boundary close already
+            boundaries[gap] = 0 if crowded else 1
+        return boundaries
+
+    def _depth_scores(self, scores):
+        """Compute how deep the valley at each gap is.
+
+        Starting at a gap, the scores are followed to the left and to the
+        right for as long as they do not decrease; the depth is the sum of
+        the two peaks reached minus twice the gap's own score. The first
+        and last ``clip`` gaps keep a depth of 0.
+
+        :param scores: the (smoothed) gap scores
+        :return: list of depth scores, parallel to *scores*
+        """
+
+        def climb(values, start):
+            # Follow ``values`` while they keep rising (or stay level).
+            peak = start
+            for value in values:
+                if not value >= peak:
+                    break
+                peak = value
+            return peak
+
+        depth_scores = [0] * len(scores)
+        # clip boundaries: this holds on the rule of thumb(my thumb)
+        # that a section shouldn't be smaller than at least 2
+        # pseudosentences for small texts and around 5 for larger ones.
+        clip = min(max(len(scores) // 10, 2), 5)
+
+        for index, gapscore in enumerate(scores[clip:-clip], start=clip):
+            lpeak = climb(scores[index::-1], gapscore)
+            rpeak = climb(scores[index:], gapscore)
+            depth_scores[index] = lpeak + rpeak - 2 * gapscore
+
+        return depth_scores
+
+    def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
+        """Normalize the boundaries identified to the original text's
+        paragraph breaks.
+
+        The text is walked character by character while whitespace-separated
+        words are counted. Each time the word count passes
+        ``max(gap * self.w, self.w)`` for the next unseen gap, that gap is
+        consumed; if it is flagged in *boundaries*, the paragraph break
+        closest to the current character position is recorded.
+
+        :param text: the original text
+        :param boundaries: 0/1 flag per gap
+        :param paragraph_breaks: character offsets of paragraph breaks
+        :return: paragraph break offsets chosen as boundaries, without
+            duplicates, in the order they were found
+        """
+
+        norm_boundaries = []
+        char_count, word_count, gaps_seen = 0, 0, 0
+        seen_word = False
+
+        for char in text:
+            char_count += 1
+            if char in " \t\n":
+                if seen_word:
+                    # Whitespace right after a word closes that word.
+                    seen_word = False
+                    word_count += 1
+            elif not seen_word:
+                seen_word = True
+
+            if gaps_seen < len(boundaries) and word_count > (
+                max(gaps_seen * self.w, self.w)
+            ):
+                if boundaries[gaps_seen] == 1 and paragraph_breaks:
+                    # find closest paragraph break; ``min`` always yields a
+                    # result (the earliest one on ties), so no stale or
+                    # unbound value can leak in from a previous gap.
+                    bestbr = min(
+                        paragraph_breaks, key=lambda br: abs(br - char_count)
+                    )
+                    if bestbr not in norm_boundaries:  # avoid duplicates
+                        norm_boundaries.append(bestbr)
+                gaps_seen += 1
+
+        return norm_boundaries
+
+
+class TokenTableField:
+    """Per-token record stored in the token table.
+
+    Each constructor argument is kept as an attribute of the same name.
+    """
+
+    def __init__(
+        self,
+        first_pos,
+        ts_occurences,
+        total_count=1,
+        par_count=1,
+        last_par=0,
+        last_tok_seq=None,
+    ):
+        self.first_pos = first_pos
+        self.ts_occurences = ts_occurences
+        self.total_count = total_count
+        self.par_count = par_count
+        self.last_par = last_par
+        self.last_tok_seq = last_tok_seq
+
+
+class TokenSequence:
+    """A list of ``(word, offset)`` tokens together with its index and its
+    original length.
+
+    When *original_length* is missing (or otherwise falsy), the length of
+    *wrdindex_list* at construction time is stored instead.
+    """
+
+    def __init__(self, index, wrdindex_list, original_length=None):
+        if not original_length:
+            original_length = len(wrdindex_list)
+        self.index = index
+        self.wrdindex_list = wrdindex_list
+        self.original_length = original_length
+
+
+# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth
+def smooth(x, window_len=11, window="flat"):
+    """smooth the data using a window with requested size.
+
+    This method is based on the convolution of a scaled window with the signal.
+    The signal is prepared by extending it at both ends with copies of its
+    neighbouring samples mirrored through the first and last value, so that
+    transient parts are minimized in the beginning and end part of the
+    output signal.
+
+    :param x: the input signal (a 1-dimensional numpy array)
+    :param window_len: the dimension of the smoothing window; should be an odd integer
+    :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
+        flat window will produce a moving average smoothing.
+
+    :return: the smoothed signal; *x* itself when ``window_len`` is below 3
+    :raises ValueError: if *x* is not 1-dimensional, is shorter than
+        ``window_len``, or *window* is not one of the supported names
+
+    example::
+
+        t=numpy.arange(-2,2,0.1)
+        x=numpy.sin(t)+numpy.random.randn(len(t))*0.1
+        y=smooth(x)
+
+    :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
+        scipy.signal.lfilter
+
+    TODO: the window parameter could be the window itself if an array instead of a string
+    """
+
+    if x.ndim != 1:
+        raise ValueError("smooth only accepts 1 dimension arrays.")
+
+    if x.size < window_len:
+        raise ValueError("Input vector needs to be bigger than window size.")
+
+    if window_len < 3:
+        return x
+
+    if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
+        raise ValueError(
+            "Window is one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
+        )
+
+    # Pad both ends so the convolution has data to work with at the edges.
+    s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
+
+    if window == "flat":  # moving average
+        w = numpy.ones(window_len, "d")
+    else:
+        # ``window`` was validated above, so this is a plain attribute
+        # lookup of the numpy window function -- no ``eval`` needed.
+        w = getattr(numpy, window)(window_len)
+
+    y = numpy.convolve(w / w.sum(), s, mode="same")
+
+    # Drop the padding that was added before the convolution.
+    return y[window_len - 1 : -window_len + 1]
+
+
+def demo(text=None):
+    """Plot the TextTiling scores computed for *text*.
+
+    When *text* is None, the first 10000 characters of the Brown corpus
+    are used. The gap scores, smoothed gap scores and depth scores are
+    drawn as lines and the detected boundaries as a stem plot.
+    """
+    from matplotlib import pylab
+
+    from nltk.corpus import brown
+
+    tokenizer = TextTilingTokenizer(demo_mode=True)
+    if text is None:
+        text = brown.raw()[:10000]
+    gap_scores, smoothed, depths, boundaries = tokenizer.tokenize(text)
+
+    pylab.xlabel("Sentence Gap index")
+    pylab.ylabel("Gap Scores")
+    curves = (
+        (gap_scores, "Gap Scores"),
+        (smoothed, "Smoothed Gap scores"),
+        (depths, "Depth scores"),
+    )
+    for series, label in curves:
+        pylab.plot(range(len(series)), series, label=label)
+    pylab.stem(range(len(boundaries)), boundaries)
+    pylab.legend()
+    pylab.show()