updates

2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions
--- a/Backend/venv/lib/python3.12/site-packages/nltk/translate/phrase_based.py
+++ b/Backend/venv/lib/python3.12/site-packages/nltk/translate/phrase_based.py
@@ -0,0 +1,193 @@
+# Natural Language Toolkit: Phrase Extraction Algorithm
+#
+# Copyright (C) 2001-2025 NLTK Project
+# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+
def extract(
    f_start,
    f_end,
    e_start,
    e_end,
    alignment,
    f_aligned,
    srctext,
    trgtext,
    srclen,
    trglen,
    max_phrase_length,
):
    """
    Check one candidate span pair for alignment consistency and, if it is
    consistent, extract the phrase pairs it gives rise to.

    A phrase pair (e, f) is consistent with an alignment A if and only if:

    (i) No source words in the phrase pair are aligned to words outside it.

            for all e_i in e: (e_i, f_j) in A  =>  f_j in f

    (ii) No target words in the phrase pair are aligned to words outside it.

            for all f_j in f: (e_i, f_j) in A  =>  e_i in e

    (iii) The phrase pair contains at least one alignment point.

            there exist e_i in e, f_j in f such that (e_i, f_j) in A

    Besides the span ``[f_start, f_end]`` itself, the target span is also
    widened to the left and to the right over neighbouring target positions
    that are not in ``f_aligned``.

    :type f_start: int
    :param f_start: Inclusive start index of the target span.
    :type f_end: int
    :param f_end: Inclusive end index of the target span; a negative value
        means no target word is aligned to the source span.
    :type e_start: int
    :param e_start: Inclusive start index of the source span.
    :type e_end: int
    :param e_end: Inclusive end index of the source span.
    :type alignment: iterable(tuple(int, int))
    :param alignment: Alignment points as (source index, target index) pairs.
    :type f_aligned: collection(int)
    :param f_aligned: Target indices that take part in some alignment point.
        Only membership tests are performed on it.
    :type srctext: list
    :param srctext: The source language tokens, a list of string.
    :type trgtext: list
    :param trgtext: The target language tokens, a list of string.
    :type srclen: int
    :param srclen: The number of source tokens. Not read by this function.
    :type trglen: int
    :param trglen: The number of tokens in the target language tokens.
    :type max_phrase_length: int
    :param max_phrase_length: Caps the initial end index of the target span
        at ``f_start + max_phrase_length - 1``.
    :rtype: set(tuple)
    :return: A set of ``((e_start, e_end + 1), (fs, fe + 1), source phrase,
        target phrase)`` tuples; an empty set when the source span has no
        alignment point or the span pair is inconsistent.
    """

    # A negative f_end (0-based indexing) means nothing is aligned here.
    if f_end < 0:
        return set()
    # Reject the pair if a target word inside the span is aligned to a source
    # word outside [e_start, e_end].
    for e, f in alignment:
        if (f_start <= f <= f_end) and (e < e_start or e > e_end):
            return set()

    # The source side is the same for every emitted pair, so build it once.
    # The +1 turns the inclusive end index into an exclusive slice bound.
    src_span = (e_start, e_end + 1)
    src_phrase = " ".join(srctext[e_start : e_end + 1])

    # NOTE(review): when f_end - f_start + 1 > max_phrase_length this starts
    # the target span short of f_end, and the widening loop below does not
    # re-check the limit. Existing behaviour is kept as is -- confirm whether
    # such pairs are intended.
    first_fe = min(f_end, f_start + max_phrase_length - 1)

    # Add phrase pairs (incl. additional unaligned f)
    phrases = set()
    fs = f_start
    while True:
        fe = first_fe
        while True:
            # add phrase pair ([e_start, e_end], [fs, fe]) to the result;
            # spans are kept alongside the strings for later ordering.
            trg_phrase = " ".join(trgtext[fs : fe + 1])
            phrases.add((src_span, (fs, fe + 1), src_phrase, trg_phrase))
            fe += 1
            # Stop widening to the right at an aligned word or sentence end.
            if fe in f_aligned or fe >= trglen:
                break
        fs -= 1
        # Stop widening to the left at an aligned word or sentence start.
        if fs in f_aligned or fs < 0:
            break
    return phrases
+
+
def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
    """
    Phrase extraction algorithm extracts all consistent phrase pairs from
    a word-aligned sentence pair.

    The idea is to loop over all possible source language (e) phrases and find
    the minimal foreign phrase (f) that matches each of them. Matching is done
    by identifying all alignment points for the source phrase and finding the
    shortest foreign phrase that includes all the foreign counterparts for the
    source words.

    In short, a phrase alignment has to
    (a) contain all alignment points for all covered words
    (b) contain at least one alignment point

    >>> srctext = "michael assumes that he will stay in the house"
    >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
    >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
    ... (5,9), (6,7), (7,7), (8,8)]
    >>> phrases = phrase_extraction(srctext, trgtext, alignment)
    >>> for i in sorted(phrases):
    ...    print(i)
    ...
    ((0, 1), (0, 1), 'michael', 'michael')
    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
    ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
    ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
    ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
    ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
    ((1, 2), (1, 4), 'assumes', 'geht davon aus')
    ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
    ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
    ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
    ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
    ((2, 3), (4, 6), 'that', ', dass')
    ((2, 3), (5, 6), 'that', 'dass')
    ((2, 4), (4, 7), 'that he', ', dass er')
    ((2, 4), (5, 7), 'that he', 'dass er')
    ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
    ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
    ((3, 4), (6, 7), 'he', 'er')
    ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
    ((4, 6), (9, 10), 'will stay', 'bleibt')
    ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
    ((6, 8), (7, 8), 'in the', 'im')
    ((6, 9), (7, 9), 'in the house', 'im haus')
    ((8, 9), (8, 9), 'house', 'haus')

    :type srctext: str
    :param srctext: The sentence string from the source language; it is
        tokenized with ``str.split()``.
    :type trgtext: str
    :param trgtext: The sentence string from the target language; it is
        tokenized with ``str.split()``.
    :type alignment: list(tuple)
    :param alignment: The word alignment outputs as list of tuples, where
        the first elements of tuples are the source words' indices and
        second elements are the target words' indices. This is also the output
        format of nltk.translate.ibm1
    :type max_phrase_length: int
    :param max_phrase_length: maximal phrase length, if 0 or not specified
        it is set to a length of the longer sentence (srctext or trgtext).
    :rtype: set(tuple)
    :return: A set of tuples, each element is a phrase pair made up of
        (i) its source location, (ii) its target location, (iii) the source
        phrase and (iv) the target phrase. The set represents all the
        possible phrases extracted from the word alignments.
    """

    srctext = srctext.split()  # e
    trgtext = trgtext.split()  # f
    srclen = len(srctext)  # len(e)
    trglen = len(trgtext)  # len(f)
    # Target indices that take part in an alignment point. extract() only
    # tests membership, repeatedly, so a set is built once here.
    f_aligned = {j for _, j in alignment}
    max_phrase_length = max_phrase_length or max(srclen, trglen)

    # set of phrase pairs BP
    bp = set()

    for e_start in range(srclen):
        # Source spans are limited to max_phrase_length tokens.
        max_idx = min(srclen, e_start + max_phrase_length)
        for e_end in range(e_start, max_idx):
            # Find the minimally matching foreign phrase. Start from an empty
            # span (f_start past the right, f_end = -1; 0-based indexing) and
            # grow it over every target index aligned to the source span.
            f_start, f_end = trglen - 1, -1

            for e, f in alignment:
                if e_start <= e <= e_end:
                    f_start = min(f, f_start)
                    f_end = max(f, f_end)
            # add extract(f_start, f_end, e_start, e_end) to set BP; an empty
            # result simply contributes nothing.
            bp.update(
                extract(
                    f_start,
                    f_end,
                    e_start,
                    e_end,
                    alignment,
                    f_aligned,
                    srctext,
                    trgtext,
                    srclen,
                    trglen,
                    max_phrase_length,
                )
            )
    return bp